# **House Price Prediction**

Dataset Link: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview


NOTE: We will only use the training dataset, since we need a test set that we can evaluate on for all simulations; the held-out test split is therefore taken from the training data.

## Loading Libraries and Dataset

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Dataset folder on the mounted Drive. The trailing slash is required because
# file paths are built by plain string concatenation.
# NOTE(review): the name is misspelled ("directoty"), but it is reused when the
# pickle is written at the end of the notebook, so it is kept as is.
base_directoty = "/content/drive/MyDrive/HyperLocal_Tuning/Regression_HLS/House_Price_Prediction/Dataset/"

path_train = base_directoty + "train.csv"
# The competition's test.csv is intentionally not loaded (see the NOTE at the top):
# path_test = "/content/drive/MyDrive/HyperLocal_Tuning/Regression_HLS/House_Price_Prediction/Dataset/test.csv"

train_df = pd.read_csv(path_train)
# test_df = pd.read_csv(path_test)

# Cell output: (rows, columns) of the training frame.
train_df.shape #, test_df.shape

(1460, 81)

## Null Value Check

Check the null values in every column of the training dataset. We will drop the columns in which more than 80\% of the values are missing.

In [19]:
import pandas as pd

def NaN_Analysis(df):
    """Summarise the missing values of every column that contains a NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one NaN, with the
        columns ``'Feature'`` (the column name), ``'NaN Count'`` and
        ``'NaN Percentage'`` (share of rows, 0-100). Rows are sorted by
        ``'NaN Percentage'`` in descending order; the index keeps each
        feature's position among the NaN-containing columns before sorting.
        The result has no rows when ``df`` has no missing values.
    """
    # Number of NaN entries per column, restricted to columns that have any
    nan_counts = df.isna().sum()
    columns_with_nan = nan_counts[nan_counts > 0]

    # Share of missing rows, expressed as a percentage
    total_rows = len(df)
    nan_percentage = (columns_with_nan / total_rows) * 100

    nan_summary = pd.DataFrame({
        'NaN Count': columns_with_nan,
        'NaN Percentage': nan_percentage
    })

    # Name the index explicitly before turning it into a column. Relying on
    # reset_index() producing a column called 'index' breaks when the column
    # axis of `df` carries a name: the new column then takes that name and
    # the 'Feature' column is never created.
    nan_summary = nan_summary.rename_axis('Feature').reset_index()

    # Most-affected features first
    return nan_summary.sort_values(by='NaN Percentage', ascending=False)

# Show the missing-value summary of the raw training data, most-missing features first.
NaN_Analysis(train_df)

Unnamed: 0,Feature,NaN Count,NaN Percentage
16,PoolQC,1453,99.520548
18,MiscFeature,1406,96.30137
1,Alley,1369,93.767123
17,Fence,1179,80.753425
2,MasVnrType,872,59.726027
10,FireplaceQu,690,47.260274
0,LotFrontage,259,17.739726
11,GarageType,81,5.547945
12,GarageYrBlt,81,5.547945
13,GarageFinish,81,5.547945


In [20]:
# Remove the Id column together with the four features whose values are more
# than 80% missing in the summary above (PoolQC, MiscFeature, Alley, Fence).
columns_to_drop = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence']
train_df = train_df.drop(columns=columns_to_drop)
print(train_df.shape)

(1460, 76)


## Handling Numerical and Categorical Features and Null Values

In [21]:
# Number of columns per dtype in train_df.
#print(train_df.info(memory_usage='deep'),test_df.info(memory_usage='deep'))
train_dtype = train_df.dtypes
train_dtype.value_counts()

Unnamed: 0,count
object,39
int64,34
float64,3


In [22]:
# Feature Selection : Numerical and Categorical features
# Columns are split by dtype: everything that is a NumPy number vs. the rest.
numeric_data     = train_df.select_dtypes(include=[np.number])
categorical_data = train_df.select_dtypes(exclude=[np.number])

# SalePrice (the target) deliberately stays inside numeric_data at this point;
# it is separated later, when preprocess_numerical_data is called.
# y_train      = numeric_data['SalePrice']
# numeric_data = numeric_data.drop(['SalePrice'], axis=1)

print(numeric_data.shape, categorical_data.shape)

(1460, 37) (1460, 39)


### Categorical Data: NaN Imputation and One-hot Encoding

In [23]:
from sklearn.impute import SimpleImputer

# IMPUTING SOME OF CATEGORICAL FEATURES WITH "MISSING" VALUE

def new_fill(df, cols):
    """Replace NaN with the literal category 'Missing' in the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; its columns are overwritten in place.
    cols : iterable of str
        Names of the columns whose NaN entries are replaced.

    Returns
    -------
    None
    """
    for col in cols:
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that form mutates an intermediate
        # Series, which pandas 2.x warns about and which leaves `df` unchanged
        # once copy-on-write is active.
        df[col] = df[col].fillna('Missing')

# Columns in which NaN becomes its own 'Missing' category instead of being imputed.
# NOTE(review): presumably NaN in these basement/fireplace/garage columns means the
# house does not have that feature -- confirm against the dataset description.
empty_cols = [ 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

# new_fill returns nothing; categorical_data itself is expected to be updated.
new_fill(categorical_data, empty_cols)

# IMPUTING REST WITH MOST FREQUENT VALUES

def impute_missing_values(df):
    """Fill every NaN with the most frequent value of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        Imputed frame carrying the same column labels and the same index
        as ``df``.
    """
    imputer = SimpleImputer(strategy='most_frequent')
    # fit_transform returns an unlabelled array, so the labels are restored
    # here. The original index is passed along with the columns so that the
    # rows stay aligned with other frames in a later pd.concat(axis=1); with
    # the columns alone the result would silently fall back to 0..n-1.
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
    return df_imputed

def one_hot_encode(df):
    """One-hot encode ``df`` with ``pd.get_dummies`` and return integer values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        The ``pd.get_dummies`` result with every column cast to ``int``.
    """
    dummies = pd.get_dummies(df)
    # get_dummies may produce boolean indicators; cast so each value is 0 or 1
    return dummies.astype(int)

# NOTE(review): the most-frequent values are computed on the full dataset, i.e.
# before the train/test split performed further down, so rows that later end up
# in the test split take part in the imputation.
categorical_data       = impute_missing_values(categorical_data)  # Impute missing values
final_categorical_data = one_hot_encode(categorical_data)         # One-hot encode the categorical data

# Sanity check: the summary has no rows when no NaN is left.
NaN_Analysis(categorical_data) # NO MISSING VALUES ANYMORE

Unnamed: 0,Feature,NaN Count,NaN Percentage


### Handling Numerical Data

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

In [25]:
def preprocess_numerical_data(X_train, X_test, target_column):
    """Standardise and KNN-impute the features of a train/test pair.

    The scaler and the imputer are fitted on the training features only and
    then applied to the test features. The target column is taken out before
    scaling and imputation and re-attached as the last column afterwards.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames that both contain ``target_column``.
    target_column : str
        Name of the column that is excluded from scaling and imputation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``: processed features followed by the target column.
        Both frames are indexed from 0; the input indices are discarded.
    """
    def _with_target(values, feature_names, target):
        # Put the column labels back on the processed array and append the target
        features = pd.DataFrame(values, columns=feature_names).reset_index(drop=True)
        target_series = pd.Series(target.reset_index(drop=True), name=target_column)
        return pd.concat([features, target_series], axis=1)

    # Take the target out first so it is neither scaled nor imputed
    train_target = X_train[target_column]
    test_target = X_test[target_column]
    train_features = X_train.drop(target_column, axis=1)
    test_features = X_test.drop(target_column, axis=1)

    # Standardise: fitted on the training features, reused for the test features
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    # KNN imputation with 10 neighbours, again fitted on the training features
    knn_imputer = KNNImputer(n_neighbors=10)
    train_filled = knn_imputer.fit_transform(train_scaled)
    test_filled = knn_imputer.transform(test_scaled)

    train_result = _with_target(train_filled, train_features.columns, train_target)
    test_result = _with_target(test_filled, test_features.columns, test_target)
    return train_result, test_result

In [26]:
# Names of the one-hot columns, needed to separate them again after the split
categorical_data_columns = list(final_categorical_data.columns)
# Splitting the data into training and testing sets
# (numeric and one-hot columns are joined first so both get the same row split)
full_dataset    = pd.concat([numeric_data, final_categorical_data], axis=1)
X_train, X_test = train_test_split(full_dataset, test_size=0.2, random_state=42)

# Remove the categorical columns from both train and test datasets after saving them separately
X_train_categorical =  X_train[categorical_data_columns]
X_test_categorical  =  X_test[categorical_data_columns]

X_train = X_train.drop(categorical_data_columns, axis=1)
X_test  = X_test.drop(categorical_data_columns, axis=1)

# print(X_train.shape)

# Only the numeric part is scaled and imputed; SalePrice is passed as the target column.
X_train_numerical_processed, X_test_numerical_processed = preprocess_numerical_data(X_train, X_test, target_column='SalePrice')

# The processed frames are indexed from 0, so the one-hot frames are reset as
# well before the column-wise join; otherwise the rows would not line up.
X_train_final = pd.concat([X_train_numerical_processed, X_train_categorical.reset_index(drop=True)], axis=1)
X_test_final  = pd.concat([X_test_numerical_processed, X_test_categorical.reset_index(drop=True)], axis=1)

print(X_train_final.shape, X_test_final.shape)

(1168, 285) (292, 285)


In [29]:
import pickle

### SAVING THE DATA

# Separate the features from the SalePrice target for the train and test frames
y_df_train = X_train_final['SalePrice']
x_df_train = X_train_final.drop('SalePrice',axis=1)
y_df_test  = X_test_final['SalePrice']
x_df_test  = X_test_final.drop('SalePrice',axis=1)

# Hold out 20% of the training rows as a validation set (fixed seed for reproducibility)
x_train, x_val, y_train, y_val = train_test_split(x_df_train, y_df_train, test_size=0.2, random_state=42)

print( x_train.shape, x_val.shape, y_train.shape, y_val.shape, x_df_test.shape, y_df_test.shape  )

# Bundle the six splits in one dict so they can be restored with a single pickle load
full_data = { "x_train": x_train, "x_val": x_val, "x_test": x_df_test, "y_train": y_train, "y_val": y_val, "y_test": y_df_test }

# The pickle is written into the dataset folder, next to train.csv
with open(base_directoty + 'house_price_prediction.pickle', 'wb') as handle:
    pickle.dump( full_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

(934, 284) (234, 284) (934,) (234,) (292, 284) (292,)
