In [316]:
import pandas as pd
import numpy as np
from skimpy import skim

In [317]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [318]:
train.drop(columns=['Id','Neighborhood','MiscFeature','PoolQC','MasVnrType','Alley','Fence','FireplaceQu'],inplace=True)
test.drop(columns=['Id','Neighborhood','MiscFeature','PoolQC','MasVnrType','Alley','Fence','FireplaceQu'],inplace=True)

## Imputing Values for `Numerical` Columns

In [319]:
mean=train['LotFrontage'].mean()
train['LotFrontage']=train['LotFrontage'].fillna(mean)
mean=test['LotFrontage'].mean()
test['LotFrontage']=test['LotFrontage'].fillna(mean)

In [320]:
mean=train['GarageYrBlt'].mean()
train['GarageYrBlt']=train['GarageYrBlt'].fillna(mean)
mean=test['GarageYrBlt'].mean()
test['GarageYrBlt']=test['GarageYrBlt'].fillna(mean)

In [321]:
mean=train['MasVnrArea'].mean()
train['MasVnrArea']=train['MasVnrArea'].fillna(mean)
mean=test['MasVnrArea'].mean()
test['MasVnrArea']=test['MasVnrArea'].fillna(mean)

## Imputing Values for `Categorical Columns`

In [322]:
mode=train['GarageCond'].mode()[0]
train['GarageCond']=train['GarageCond'].fillna(mode)
test['GarageCond']=test['GarageCond'].fillna(mode)

mode=train['GarageType'].mode()[0]
train['GarageType']=train['GarageType'].fillna(mode)
test['GarageType']=test['GarageType'].fillna(mode)

mode=train['GarageFinish'].mode()[0]
train['GarageFinish']=train['GarageFinish'].fillna(mode)
test['GarageFinish']=test['GarageFinish'].fillna(mode)

mode=train['GarageQual'].mode()[0]
train['GarageQual']=train['GarageQual'].fillna(mode)
test['GarageQual']=test['GarageQual'].fillna(mode)

mode=train['Electrical'].mode()[0]
train['Electrical']=train['Electrical'].fillna(mode)
test['Electrical']=test['Electrical'].fillna(mode)

mode=train['BsmtQual'].mode()[0]
train['BsmtQual']=train['BsmtQual'].fillna(mode)
test['BsmtQual']=test['BsmtQual'].fillna(mode)

mode=train['BsmtCond'].mode()[0]
train['BsmtCond']=train['BsmtCond'].fillna(mode)
test['BsmtCond']=test['BsmtCond'].fillna(mode)

mode=train['BsmtExposure'].mode()[0]
train['BsmtExposure']=train['BsmtExposure'].fillna(mode)
test['BsmtExposure']=test['BsmtExposure'].fillna(mode)

mode=train['BsmtFinType1'].mode()[0]
train['BsmtFinType1']=train['BsmtFinType1'].fillna(mode)
test['BsmtFinType1']=test['BsmtFinType1'].fillna(mode)

mode=train['BsmtFinType2'].mode()[0]
train['BsmtFinType2']=train['BsmtFinType2'].fillna(mode)
test['BsmtFinType2']=test['BsmtFinType2'].fillna(mode)

### we are replacing the less frequent values in each column with word `'other'`

In [323]:
def replace_less_frequent(dataframe, threshold):
    categorical_columns = dataframe.select_dtypes(include=['object', 'category']).columns
    
    for column in categorical_columns:
        value_counts = dataframe[column].value_counts()
        to_replace = value_counts[value_counts < threshold].index
        dataframe[column] = dataframe[column].replace(to_replace, 'other')

In [324]:
replace_less_frequent(train, 150)
replace_less_frequent(test, 150)

columns_to_encode = [
    'MSZoning', 'Street', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 
    'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 
    'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 
    'SaleCondition'
]
train_ohe = pd.get_dummies(train, columns=columns_to_encode, drop_first=True)

# One-hot encode the test DataFrame
test_ohe = pd.get_dummies(test, columns=columns_to_encode, drop_first=True)

# Reindex the test DataFrame to ensure it has the same columns as the train DataFrame
test_ohe = test_ohe.reindex(columns=train_ohe.columns, fill_value=0)

In [325]:
numerical_cols = train.select_dtypes(include=['int', 'float']).columns
numerical_data = train[numerical_cols]
df_final_train = pd.concat([numerical_data, train_ohe], axis=1)

In [326]:
numerical_cols = test.select_dtypes(include=['int', 'float']).columns
numerical_data = test[numerical_cols]
df_final_test = pd.concat([numerical_data, test_ohe], axis=1)
df_final_test.shape

(1459, 132)

In [None]:
df_final_test

In [330]:
df_final_test.to_csv('processed_test.csv',index=False)
df_final_train.to_csv('processed_train.csv',index=False)

In [327]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_test_scaled = pd.DataFrame(scaler.fit_transform(df_final_test), columns=df_final_test.columns)
df_test_scaled

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Functional_other,GarageType_Detchd,GarageType_other,GarageFinish_RFn,GarageFinish_Unf,GarageQual_other,GarageCond_other,PavedDrive_other,SaleType_other,SaleCondition_other
0,-0.874711,0.555587,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.570108,0.063273,0.517171,...,-0.271263,-0.606123,-0.323212,-0.602952,1.037011,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211
1,-0.874711,0.604239,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.041273,1.063027,-0.297800,...,-0.271263,-0.606123,-0.323212,-0.602952,1.037011,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211
2,0.061351,0.263676,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.570108,0.772989,-0.297800,...,-0.271263,-0.606123,-0.323212,-0.602952,-0.964310,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211
3,0.061351,0.458284,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.456889,0.357706,-0.297800,...,-0.271263,-0.606123,-0.323212,-0.602952,-0.964310,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211
4,1.465443,-1.244533,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.570108,-0.387166,-0.297800,...,-0.271263,-0.606123,-0.323212,1.658506,-0.964310,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.401505,-2.314875,-1.591330,-1.447325,1.298950,-0.044694,-0.646813,-0.570108,-0.965046,-0.297800,...,-0.271263,-0.606123,-0.323212,-0.602952,1.037011,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211
1455,2.401505,-2.314875,-1.599808,-1.447325,-0.497418,-0.044694,-0.646813,-0.570108,-0.411336,-0.297800,...,-0.271263,-0.606123,3.093940,-0.602952,1.037011,-0.253351,-0.194154,-0.34849,-0.398568,2.172917
1456,-0.874711,4.447740,2.055150,-0.751101,1.298950,-0.373861,0.584059,-0.570108,1.724403,-0.297800,...,-0.271263,1.649830,-0.323212,-0.602952,1.037011,-0.253351,-0.194154,-0.34849,-0.398568,2.172917
1457,0.646389,-0.320147,0.125527,-0.751101,-0.497418,0.679475,0.394694,-0.570108,-0.224568,-0.297800,...,-0.271263,-0.606123,-0.323212,-0.602952,1.037011,-0.253351,-0.194154,-0.34849,-0.398568,-0.460211


In [328]:
scaler = StandardScaler()
df_train_scaled = pd.DataFrame(scaler.fit_transform(df_final_train), columns=df_final_train.columns)
df_train_scaled

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Functional_other,GarageType_Detchd,GarageType_other,GarageFinish_RFn,GarageFinish_Unf,GarageQual_other,GarageCond_other,PavedDrive_other,SaleType_other,SaleCondition_other
0,0.073375,-0.229372,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.511418,0.575425,-0.288653,...,-0.271163,-0.600559,-0.301962,1.568348,-0.941438,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.574410,1.171992,-0.288653,...,-0.271163,-0.600559,-0.301962,1.568348,-0.941438,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
2,0.073375,-0.093110,0.073480,0.651479,-0.517200,0.984752,0.830215,0.323060,0.092907,-0.288653,...,-0.271163,-0.600559,-0.301962,1.568348,-0.941438,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
3,0.309859,-0.456474,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.574410,-0.499274,-0.288653,...,-0.271163,1.665116,-0.301962,-0.637614,1.062205,-0.221022,-0.194085,-0.299253,-0.390293,2.138345
4,0.073375,0.633618,0.375148,1.374795,-0.517200,0.951632,0.733308,1.364570,0.463568,-0.288653,...,-0.271163,-0.600559,-0.301962,1.568348,-0.941438,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.365633,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.574410,-0.973018,-0.288653,...,-0.271163,-0.600559,-0.301962,1.568348,-0.941438,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
1456,-0.872563,0.679039,0.266407,-0.071836,0.381743,0.222975,0.151865,0.084843,0.759659,0.722112,...,3.687818,-0.600559,-0.301962,-0.637614,1.062205,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
1457,0.309859,-0.183951,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.574410,-0.369871,-0.288653,...,-0.271163,-0.600559,-0.301962,1.568348,-0.941438,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
1458,-0.872563,-0.093110,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.574410,-0.865548,6.092188,...,-0.271163,-0.600559,-0.301962,-0.637614,1.062205,-0.221022,-0.194085,-0.299253,-0.390293,-0.467651
