In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew

In [27]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [28]:
# Fill missing categorical values with 'None'
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    train[col].fillna('None', inplace=True)
    test[col].fillna('None', inplace=True)

# Fill numerical columns with median
for col in ['LotFrontage', 'GarageYrBlt']:
    train[col].fillna(train[col].median(), inplace=True)
    test[col].fillna(test[col].median(), inplace=True)

# Fill remaining with mode
for col in train.columns:
    if train[col].isnull().sum() > 0:
        train[col].fillna(train[col].mode()[0], inplace=True)
for col in test.columns:
    if test[col].isnull().sum() > 0:
        test[col].fillna(test[col].mode()[0], inplace=True)


In [29]:
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

train['Age'] = train['YrSold'] - train['YearBuilt']
test['Age'] = test['YrSold'] - test['YearBuilt']


In [30]:
# Get numeric features
numeric_feats = train.dtypes[train.dtypes != "object"].index

# Drop 'SalePrice' from skewed feature analysis
numeric_feats = numeric_feats.drop("SalePrice")

# Find skewed features
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewed_features = skewed_feats[abs(skewed_feats) > 0.75].index

# Apply log1p to skewed features only
for feat in skewed_features:
    train[feat] = np.log1p(train[feat])
    test[feat] = np.log1p(test[feat])


In [31]:
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

train['Age'] = train['YrSold'] - train['YearBuilt']
test['Age'] = test['YrSold'] - test['YearBuilt']


In [33]:
import numpy as np
from scipy.stats import skew

# Get numeric features excluding 'SalePrice'
numeric_feats = train.dtypes[train.dtypes != "object"].index
numeric_feats = numeric_feats.drop('SalePrice')  # drop target variable

# Compute skewness
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

# Filter features with skew > 0.75
skewed_features = skewed_feats[abs(skewed_feats) > 0.75].index

# Apply log1p to both train and test for skewed features
for feat in skewed_features:
    train[feat] = np.log1p(train[feat])
    test[feat] = np.log1p(test[feat])


In [34]:
# Label Encode some ordinal columns
label_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual']
for col in label_cols:
    lbl = LabelEncoder()
    train[col] = lbl.fit_transform(train[col].astype(str))
    test[col] = lbl.transform(test[col].astype(str))

# One-hot encode remaining categorical features
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Align train and test to have same columns
train, test = train.align(test, join='left', axis=1)


In [35]:
print(train)

        Id  MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  \
0        1    4.110874     1.646667  9.042040            7            5   
1        2    3.044522     1.685370  9.169623            6            8   
2        3    4.110874     1.655196  9.328212            7            5   
3        4    4.262680     1.631370  9.164401            7            5   
4        5    4.110874     1.694266  9.565284            8            5   
...    ...         ...          ...       ...          ...          ...   
1455  1456    4.110874     1.637663  8.976894            6            5   
1456  1457    3.044522     1.696413  9.486152            6            6   
1457  1458    4.262680     1.649561  9.109746            7            9   
1458  1459    3.044522     1.655196  9.181735            5            6   
1459  1460    3.044522     1.673489  9.204121            5            6   

      YearBuilt  YearRemodAdd  MasVnrArea  ExterQual  ...  SaleType_ConLw  \
0          2003       

In [36]:
print(test)

        Id  MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  \
0     1461    3.044522     1.685370  9.360741            5            6   
1     1462    3.044522     1.687642  9.565775            6            6   
2     1463    4.110874     1.671001  9.534668            5            5   
3     1464    4.110874     1.680725  9.208238            6            6   
4     1465    4.795791     1.565317  8.518392            8            5   
...    ...         ...          ...       ...          ...          ...   
1454  2915    5.081404     1.408800  7.568896            4            7   
1455  2916    5.081404     1.408800  7.546974            4            5   
1456  2917    3.044522     1.805236  9.903538            5            7   
1457  2918    4.454347     1.637663  9.253591            5            5   
1458  2919    4.110874     1.671001  9.172431            7            5   

      YearBuilt  YearRemodAdd  MasVnrArea  ExterQual  ...  SaleType_ConLw  \
0          1961       