In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import warnings
warnings.filterwarnings('ignore')


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)


In [3]:
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

y_train = train_df['SalePrice'].copy()
test_ids = test_df['Id'].copy()

train_df.drop(['Id','SalePrice'], axis=1, inplace=True)
test_df.drop(['Id'], axis=1, inplace=True)

n_train = train_df.shape[0]
all_data = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

print("All data shape: ", all_data.shape)

print("\n" + "=" * 80)
print("MISSING VALUES ANALYSIS")
print("=" * 80)

missing = all_data.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
missing_percent = (missing / len(all_data) * 100).round(2)
missing_df = pd.DataFrame({
    'Missing_Count': missing,
    'Percentage': missing_percent
})

print(f"\nFeatures with missing values: {len(missing_df)}")
print("\nTop 20 features with missing values:")
print(missing_df.head(20))



All data shape:  (2919, 79)

MISSING VALUES ANALYSIS

Features with missing values: 34

Top 20 features with missing values:
              Missing_Count  Percentage
PoolQC                 2909       99.66
MiscFeature            2814       96.40
Alley                  2721       93.22
Fence                  2348       80.44
MasVnrType             1766       60.50
FireplaceQu            1420       48.65
LotFrontage             486       16.65
GarageQual              159        5.45
GarageYrBlt             159        5.45
GarageCond              159        5.45
GarageFinish            159        5.45
GarageType              157        5.38
BsmtExposure             82        2.81
BsmtCond                 82        2.81
BsmtQual                 81        2.77
BsmtFinType2             80        2.74
BsmtFinType1             79        2.71
MasVnrArea               23        0.79
MSZoning                  4        0.14
BsmtFullBath              2        0.07


In [4]:
print("Handling missing values...")

none_features = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType'
]

for feature in none_features:
    if feature in all_data.columns:
        all_data[feature] = all_data[feature].fillna('None')

zero_features = [   
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea'
]

for feature in zero_features:
    if feature in all_data.columns:
        all_data[feature] = all_data[feature].fillna(0)


if 'LotFrontage' in all_data.columns:
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median())
    )

if 'MSZoning' in all_data.columns:
    all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])


if 'Utilities' in all_data.columns:
    all_data['Utilities'] = all_data['Utilities'].fillna(all_data['Utilities'].mode()[0])


if 'Functional' in all_data.columns:
    all_data['Functional'] = all_data['Functional'].fillna(all_data['Functional'].mode()[0])

if 'Exterior1st' in all_data.columns:
    all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])

if 'Exterior2nd' in all_data.columns:
    all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])


if 'KitchenQual' in all_data.columns:
    all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])


if 'Electrical' in all_data.columns:
    all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])


if 'SaleType' in all_data.columns:
    all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

remaining_missing = all_data.isnull().sum().sum()

print(f"Total remaining missing values after imputation: {remaining_missing}")

if remaining_missing > 0:
    print("\nFeatures still with missing values:")
    print(all_data.isnull().sum()[all_data.isnull().sum() > 0])





Handling missing values...
Total remaining missing values after imputation: 0


In [5]:
print('Featuring Engineering')
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

all_data['TotalBath'] = (all_data['FullBath'] + 
                          all_data['HalfBath'] * 0.5 + 
                          all_data['BsmtFullBath'] + 
                          all_data['BsmtHalfBath'] * 0.5)

all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] +
                            all_data['EnclosedPorch'] +
                            all_data['3SsnPorch'] +
                            all_data['ScreenPorch']+
                            all_data['WoodDeckSF']
                            )

all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']
all_data['IsRemod'] = (all_data['YearBuilt'] != all_data['YearRemodAdd']).astype(int)

all_data['Has2ndFloor'] = (all_data['2ndFlrSF'] > 0).astype(int)
all_data['HasGarage'] = (all_data['GarageArea'] > 0).astype(int)
all_data['HasBasement'] = (all_data['TotalBsmtSF'] > 0).astype(int)
all_data['HasPorch'] = (all_data['TotalPorchSF'] > 0).astype(int)
all_data['HasPool'] = (all_data['PoolArea'] > 0).astype(int)
all_data['HasFireplace'] = (all_data['Fireplaces'] > 0).astype(int)
all_data['OverallQualCond'] = all_data['OverallQual'] * all_data['OverallCond']
all_data['LivAreaPerRoom'] = all_data['GrLivArea'] / (all_data['TotRmsAbvGrd'] + 1)

print(f"New Data shape after feature engineering: {all_data.shape}")


                            


Featuring Engineering
New Data shape after feature engineering: (2919, 93)


In [6]:
print('Droppig unnecessary features')

drop_features = [
    'Utilities', 'Street', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',
      'MoSold'
]

drop_features = [f for f in drop_features if f in all_data.columns]
all_data = all_data.drop(drop_features, axis=1)

print(f"Dropped {len(drop_features)} features")
print(f"Features dropped: {drop_features}")
print(f"New data shape: {all_data.shape}")



Droppig unnecessary features
Dropped 11 features
Features dropped: ['Utilities', 'Street', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold']
New data shape: (2919, 82)


In [None]:
print('Handling skewed features')

# First check for infinite values
numeric_feats = all_data.select_dtypes(include=[np.number]).columns.tolist()
inf_check = np.isinf(all_data[numeric_feats]).sum().sum()
if inf_check > 0:
    print(f"Found {inf_check} infinite values before transformation")
    all_data = all_data.replace([np.inf, -np.inf], np.nan)

# Calculate skewness
skewness = all_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
high_skew = skewness[abs(skewness) > 0.5]

print(f"\nFeatures with high skewness (>0.5): {len(high_skew)}")
print("\nTop 10 most skewed features:")
print(high_skew.head(10))

# Apply log transformation with safety checks
for feat in high_skew.index:
    if feat in all_data.columns:
        # Add a small constant to avoid log(0)
        min_val = all_data[feat].min()
        if min_val <= 0:
            offset = abs(min_val) + 1
            all_data[feat] = np.log1p(all_data[feat] + offset)
        else:
            all_data[feat] = np.log1p(all_data[feat])

# Check for any remaining infinite values
inf_check = np.isinf(all_data[numeric_feats]).sum().sum()
if inf_check > 0:
    print(f"\nWarning: Found {inf_check} infinite values after transformation")
    # Replace any remaining infinities with NaN
    all_data = all_data.replace([np.inf, -np.inf], np.nan)
    # Fill NaN with median of the column
    all_data = all_data.fillna(all_data.median())

# Transform target variable
y_train_log = np.log1p(y_train)

Handlig skewed features


In [12]:
print('Encoding categorical features')

categorical_feats = all_data.select_dtypes(include=['object']).columns.tolist()

ordinal_mappings = {
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'BsmtQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'BsmtCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'GarageQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'GarageCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'None': 0},
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'Functional': {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'LotShape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4},
    'LandContour': {'Low': 1, 'HLS': 2, 'Bnk': 3, 'Lvl': 4},
    'LandSlope': {'Sev': 1, 'Mod': 2, 'Gtl': 3}
}

for feat, mapping in ordinal_mappings.items():
    if feat in all_data.columns:
        all_data[feat] = all_data[feat].map(mapping)

print(f"Ordinal encoded {len(ordinal_mappings)} features")

remaining_categorical = all_data.select_dtypes(include=['object']).columns.tolist()

if len(remaining_categorical) > 0:
    print(f"\nOne-hot encoding {len(remaining_categorical)} remaining categorical features...")
    all_data = pd.get_dummies(all_data, columns=remaining_categorical, drop_first=True)
    print(f"One-hot encoding complete")



Encoding categorical features
Ordinal encoded 17 features


In [13]:
print('Train test split')

X_train = all_data.iloc[:n_train, :].copy()
X_test = all_data.iloc[n_train:, :].copy()

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train_log.shape}")



Train test split
X_train shape: (1460, 199)
X_test shape: (1459, 199)
y_train shape: (1460,)


In [14]:
print('Handling outliers')

outlier_indices = X_train[(X_train['GrLivArea'] > 4000) & (y_train < 300000)].index

if len(outlier_indices) > 0:
    print(f"Removing {len(outlier_indices)} outliers from training data")
    X_train = X_train.drop(outlier_indices)
    y_train_log = y_train_log.drop(outlier_indices)
    print(f"Outliers removed")
else:
    print("No significant outliers detected")



  



 
  


Handling outliers
No significant outliers detected


In [15]:
print('Feature scaling')

# First check for any problematic values
def check_data_issues(data, name):
    inf_count = np.isinf(data.select_dtypes(include=[np.number])).sum().sum()
    nan_count = data.isnull().sum().sum()
    print(f"\nChecking {name}:")
    if inf_count > 0:
        print(f"Found {inf_count} infinite values")
    if nan_count > 0:
        print(f"Found {nan_count} NaN values")
    return inf_count == 0 and nan_count == 0

# Check and clean training data
check_data_issues(X_train, "Training data")
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.fillna(X_train.median())

# Check and clean test data
check_data_issues(X_test, "Test data")
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(X_test.median())

# Apply scaling
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

# Verify no issues after scaling
print("\nAfter scaling:")
check_data_issues(X_train_scaled, "Scaled training data")
check_data_issues(X_test_scaled, "Scaled test data")

Feature scaling

Checking Training data:
Found 24820 NaN values

Checking Test data:
Found 1 infinite values
Found 24803 NaN values

After scaling:

Checking Scaled training data:
Found 24820 NaN values

Checking Scaled test data:
Found 24803 NaN values


np.False_

In [16]:
import os 

os.makedirs('../data/processed', exist_ok=True)
X_train_scaled.to_csv('../data/processed/X_train_processed.csv', index=False)
X_test_scaled.to_csv('../data/processed/X_test_processed.csv', index=False)
y_train_log.to_csv('../data/processed/y_train_log.csv', index=False, header=['SalePrice_log'])
test_ids.to_csv('../data/processed/test_ids.csv', index=False, header=['Id'])