In [2]:
import pandas as pd

# Read the training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Preview the top five rows of the training data
print(train_df.head(5))


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [6]:
# Fill missing values in the training data.
#
# Every column is reassigned (`df[col] = df[col].fillna(...)`) instead of calling
# `df[col].fillna(..., inplace=True)`. The in-place form operates on a column
# selection (chained assignment): recent pandas versions warn about it, and with
# Copy-on-Write enabled it no longer updates `train_df` at all.

# Categorical features whose missing entries are replaced by the literal label 'NA'
na_label_cols = [
    'Alley',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
for col in na_label_cols:
    train_df[col] = train_df[col].fillna('NA')

# Numerical features: median for LotFrontage, zero for MasVnrArea
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(train_df['LotFrontage'].median())
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(0)
# A missing GarageYrBlt falls back to the YearBuilt value of the same row
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(train_df['YearBuilt'])

# Drop the rows where Electrical is missing. This is a frame-level call (not a
# column selection), so the in-place form is safe here.
train_df.dropna(subset=['Electrical'], inplace=True)

# Count the missing values that remain across the whole frame. A non-zero
# result means columns not handled above still contain NaN.
train_df.isnull().sum().sum()


871

In [8]:
def fill_known_missing(df, lot_frontage_fill):
    """Return a copy of ``df`` with the known missing-value columns filled.

    Columns are reassigned on a copy rather than filled through
    ``df[col].fillna(..., inplace=True)``; that chained in-place form is
    deprecated in recent pandas and has no effect under Copy-on-Write.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed below.
    lot_frontage_fill : scalar
        Value used to replace missing ``LotFrontage`` entries.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.
    """
    na_label_cols = [
        'Alley',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PoolQC', 'Fence', 'MiscFeature',
    ]
    filled = df.copy()
    # Categorical columns: missing entries become the literal label 'NA'
    for col in na_label_cols:
        filled[col] = filled[col].fillna('NA')
    filled['LotFrontage'] = filled['LotFrontage'].fillna(lot_frontage_fill)
    filled['MasVnrArea'] = filled['MasVnrArea'].fillna(0)
    # A missing GarageYrBlt falls back to the YearBuilt value of the same row
    filled['GarageYrBlt'] = filled['GarageYrBlt'].fillna(filled['YearBuilt'])
    return filled


# The LotFrontage fill value is taken from the training data and reused for the
# test data, so both sets are imputed with the same statistic (the same
# fit-on-train / apply-to-test pattern as any other preprocessing step).
lot_frontage_median = train_df['LotFrontage'].median()

# Fill missing values in the training data; rows without Electrical are dropped
train_df = fill_known_missing(train_df, lot_frontage_median).dropna(subset=['Electrical'])

# Handle missing values in the test data in the same way (no rows are dropped)
test_df = fill_known_missing(test_df, lot_frontage_median)

# Report how many missing values remain. A non-zero count means columns that
# are not handled above still contain NaN.
print("Missing values in train data:", train_df.isnull().sum().sum())
print("Missing values in test data:", test_df.isnull().sum().sum())


Missing values in train data: 0
Missing values in test data: 916


In [9]:
from sklearn.preprocessing import LabelEncoder

# Columns with object dtype in the training data are treated as categorical
categorical_cols = train_df.select_dtypes(include=['object']).columns

# Fitted encoder per column, kept so the train-time mapping can be reused
label_encoders = {}

# Code assigned to test-set values that have no counterpart in the fitted
# classes (labels absent from the training data, or missing values).
# `LabelEncoder.transform` would raise a ValueError on such values instead.
UNKNOWN_CATEGORY_CODE = -1

for col in categorical_cols:
    # Fit on the training column only and replace it with its integer codes
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le

    # Apply the same label -> code mapping to the test column. For labels seen
    # during fitting this gives exactly the codes `le.transform` would return
    # (the position of the label in `le.classes_`).
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = (
        test_df[col]
        .map(code_by_label)
        .fillna(UNKNOWN_CATEGORY_CODE)
        .astype('int64')
    )

# Verify encoding
train_df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,1,3,3,0,...,0,3,4,1,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,1,3,3,0,...,0,3,4,1,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,1,0,3,0,...,0,3,4,1,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,1,0,3,0,...,0,3,4,1,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,1,0,3,0,...,0,3,4,1,0,12,2008,8,4,250000


In [10]:
# Example feature engineering: derive the same three columns on both frames
for frame in (train_df, test_df):
    # Age of the house at the time of sale
    frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']

    # Time elapsed between the last remodel and the sale
    frame['RemodAge'] = frame['YrSold'] - frame['YearRemodAdd']

    # Total number of bathrooms, with each half bath counted as 0.5
    above_grade_baths = frame['FullBath'] + 0.5 * frame['HalfBath']
    basement_baths = frame['BsmtFullBath'] + 0.5 * frame['BsmtHalfBath']
    frame['TotalBath'] = above_grade_baths + basement_baths

# Verify new features
train_df[['HouseAge', 'RemodAge', 'TotalBath']].head()


Unnamed: 0,HouseAge,RemodAge,TotalBath
0,5,5,3.5
1,31,31,2.5
2,7,6,3.5
3,91,36,2.0
4,8,8,3.5


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature Engineering
# The engineered columns are only created when they are not already present.
# Recomputing them unconditionally is harmful once this cell has run: the year
# columns are standardized below, so a re-run would rebuild HouseAge/RemodAge
# from scaled values instead of calendar years.
for frame in (train_df, test_df):
    if 'HouseAge' not in frame.columns:
        frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']
    if 'RemodAge' not in frame.columns:
        frame['RemodAge'] = frame['YrSold'] - frame['YearRemodAdd']
    if 'TotalBath' not in frame.columns:
        frame['TotalBath'] = (frame['FullBath'] + 0.5 * frame['HalfBath'] +
                              frame['BsmtFullBath'] + 0.5 * frame['BsmtHalfBath'])

# Exclude 'SalePrice' from the numerical columns to be scaled
# NOTE(review): every other int64/float64 column is scaled, which appears to
# include the 'Id' column - confirm that is intended.
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = numerical_cols.drop('SalePrice')

# Splitting Data (row positions)
# The partition is chosen before scaling so the scaler can be fitted on the
# training rows alone; fitting it on all rows would leak validation-set
# statistics into preprocessing. Same test_size/random_state as before, so the
# rows in each partition are unchanged.
train_pos, val_pos = train_test_split(
    list(range(len(train_df))), test_size=0.2, random_state=42
)

# Scaling: fit on the training partition, then transform every row of both frames
scaler = StandardScaler()
scaler.fit(train_df.iloc[train_pos][numerical_cols])
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# Features / target, sliced with the partition chosen above
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']
X_train, X_val = X.iloc[train_pos], X.iloc[val_pos]
y_train, y_val = y.iloc[train_pos], y.iloc[val_pos]

# Verify the split
X_train.shape, X_val.shape, y_train.shape, y_val.shape


((1167, 83), (292, 83), (1167,), (292,))