In [15]:
import pandas as pd

# Load the training and test datasets
train_data_path = 'train.csv'
test_data_path = 'test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Display the first few rows of the training data for a quick overview
train_data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [16]:
# Function to calculate missing values by column
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()
    
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    
    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    
    return mis_val_table_ren_columns

# Missing values in training data
train_missing = missing_values_table(train_data)
# Missing values in test data
test_missing = missing_values_table(test_data)

train_missing, test_missing


(              Missing Values  % of Total Values
 PoolQC                  1453               99.5
 MiscFeature             1406               96.3
 Alley                   1369               93.8
 Fence                   1179               80.8
 MasVnrType               872               59.7
 FireplaceQu              690               47.3
 LotFrontage              259               17.7
 GarageType                81                5.5
 GarageYrBlt               81                5.5
 GarageFinish              81                5.5
 GarageQual                81                5.5
 GarageCond                81                5.5
 BsmtFinType2              38                2.6
 BsmtExposure              38                2.6
 BsmtFinType1              37                2.5
 BsmtCond                  37                2.5
 BsmtQual                  37                2.5
 MasVnrArea                 8                0.5
 Electrical                 1                0.1,
               Miss

In [17]:
import pandas as pd

# Define a function to fill missing values with 'None' or 0, as appropriate
def fill_missing_values(df):
    # Fill 'None' where missing value likely indicates absence of feature
    for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 
                'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                'BsmtFinType2', 'MasVnrType'):
        df[col] = df[col].fillna('None')

    # Fill with 0 where missing value likely indicates absence
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 
                'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 
                'BsmtHalfBath', 'MasVnrArea'):
        df[col] = df[col].fillna(0)

    # Impute missing values for 'LotFrontage' with median of neighborhood
    df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

    # For categorical features with low missing values, fill with mode
    for col in ('Electrical', 'MSZoning', 'KitchenQual', 'Exterior1st', 
                'Exterior2nd', 'SaleType', 'Functional', 'Utilities'):
        df[col] = df[col].fillna(df[col].mode()[0])

    return df

# Fill missing values in both training and test datasets
train_data_filled = fill_missing_values(train_data.copy())
test_data_filled = fill_missing_values(test_data.copy())

# Verify if there are any missing values left
missing_values_train = train_data_filled.isnull().sum().max()
missing_values_test = test_data_filled.isnull().sum().max()

missing_values_train, missing_values_test



(0, 0)

In [29]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_cols = train_data_filled.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_encoded = pd.DataFrame(ohe.fit_transform(train_data_filled[categorical_cols]), index=train_data_filled.index)
X_test_encoded = pd.DataFrame(ohe.transform(test_data_filled[categorical_cols]), index=test_data_filled.index)

# Create feature names for encoded columns
encoded_cols = ohe.get_feature_names_out(categorical_cols)

X_train_encoded.columns = encoded_cols
X_test_encoded.columns = encoded_cols

# Drop original categorical columns and concatenate encoded ones
X_train_prepared = pd.concat([train_data_filled.drop(categorical_cols, axis=1), X_train_encoded], axis=1)
X_test_prepared = pd.concat([test_data_filled.drop(categorical_cols, axis=1), X_test_encoded], axis=1)

# Show the first few rows of the prepared training data
X_train_prepared.head()




Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
import numpy as np

# Identify numerical features
numerical_features = train_data_filled.select_dtypes(include=[np.number]).columns

# Function to calculate skewness
def calculate_skewness(df, features):
    skewness = df[features].apply(lambda x: x.skew()).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew' :skewness})
    return skewness

# Calculate skewness in training data
train_skewness = calculate_skewness(train_data_filled, numerical_features)
# Display features with skewness > 0.75 (common threshold for considering a feature skewed)
skewed_features = train_skewness[train_skewness['Skew'] > 0.75]
skewed_features


Unnamed: 0,Skew
MiscVal,24.476794
PoolArea,14.828374
LotArea,12.207688
3SsnPorch,10.304342
LowQualFinSF,9.011341
KitchenAbvGr,4.488397
BsmtFinSF2,4.255261
ScreenPorch,4.122214
BsmtHalfBath,4.103403
EnclosedPorch,3.089872


In [31]:
# Apply log transformation to skewed features
def log_transform_skewed_features(df, features):
    for feature in features:
        if feature != 'SalePrice': # Exclude target variable 'SalePrice' in training dataset
            df[feature] = np.log1p(df[feature])
    return df

# Get the names of the skewed features excluding 'SalePrice'
skewed_feature_names = skewed_features.index.tolist()

# Apply log transformation to training and test datasets
train_data_transformed = log_transform_skewed_features(train_data_filled.copy(), skewed_feature_names)
test_data_transformed = log_transform_skewed_features(test_data_filled.copy(), skewed_feature_names)

# Verify transformation by checking skewness again
train_skewness_after = calculate_skewness(train_data_transformed, skewed_feature_names)
train_skewness_after


Unnamed: 0,Skew
PoolArea,14.363102
3SsnPorch,7.734975
LowQualFinSF,7.460317
MiscVal,5.170704
BsmtHalfBath,3.933064
KitchenAbvGr,3.869414
ScreenPorch,3.150409
BsmtFinSF2,2.523694
EnclosedPorch,2.112275
SalePrice,1.882876


In [33]:
# Convert categorical variables to 'category' data type in both datasets
def convert_categorical(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')
    return df

train_data_final = convert_categorical(train_data_transformed)
test_data_final = convert_categorical(test_data_transformed)

# Display data types of a few columns to verify the conversion
train_data_final.dtypes.head(10)


Id                int64
MSSubClass      float64
MSZoning       category
LotFrontage     float64
LotArea         float64
Street         category
Alley          category
LotShape       category
LandContour    category
Utilities      category
dtype: object

In [21]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_cols = train_data_filled.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_encoded = pd.DataFrame(ohe.fit_transform(train_data_filled[categorical_cols]), index=train_data_filled.index)
X_test_encoded = pd.DataFrame(ohe.transform(test_data_filled[categorical_cols]), index=test_data_filled.index)

# Create feature names for encoded columns
encoded_cols = ohe.get_feature_names_out(categorical_cols)

X_train_encoded.columns = encoded_cols
X_test_encoded.columns = encoded_cols

# Drop original categorical columns and concatenate encoded ones
X_train_prepared = pd.concat([train_data_filled.drop(categorical_cols, axis=1), X_train_encoded], axis=1)
X_test_prepared = pd.concat([test_data_filled.drop(categorical_cols, axis=1), X_test_encoded], axis=1)

# Show the first few rows of the prepared training data
X_train_prepared.head()





Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [34]:
train_data_final.to_csv("clean_train.csv", index=False)

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OneHotEncoder

# Load the data (make sure to include your preprocessing steps before this)
train_data = pd.read_csv('clean_train.csv')

# Custom RMSE log scorer function
def rmse_log(y_true, y_pred):
    y_true = np.where(y_true <= 0, np.finfo(float).eps, y_true)
    y_pred = np.where(y_pred <= 0, np.finfo(float).eps, y_pred)
    return np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred)))

# Split the data into features and target variable
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = pd.DataFrame(ohe.fit_transform(X[categorical_cols]), index=X.index)
X_encoded.columns = ohe.get_feature_names_out(categorical_cols)

# Drop original categorical columns and concatenate encoded ones
X_prepared = pd.concat([X.drop(categorical_cols, axis=1), X_encoded], axis=1)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_prepared, y, test_size=0.2, random_state=42)

# Initialize and train the models
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# Predict and evaluate
rf_pred = rf_model.predict(X_val)
gb_pred = gb_model.predict(X_val)
rf_rmse_log_score = rmse_log(y_val, rf_pred)
gb_rmse_log_score = rmse_log(y_val, gb_pred)

# Print the RMSE log scores
print("Random Forest RMSE Log Score:", rf_rmse_log_score)
print("Gradient Boosting RMSE Log Score:", gb_rmse_log_score)




Random Forest RMSE Log Score: 0.1535007157133027
Gradient Boosting RMSE Log Score: 0.13878339426154898
