In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

# Load the CSV file
file_path = 'train.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## EDA

In [3]:
# Plot missing values
missing_values = df.isnull().sum().reset_index()
missing_values.columns = ['Column', 'Missing Values']
fig = px.bar(missing_values, x='Column', y='Missing Values', title='Missing Values per Column')
fig.show()

In [4]:
# Univariate Analysis - Distribution of numeric features
numeric_features = df.select_dtypes(include=[np.number]).columns

for col in numeric_features[1:]:
    fig = px.histogram(df, x=col, nbins=50, title=f'Distribution of {col}', marginal='box')
    fig.show()

In [5]:
# Filter out non-numeric columns for correlation matrix
numeric_df = df.select_dtypes(include=[np.number])

# Correlation heatmap
correlation_matrix = numeric_df.corr()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Cividis'  # Using Cividis colorscale
))

fig.update_layout(title='Correlation Heatmap', xaxis_nticks=36)
fig.show()

In [6]:
# Boxplots to detect outliers
for col in numeric_features:
    fig = px.box(df, y=col, title=f'Boxplot of {col}')
    fig.show()

## Data Preparation

In [7]:
# For numeric columns, impute with median
numeric_columns = df.select_dtypes(include=[np.number]).columns
numeric_imputer = SimpleImputer(strategy='median')
df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

# For categorical columns, impute with most frequent value
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

print("Missing values after imputation:")
print(df.isnull().sum().sum())

Missing values after imputation:
0


In [8]:
le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col].astype(str))

print("Categorical columns after encoding:")
print(df[categorical_columns].head())

Categorical columns after encoding:
   MSZoning  Street  Alley  LotShape  LandContour  Utilities  LotConfig  \
0         3       1      0         3            3          0          4   
1         3       1      0         3            3          0          2   
2         3       1      0         0            3          0          4   
3         3       1      0         0            3          0          0   
4         3       1      0         0            3          0          2   

   LandSlope  Neighborhood  Condition1  ...  GarageType  GarageFinish  \
0          0             5           2  ...           1             1   
1          0            24           1  ...           1             1   
2          0             5           2  ...           1             1   
3          0             6           2  ...           5             2   
4          0            15           2  ...           1             1   

   GarageQual  GarageCond  PavedDrive  PoolQC  Fence  MiscFeature  SaleTyp

In [9]:
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

print("Numeric columns after scaling:")
print(df[numeric_columns].describe())

Numeric columns after scaling:
                Id    MSSubClass   LotFrontage       LotArea   OverallQual  \
count  1460.000000  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03   
mean      0.000000 -8.455945e-17  2.798370e-16 -5.840077e-17  1.387018e-16   
std       1.000343  1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00   
min      -1.730865 -8.725628e-01 -2.219047e+00 -9.237292e-01 -3.688413e+00   
25%      -0.865432 -8.725628e-01 -4.479400e-01 -2.969908e-01 -7.951515e-01   
50%       0.000000 -1.631095e-01 -3.922314e-02 -1.040633e-01 -7.183611e-02   
75%       0.865432  3.098594e-01  4.149067e-01  1.087080e-01  6.514792e-01   
max       1.730865  3.147673e+00  1.104155e+01  2.051827e+01  2.821425e+00   

        OverallCond     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1  \
count  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03   
mean   3.540547e-16  1.046347e-15  4.496860e-15 -3.893385e-17 -2.433366e-17   
std    1.000343e+00  1.000343

In [10]:
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

print("Numeric columns after scaling:")
print(df[numeric_columns].describe())

Numeric columns after scaling:
                Id    MSSubClass   LotFrontage       LotArea   OverallQual  \
count  1460.000000  1.460000e+03  1.460000e+03  1.460000e+03  1.460000e+03   
mean      0.000000 -6.083414e-19 -2.798370e-17  2.433366e-18  2.433366e-18   
std       1.000343  1.000343e+00  1.000343e+00  1.000343e+00  1.000343e+00   
min      -1.730865 -8.725628e-01 -2.219047e+00 -9.237292e-01 -3.688413e+00   
25%      -0.865432 -8.725628e-01 -4.479400e-01 -2.969908e-01 -7.951515e-01   
50%       0.000000 -1.631095e-01 -3.922314e-02 -1.040633e-01 -7.183611e-02   
75%       0.865432  3.098594e-01  4.149067e-01  1.087080e-01  6.514792e-01   
max       1.730865  3.147673e+00  1.104155e+01  2.051827e+01  2.821425e+00   

        OverallCond    YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1  \
count  1.460000e+03  1460.000000  1.460000e+03  1.460000e+03  1.460000e+03   
mean  -2.068361e-17     0.000000  1.460019e-17 -3.893385e-17 -1.946692e-17   
std    1.000343e+00     1.000343

In [11]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (1460, 80)
Target shape: (1460,)


In [12]:
# Cell 7: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (1168, 80)
Testing set shape: (292, 80)


## Model Training

In [13]:
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [14]:
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

In [15]:
rf_random.fit(X_train, y_train.values.ravel())

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [16]:
print("Best parameters found:")
print(rf_random.best_params_)

Best parameters found:
{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40}


## Prediction

In [17]:
y_pred = rf_random.predict(X_test)

In [18]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

Mean Squared Error: 0.15376505380917205
Root Mean Squared Error: 0.39212887398044544
R-squared Score: 0.8735694336858014


In [19]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=y_test.values.ravel(), y=y_pred,
                         mode='markers',
                         name='Predictions',
                         marker=dict(color='blue', opacity=0.5)))

fig.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], 
                         y=[y_test.min(), y_test.max()],
                         mode='lines',
                         name='Ideal',
                         line=dict(color='red', dash='dash')))

fig.update_layout(title='Actual vs Predicted House Prices',
                  xaxis_title='Actual Price',
                  yaxis_title='Predicted Price',
                  showlegend=True)

fig.show()

In [20]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_random.best_estimator_.feature_importances_
}).sort_values('importance', ascending=False)

fig = px.bar(feature_importance.head(10), 
             x='feature', y='importance', 
             title='Top 10 Most Important Features')

fig.update_layout(xaxis_title='Features',
                  yaxis_title='Importance',
                  xaxis_tickangle=-45)

fig.show()

In [21]:
residuals = y_test.values.ravel() - y_pred

fig = go.Figure()

fig.add_trace(go.Scatter(x=y_pred, y=residuals,
                         mode='markers',
                         marker=dict(color='blue', opacity=0.5)))

fig.add_hline(y=0, line_dash="dash", line_color="red")

fig.update_layout(title='Residual Plot',
                  xaxis_title='Predicted Values',
                  yaxis_title='Residuals')

fig.show()

In [22]:
fig = px.histogram(residuals, nbins=50,
                   title='Distribution of Residuals')

fig.update_layout(xaxis_title='Residuals',
                  yaxis_title='Count')

fig.show()