In [2]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score , make_scorer , mean_squared_log_error
from sklearn.ensemble import  RandomForestRegressor , GradientBoostingRegressor , VotingRegressor , StackingRegressor
from xgboost import XGBRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Location of the house-price CSV. The saved output of this cell shows that this
# path did not exist when the notebook was last run, so edit DATA_PATH to point
# at your copy of the file (presumably "/content" is a Colab upload folder).
DATA_PATH = "/content/data.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Same exception type as before, with a message that says what to do about it.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}; place data.csv there or update DATA_PATH."
    ) from exc


FileNotFoundError: [Errno 2] No such file or directory: '/content/data.csv'

In [None]:
# Lift pandas' display limits (rows, columns, line width) so nothing is truncated,
# then preview the first ten rows.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
df.head(10)



In [None]:
# Per-column dtype and non-null count summary of the loaded frame.
df.info()

In [None]:
# Partition the column names by dtype: 64-bit numeric columns on one side,
# object (string) columns on the other.
numeric_part = df.select_dtypes(include = ['float64', 'int64'])
object_part = df.select_dtypes(include = ['object'])

numerical = numeric_part.columns.tolist()
categorical = object_part.columns.tolist()

print('number of numerical features is: ' , len(numerical))
print('number of categorical features is: ' , len(categorical))

In [None]:
# Summary statistics of the numerical columns, transposed so each feature is a row.
df[numerical].describe().T

In [None]:
# Summary statistics of the categorical columns, transposed so each feature is a row.
df[categorical].describe().T

In [None]:
# One "feature vs SalePrice" scatter plot per numerical column.
# The grid is sized from the number of features instead of assuming 13 x 3 panels,
# so a different column count can no longer run past the end of `axes`.
n_cols = 3
n_rows = max(1, -(-len(numerical) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows))
axes = axes.flatten()

for ax, feature in zip(axes, numerical):
    sns.scatterplot(ax=ax, x=df[feature], y=df['SalePrice'])
    ax.set_title(f'SalePrice vs {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')

# Remove the panels of the last row that received no plot. Slicing by the number
# of features avoids depending on a loop variable left over from the loop above.
for unused_ax in axes[len(numerical):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Outlier detection: plot individual features against SalePrice and drop extreme rows by Id

In [None]:
# The two rows with the highest SalePrice.
df.sort_values('SalePrice', ascending=False).head(2)

In [None]:
# The two rows with the lowest SalePrice.
df.sort_values('SalePrice', ascending=True).head(2)

In [None]:
# Remove the rows singled out above, matching them by their Id values.
outlier_ids = [692, 1183, 1537, 2217]
df = df.drop(df.index[df['Id'].isin(outlier_ids)])

In [None]:
# GrLivArea against SalePrice, price axis limited to 0-800000.
var = 'GrLivArea'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# The three rows with the largest GrLivArea.
df.sort_values('GrLivArea', ascending=False).head(3)

In [None]:
# Remove the rows singled out above, matching them by their Id values.
outlier_ids = [1299, 1170, 524]
df = df.drop(df.index[df['Id'].isin(outlier_ids)])

In [None]:
# LotFrontage against SalePrice, price axis limited to 0-800000.
var = 'LotFrontage'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# The row with the largest LotFrontage.
df.sort_values('LotFrontage', ascending=False).head(1)

In [None]:
# Drop the selected row, matched by Id.
outlier_ids = [935]
df = df.drop(df.index[df['Id'].isin(outlier_ids)])

In [None]:
# MasVnrArea against SalePrice, price axis limited to 0-800000.
var = 'MasVnrArea'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# The row with the largest MasVnrArea.
df.sort_values('MasVnrArea', ascending=False).head(1)

In [None]:
# Drop the selected row, matched by Id.
outlier_ids = [298]
df = df.drop(df.index[df['Id'].isin(outlier_ids)])

In [None]:
# BsmtFinSF1 against SalePrice, price axis limited to 0-800000.
var = 'BsmtFinSF1'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# TotalBsmtSF against SalePrice, price axis limited to 0-800000.
var = 'TotalBsmtSF'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# 1stFlrSF against SalePrice, price axis limited to 0-800000.
var = '1stFlrSF'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# GarageArea against SalePrice, price axis limited to 0-800000.
var = 'GarageArea'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# OpenPorchSF against SalePrice, price axis limited to 0-800000.
var = 'OpenPorchSF'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# The five rows with the largest OpenPorchSF.
df.sort_values('OpenPorchSF', ascending=False).head(5)

In [None]:
# Drop the selected rows, matched by Id.
outlier_ids = [496, 584]
df = df.drop(df.index[df['Id'].isin(outlier_ids)])

In [None]:
# LotArea against SalePrice, price axis limited to 0-800000.
var = 'LotArea'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# The four rows with the largest LotArea.
df.sort_values('LotArea', ascending=False).head(4)

In [None]:
# Drop the selected rows, matched by Id.
outlier_ids = [314, 336, 250, 707]
df = df.drop(df.index[df['Id'].isin(outlier_ids)])

In [None]:
# GarageYrBlt against SalePrice, price axis limited to 0-800000.
var = 'GarageYrBlt'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# WoodDeckSF against SalePrice, price axis limited to 0-800000.
var = 'WoodDeckSF'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# EnclosedPorch against SalePrice, price axis limited to 0-800000.
var = 'EnclosedPorch'
data = df[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# Filling missing values, feature engineering and encoding

In [None]:
# (rows, columns) of the frame at this point.
df.shape

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Percentage of missing values per column; columns without any missing value are
# left out and the rest are listed from most to least incomplete.
missing_pct = df.isnull().sum() * 100 / df.shape[0]
null_data_ratio = missing_pct[missing_pct != 0].sort_values(ascending=False)
null_data_ratio

In [None]:
# One box plot per numeric column, each drawn on its own explicitly created figure.
# pandas, numpy and matplotlib are already imported in the first cell, so they are
# not imported again here.
for column in df.select_dtypes(include=np.number).columns:
    fig, ax = plt.subplots()
    df.boxplot(column=column, ax=ax)
    ax.set_title(f'Box plot of {column}')
    ax.set_ylabel('Value')
    plt.show()
    plt.close(fig)  # release the figure so a long loop does not accumulate open figures

In [None]:
def Missing_values(df):
    """Fill the missing values of the house-price frame and return it.

    Strategy per group of columns:
      * MasVnrType -> most frequent value; MasVnrArea -> column mean.
      * Columns in ``NA_features`` -> the literal category 'No'.
      * Basement / garage counts and areas -> 0.
      * Columns in ``missing_features_2`` -> most frequent value.
      * LotFrontage -> median of the row's Neighborhood, falling back to the
        overall median when a neighborhood has no observed value at all.
      * Functional -> 'Typ'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    # Every fill below is written as `df[col] = df[col].fillna(...)`. The chained
    # form `df[col].fillna(..., inplace=True)` operates on a temporary object under
    # pandas Copy-on-Write and would silently leave the NaNs in the frame.
    df['MasVnrType'] = df['MasVnrType'].fillna(df['MasVnrType'].mode()[0])
    df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].mean())

    # Missing values in these columns are replaced by the category 'No'.
    NA_features = ['PoolQC','BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu', 'GarageFinish',
                 'GarageQual', 'GarageCond', 'GarageType' , 'Fence', 'Alley', 'BsmtFinType1', 'BsmtFinType2',
                 'MiscFeature']
    for col in NA_features:
        df[col] = df[col].fillna('No')

    # Zero imputing missing values:
    missing_features_1 = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF',
                          'BsmtFullBath', 'BsmtHalfBath','GarageArea', 'GarageCars']
    for col in missing_features_1:
        df[col] = df[col].fillna(0)

    # Imputing with the mode:
    missing_features_2 = ['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd',
                         'SaleType', 'Utilities']
    for col in missing_features_2:
        df[col] = df[col].fillna(df[col].mode()[0])

    # LotFrontage: use the median of the houses in the same Neighborhood. A
    # neighborhood without any observed LotFrontage has a NaN median, so the
    # overall median is used as a second step to guarantee no NaN survives.
    overall_median = df["LotFrontage"].median()
    neighborhood_median = df.groupby("Neighborhood")["LotFrontage"].transform("median")
    df["LotFrontage"] = df["LotFrontage"].fillna(neighborhood_median).fillna(overall_median)

    # Functional:
    df['Functional'] = df['Functional'].fillna('Typ')

    return df

In [None]:
def Feature_engineering(df):
    """Add aggregate age / size features and drop the raw columns they replace.

    New columns:
      * Age_of_property, Age_of_remodel, Age_of_garage: (YrSold - <year>) * 12 + MoSold
        for YearBuilt, YearRemodAdd and GarageYrBlt respectively; a missing
        GarageYrBlt yields an Age_of_garage of 0.
      * TotalSF: TotalBsmtSF + 1stFlrSF + 2ndFlrSF.
      * TotalBathrooms: full baths plus half baths counted as 0.5, basement included.
      * TotalPorchSF: sum of the five deck / porch area columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all the source columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, without the source columns listed in ``items_to_drop``.
    """
    df['Age_of_property'] = (df['YrSold'] - df['YearBuilt']) *12 + df['MoSold']
    df['Age_of_remodel'] = (df['YrSold'] - df['YearRemodAdd']) * 12 + df['MoSold']

    # Assign the filled Series back instead of calling `fillna(..., inplace=True)`
    # on `df['Age_of_garage']`: the chained in-place form acts on a temporary under
    # pandas Copy-on-Write and would leave the NaNs in the frame.
    garage_age = (df['YrSold'] - df['GarageYrBlt']) * 12 + df['MoSold']
    df['Age_of_garage'] = garage_age.fillna(0)

    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['TotalBathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                             df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))
    df['TotalPorchSF'] = df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']

    # The raw inputs are redundant with the aggregates above, so they are removed.
    # The drop stays in place so that the caller's frame is updated as before.
    items_to_drop = ['YrSold' , 'YearBuilt' , 'MoSold' ,  'YearRemodAdd' , 'GarageYrBlt' ,  'TotalBsmtSF'
                   , '1stFlrSF' , '2ndFlrSF' ,  'FullBath' , 'HalfBath' , 'BsmtFullBath' , 'BsmtHalfBath' , 'WoodDeckSF' , 'OpenPorchSF'
                     , 'EnclosedPorch' , '3SsnPorch' , 'ScreenPorch']
    df.drop(columns=items_to_drop, inplace=True)

    return df

In [None]:
def cat_to_num(df):
    """Replace the ordered categorical columns with their rank in a fixed scale.

    Each feature below is mapped to the position of its value in the listed
    sequence (first entry -> 0.0, next -> 1.0, ...) using scikit-learn's
    ``OrdinalEncoder``. The encoded values overwrite the original columns of
    ``df``, and the same frame is returned.
    """
    # Feature -> its categories in the order used for encoding.
    ordered_levels = {
        'LotShape': ['IR3' , 'IR2' , 'IR1' , 'Reg'],
        'LandContour': ['Low' , 'HLS' , 'Bnk' , 'Lvl'],
        'LandSlope': ['Sev' , 'Mod' , 'Gtl'],
        'ExterQual': ['Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'ExterCond': ['Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'BsmtQual': ['No' , 'Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'BsmtCond': ['No' , 'Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'BsmtExposure': ['No' , 'Mn' , 'Av' , 'Gd'],
        'BsmtFinType1': ['No' , 'Unf' , 'LwQ' , 'Rec' , 'BLQ' , 'ALQ' , 'GLQ'],
        'BsmtFinType2': ['No' , 'Unf' , 'LwQ' , 'Rec' , 'BLQ' , 'ALQ' , 'GLQ'],
        'HeatingQC': ['Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'Electrical': ['Mix' , 'FuseP' , 'FuseF' , 'FuseA' , 'SBrkr'],
        'KitchenQual': ['Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'FireplaceQu': ['No' , 'Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'GarageFinish': ['No' , 'Unf' , 'RFn' , 'Fin'],
        'GarageQual': ['No' , 'Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'GarageCond': ['No' , 'Po' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'PavedDrive': ['N' , 'P' , 'Y'],
        'PoolQC': ['No' , 'Fa' , 'TA' , 'Gd' , 'Ex'],
        'Fence': ['No' , 'MnWw' , 'GdWo' , 'MnPrv' , 'GdPrv'],
        'CentralAir': ['N' , 'Y'],
    }

    # Dict order is insertion order, so features and category lists stay paired.
    ordinal_features = list(ordered_levels.keys())
    ordinal_categories = list(ordered_levels.values())

    encoder = OrdinalEncoder(categories = ordinal_categories)
    encoded = encoder.fit_transform(df[ordinal_features])
    df[ordinal_features] = encoded

    return df

In [None]:
def transformer(df):
    """Run the preprocessing stages in order and return the resulting frame.

    The frame goes through Missing_values, then Feature_engineering, then
    cat_to_num; each stage receives the output of the previous one.
    """
    return cat_to_num(Feature_engineering(Missing_values(df)))

In [None]:
# Apply imputation, feature engineering and ordinal encoding to the frame.
# Not re-runnable on its own output: Feature_engineering drops columns that the
# pipeline reads, so reload the data before running this cell a second time.
df = transformer(df)

In [None]:
# MSSubClass is handled as a category, so it is turned into strings before the
# one-hot encoding below.
df['MSSubClass'] = df['MSSubClass'].map(str)

nominal_features = [
    'MSSubClass' , 'MSZoning' , 'Street' , 'Alley' , 'Utilities' , 'LotConfig' , 'Neighborhood' ,
    'Condition1' , 'Condition2' , 'BldgType' , 'HouseStyle' , 'RoofStyle' , 'RoofMatl' ,
    'Exterior1st', 'Exterior2nd' , 'MasVnrType' , 'Foundation' , 'Heating' , 'Functional' ,
    'GarageType' , 'MiscFeature' , 'SaleType' ,  'SaleCondition',
]

# One indicator column per category; the first level of each feature is dropped.
df = pd.get_dummies(df, columns=nominal_features, drop_first=True)

# Select the Boolean columns and convert them to integers.
bool_columns = df.select_dtypes(include=['bool']).columns
df = df.astype({col: int for col in bool_columns})

In [None]:
# Feature selection starts here: preview the first 10 rows of the frame.
df.head(10)

In [None]:
# Drop the features whose correlation with SalePrice is weaker than 0.1 in
# absolute value. A feature whose correlation is NaN does not satisfy the
# comparison and is therefore kept.
corr_train = df.corr()
low_corr_features = corr_train.index[corr_train['SalePrice'].abs() < 0.1]
df_filtered = df.drop(columns=low_corr_features)

# Correlations of the remaining features, strongest positive first.
corr_train = df_filtered.corr()
print(corr_train['SalePrice'].sort_values(ascending = False))

# Heatmap on an explicit figure/axes with a single title. The original set the
# title twice, the second call overriding the text of the first, and passed an
# `fmt` that has no effect when annotations are disabled.
fig, ax = plt.subplots(figsize=(34, 30))
sns.heatmap(corr_train, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap', color="red", fontsize=15)
plt.show()

In [None]:
corr_train = df_filtered.corr()
high_corr_pairs_with_values = []

# Identifying features with an absolute correlation higher than 0.7.
# Only pairs (i, j) with j after i are visited, so every unordered pair is
# reported once instead of twice (A - B and B - A) and the diagonal is skipped.
threshold = 0.7
columns = corr_train.columns
for position, feature1 in enumerate(columns):
    for feature2 in columns[position + 1:]:
        corr_value = corr_train.loc[feature1, feature2]
        if abs(corr_value) > threshold:
            high_corr_pairs_with_values.append((feature1, feature2, corr_value))

# Displaying correlated features along with their correlation values
for feature1, feature2, corr_value in high_corr_pairs_with_values:
    print(f" {feature1} - {feature2}: {corr_value:.2f}")

In [None]:
# Features removed by hand after reviewing the highly correlated pairs.
# Each name is listed once (the original repeated 'Exterior1st_VinylSd' and 'MSSubClass_90').
drop_items = ['ExterQual' , 'BsmtFinSF1' , 'GrLivArea' , 'Fireplaces' , 'GarageArea' , 'GarageCond' , 'Exterior1st_VinylSd' ,
              'HouseStyle_2Story' , 'GarageType_No' , 'MSZoning_RL' , 'RoofStyle_Gable' , 'Foundation_CBlock' ,
              'GarageType_Attchd' , 'SaleType_WD' , 'SaleCondition_Partial' , 'BldgType_Duplex' , 'Exterior1st_Wd Sdng' ,
              'MasVnrType_BrkFace' , 'Exterior2nd_CmentBd' , 'Exterior2nd_MetalSd' , 'MSSubClass_90' , 'Age_of_garage' ,
              'MSSubClass_190']

# A name may be absent because the correlation filter already removed it or
# because this cell was run before. Report such names instead of failing with
# a KeyError, and drop only the columns that are present.
missing_items = [item for item in drop_items if item not in df_filtered.columns]
if missing_items:
    print("Not present in df_filtered, skipped:", missing_items)

present_items = [item for item in drop_items if item in df_filtered.columns]
df_filtered = df_filtered.drop(columns=present_items)

In [None]:
# Step 1: Build the design matrix for the VIF computation.
# VIF measures collinearity among the predictors, so the target column is left
# out; a constant (intercept) column is added for the auxiliary regressions.
X = add_constant(df_filtered.drop(columns='SalePrice'))

# Step 2: Convert to a float array once instead of rebuilding it for every feature.
design_matrix = X.to_numpy(dtype=float)

# Step 3: Calculate VIF for each column of the design matrix
vif_data = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])],
})

# Display VIF values
print(vif_data)

In [None]:
# First five rows of the filtered frame.
df_filtered.head()

In [None]:
# Split the filtered frame into the predictors and the value to predict.
features = df_filtered.drop(columns='SalePrice')
target = df_filtered['SalePrice']
feature_names = features.columns

In [None]:
# Standardize every feature to zero mean and unit variance with StandardScaler.
# NOTE(review): the scaler is fit on the complete feature matrix here, i.e. on all rows.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
scaled_features = pd.DataFrame(scaled_values, columns=features.columns)

In [None]:
# Splitting the data into training and test sets.
# The split is made on the unscaled features and the scaler is then fit on the
# training rows only, so no statistics of the test rows leak into the values the
# models are trained on. With the same random_state and row count, the rows
# assigned to each side are the same as when splitting a pre-scaled copy.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(x_train_raw)
x_train = pd.DataFrame(scaler.transform(x_train_raw), columns=features.columns, index=x_train_raw.index)
x_test = pd.DataFrame(scaler.transform(x_test_raw), columns=features.columns, index=x_test_raw.index)

In [None]:
# Baseline: ordinary least-squares regression scored on the held-out set.
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Error metrics on the test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²):", r2)

# Adjusted R-squared penalizes R² for the number of predictors p,
# given n test observations.
n = len(y_test)
p = x_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print("Adjusted R-squared:", adjusted_r2)

# Actual vs predicted prices
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.title("Actual vs Predicted Prices")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()

# Distribution of the residuals (actual minus predicted)
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, bins=10)
plt.title("Residuals Distribution")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Random forest baseline followed by a grid search over its main hyperparameters.
# Everything used here is imported in the notebook's first cell, so nothing is
# imported again in this cell.

# x_train / x_test come out of the earlier scaling and split cells already
# standardized, so they are used as they are. Fitting another StandardScaler
# here would also overwrite the notebook-level `scaler`. The *_scaled names are
# kept as aliases so that code referring to them keeps working.
x_train_scaled = x_train
x_test_scaled = x_test

# Create and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # Adjust hyperparameters as needed
rf_model.fit(x_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(x_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")
print("-----------------------------------------------------------------------")

# Hyperparameter tuning with GridSearchCV: 3 x 3 x 3 x 3 = 81 candidates, 3 folds each.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs = -1)
grid_search.fit(x_train_scaled, y_train)

best_rf_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Score the refit best estimator on the held-out test set
y_pred_tuned = best_rf_model.predict(x_test_scaled)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = mse_tuned**0.5
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"Tuned Mean Squared Error: {mse_tuned}")
print(f"Tuned Root Mean Squared Error: {rmse_tuned}")
print(f"Tuned R-squared: {r2_tuned}")

In [None]:
# Gradient boosting baseline followed by a grid search over its main hyperparameters.
# Everything used here is imported in the notebook's first cell, so nothing is
# imported again in this cell.

# x_train / x_test come out of the earlier scaling and split cells already
# standardized, so they are used as they are. Fitting another StandardScaler
# here would also overwrite the notebook-level `scaler`. The *_scaled names are
# kept as aliases so that code referring to them keeps working.
x_train_scaled = x_train
x_test_scaled = x_test

# Create and train the Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)  # Adjust hyperparameters
gb_model.fit(x_train_scaled, y_train)

# Make predictions
y_pred = gb_model.predict(x_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")
print("-----------------------------------------------------------------------")

# Hyperparameter tuning with GridSearchCV: 3^5 = 243 candidates, 3 folds each
# (729 fits), so this is the slowest step of the cell.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs = -1)
grid_search.fit(x_train_scaled, y_train)

best_gb_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Score the refit best estimator on the held-out test set
y_pred_tuned = best_gb_model.predict(x_test_scaled)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = mse_tuned**0.5
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"Tuned Mean Squared Error: {mse_tuned}")
print(f"Tuned Root Mean Squared Error: {rmse_tuned}")
print(f"Tuned R-squared: {r2_tuned}")