# House Prices

### 1. Import Libraries

[link Kaggle for datasets](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### 2. Loading and Analyzing Data

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.drop(columns=["Id"], inplace=True)

In [4]:
#Missing Values
df.isnull().sum()[df.columns[df.isnull().any()]]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
df.shape

(1460, 80)

- As we can see, there are columns with more than 80% of null values. Those columns are not useful for us and we will eliminate them.
- The other columns with nulls will be imputed (mean for numeric columns and 0 for categorical column)

### 3. Cleaning the Data

In [6]:
#Cleaning outliers
def cleaning_outliers(dataframe):
    """
    dataframe = name of your df
    """
    var_num = dataframe.select_dtypes(exclude="object").columns
    for v in var_num:
        Q1 = dataframe[v].quantile(0.25)
        Q3 = dataframe[v].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        dataframe[v] = dataframe[v].apply(lambda x: lower_limit if x < lower_limit else x)
        dataframe[v] = dataframe[v].apply(lambda x: upper_limit if x > upper_limit else x)
    return dataframe

In [7]:
df = cleaning_outliers(df)

In [8]:
#we separate the predictors from the target column
X = df.iloc[:,:-1]
y = df.iloc[:,-1:]

In [9]:
#we will clean the data with a nested function

def process_dataframe(dataframe):
    """
    Performs several operations on the dataframe
    1. Removes columns with more than 1000 null values.
    2. Imputes null values in numeric columns with the average.
    3. Imputes null values in categorical columns with 0 and applies label coding.
    """    
    def drop_missings(data):
        data = data.drop(columns=data.columns[data.isnull().sum()>1000])
        return data
    
    def imput_num(data):
        num_columns=data.select_dtypes(exclude="object").columns
        for n in num_columns:
            data[n] = data[n].fillna(data[n].mean())
        return data 
    
    def imput_cat(data):
        cat_columns=data.select_dtypes(include="object").columns
        data[cat_columns] = data[cat_columns].fillna(0)
    
        for c in cat_columns:
            le = LabelEncoder()
            data[c] = le.fit_transform(data[c].str.lower())
        return data

    dataframe = drop_missings(dataframe)
    dataframe = imput_num(dataframe)
    dataframe = imput_cat(dataframe)

    return dataframe

In [10]:
X = process_dataframe(X)

### 4. Splitting and transforming data

In [11]:
#Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [12]:
#Splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=0)

In [13]:
#selecting the numeric columns to use them later when transforming the data
var_num=df.select_dtypes(exclude="object").columns.drop("SalePrice")

In [14]:
#standardizing numerical variables
for v in var_num:
    sc = StandardScaler()
    X_train[v] = sc.fit_transform(X_train[[v]])
    X_test[v] = sc.transform(X_test[[v]])

### 5. Model

In [15]:
#import libraries
from sklearn.ensemble import RandomForestRegressor

In [16]:
%%time
#Searching for the best n_estimator and the best max_leaf_nodes
best_mae = float("inf")
n_estimators = []
max_leaf_nodes = []

max_leaf=[20, 25, 50, 100, 200, 300]
max_estimator = list(range(1,81,1))

for m in max_leaf:
    for e in max_estimator:
        rf = RandomForestRegressor(n_estimators = e, max_leaf_nodes = m, random_state = 42)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        if mae < best_mae:
            best_mae = mae
            n_estimators = e
            max_leaf_nodes = m

print(f"Best max_leaf_nodes: {max_leaf_nodes}")
print(f"Best n_estimators: {n_estimators}")
print(f"Best MAE: {best_mae}")

Best max_leaf_nodes: 300
Best n_estimators: 80
Best MAE: 15101.193776226868
CPU times: total: 6min 52s
Wall time: 7min 45s


In [17]:
rf = RandomForestRegressor(n_estimators = 80, max_leaf_nodes=300, random_state=42)
rf.fit(X_train, y_train)

In [18]:
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.3f}")
print(f"R2_score {r2:.2%}")

MAE: 15101.194
R2_score 88.71%


## Loading Test Data

In [19]:
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [20]:
#store the column id in df_test_id and drop it for the original df_test
df_test_id = df_test["Id"]
df_test.drop(columns=["Id"], inplace=True)

In [21]:
#Cleaning Outliers from df_test
df_test=cleaning_outliers(df_test)

In [22]:
#Processing the data, nulls and encoding
df_test=process_dataframe(df_test)

In [23]:
#Predicting values
test_predict=rf.predict(df_test)

In [24]:
output = pd.DataFrame({"id":df_test_id, "SalePrice":test_predict})
output.to_csv('submission.csv', index=False)