In [1]:
# Download and unpack the Kaggle competition data into the working directory.
#
# SECURITY: an API key used to be embedded in this cell. Any key that has been
# committed to a notebook must be treated as leaked and revoked in the Kaggle
# account settings. Credentials are now taken from wherever the Kaggle CLI
# looks for them itself: the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables or an existing kaggle.json in the user's Kaggle config directory.
import json  # not used in this cell; kept because the original cell exposed it to the notebook namespace
import os
import subprocess
import zipfile

import IPython

project_name = "house-prices-advanced-regression-techniques"
archive_name = project_name + ".zip"

# Capture the CLI output so that a failure is raised with its message instead
# of being wiped by clear_output() below.
download = subprocess.run(
    ["kaggle", "competitions", "download", "-c", project_name],
    capture_output=True,
    text=True,
)
if download.returncode != 0:
    raise RuntimeError(
        "kaggle download failed (exit code {}).\n{}\n{}\n"
        "Check that KAGGLE_USERNAME and KAGGLE_KEY are set, or that a kaggle.json "
        "credentials file is installed.".format(
            download.returncode, download.stdout, download.stderr
        )
    )

# Unpack next to the notebook; the archive is removed only after a successful extraction.
with zipfile.ZipFile(archive_name) as archive:
    archive.extractall()
os.remove(archive_name)

IPython.display.clear_output()

In [None]:
# Print data_description.txt from the working directory via a shell escape.
!cat data_description.txt

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge

In [3]:
class HousePricePrediction():
    """Ridge-regression baseline for the house-price data in train.csv / test.csv.

    Constructing an instance reads both CSV files and, unless ``run_pipeline``
    is False, immediately runs :meth:`pipeline`: the model is fitted, R2 scores
    are printed and predictions for the test file are written to
    ``self.saved_file_name``.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and test CSV files.
    run_pipeline : bool
        When True (default) the whole pipeline runs from the constructor.
    """

    # Year columns that are replaced by "years before the sale".
    YEAR_COLS = ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt')
    # Year that ``YrSold`` is measured back from.
    REFERENCE_YEAR = 2023
    # Placeholder written into missing categorical cells.
    MISSING_CATEGORY = 'Not present'

    def __init__(self, train_path='train.csv', test_path='test.csv', run_pipeline=True):
        self.train = pd.read_csv(train_path)
        self.test = pd.read_csv(test_path)
        # Column groups are derived from the training frame, so they also
        # contain 'Id' and the target.
        self.numerical_features   = list(self.train.select_dtypes(exclude='object').columns)
        self.categorical_features = list(self.train.select_dtypes(include='object').columns)
        self.dropping_cols = ['Id', 'TotalBsmtSF', 'GrLivArea', 'Alley',
                              'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature',
                              'GarageCars', 'GarageYrBlt',
                              'YrSold'
                              ]
        self.target = 'SalePrice'

        self.test_size = 0.2
        self.model = Ridge(alpha= 10.0, random_state= 42, solver= 'saga', tol= 0.1)

        self.predicted_data = pd.DataFrame()

        self.saved_file_name = 'output.csv'

        if run_pipeline:
            self.pipeline()

    def preprocess(self, df):
        """Return a cleaned copy of ``df``; the input frame is left untouched.

        1. Each column in ``YEAR_COLS`` becomes ``YrSold - column``.
        2. ``YrSold`` becomes ``REFERENCE_YEAR - YrSold``.
        3. Missing values are filled: 0 for numerical features and
           ``MISSING_CATEGORY`` for categorical features. The fill mapping is
           kept on ``self.mapper_na_to_val``.
        """
        # Work on a copy: mutating the caller's frame made a second call
        # compute ages from already-transformed columns.
        df = df.copy()

        year_sold = df['YrSold']
        for col in self.YEAR_COLS:
            df[col] = year_sold - df[col]
        df['YrSold'] = self.REFERENCE_YEAR - year_sold

        self.mapper_na_to_val = {col: 0 for col in self.numerical_features}
        self.mapper_na_to_val.update(
            {col: self.MISSING_CATEGORY for col in self.categorical_features}
        )

        # Keys that are not columns of ``df`` (e.g. the target on the test
        # frame) are ignored by fillna.
        return df.fillna(value=self.mapper_na_to_val)

    def encoding(self, df):
        """Target-encode the categorical columns of a preprocessed training frame.

        Every category is replaced by the mean target value of its rows. The
        per-column mappings are stored on ``self.categorical_mapper`` so that
        :meth:`prediction` can reuse them; each mapping also gets a
        ``MISSING_CATEGORY`` entry (the overall target mean) when that
        placeholder did not occur in the column. Returns an encoded copy.
        """
        df = df.copy()
        # Also used by prediction() for categories never seen during training.
        self.unseen_category_value = df[self.target].mean()

        self.categorical_mapper = {}
        for col in self.categorical_features:
            mapping = df.groupby(col)[self.target].mean().to_dict()
            mapping.setdefault(self.MISSING_CATEGORY, self.unseen_category_value)
            self.categorical_mapper[col] = mapping
            df[col] = df[col].map(mapping)
        return df

    def training(self, df):
        """Scale features and target to [0, 1], fit the model and print R2 scores.

        ``df`` must be the output of :meth:`encoding`. The target and
        ``self.dropping_cols`` are removed from the feature matrix; the
        remaining column order is stored on ``self.cols``.

        NOTE(review): the target encoding and both scalers are fitted on the
        full training frame before the hold-out split, so the printed
        "testing" R2 is an optimistic estimate.
        """
        self.scaler_X, self.scaler_Y = MinMaxScaler(), MinMaxScaler()

        y = df[[self.target]]
        X = df.drop(columns=[self.target] + self.dropping_cols)
        self.cols = X.columns

        X, y = self.scaler_X.fit_transform(X), self.scaler_Y.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=self.test_size, random_state=42)

        self.model.fit(X_train, y_train)

        print("R2 score for training data: ", self.model.score(X_train, y_train))
        print("R2 score for testing data: ", self.model.score(X_test, y_test))

    def _encode_features(self, df):
        """Preprocess ``df`` and apply the stored target encoding, in fitted column order."""
        features = self.preprocess(df)
        for col, mapping in self.categorical_mapper.items():
            # Categories that never occurred in training would otherwise stay
            # strings and break the scaler; fall back to the overall value.
            features[col] = features[col].map(mapping).fillna(self.unseen_category_value)
        # Same columns, same order as the frame the scaler was fitted on.
        return features[list(self.cols)]

    def prediction(self, df):
        """Predict the target for ``df`` and write ``Id`` + prediction to CSV.

        The result is stored on ``self.predicted_data`` and saved to
        ``self.saved_file_name``. ``df`` itself is not modified.
        """
        features = self._encode_features(df)

        scaled = np.asarray(self.model.predict(self.scaler_X.transform(features)))
        # inverse_transform needs a 2-D array; the column assignment a 1-D one.
        prices = self.scaler_Y.inverse_transform(scaled.reshape(-1, 1)).ravel()

        self.predicted_data = pd.DataFrame({'Id': df['Id']})
        self.predicted_data[self.target] = prices
        self.predicted_data.to_csv(self.saved_file_name,index=False)

    def pipeline(self):
        """Fit on ``self.train`` and write predictions for ``self.test``."""
        self.training(self.encoding(self.preprocess(self.train)))
        self.prediction(self.test)

    def coef_(self):
        """Return the fitted weights as a frame sorted by absolute value, largest first."""
        weights = pd.DataFrame(
            zip(self.cols, self.model.coef_.flatten()),
            columns=['feature_name', 'weight'],
        )
        return weights.sort_values(
            'weight', ascending=False, key=lambda w: w.abs()
        ).reset_index(drop=True)

    def intercept_(self):
        """Return the fitted model's intercept (in scaled target units)."""
        return self.model.intercept_

In [15]:
# Constructing the object runs the whole pipeline (the constructor calls
# self.pipeline()): it fits the model, prints R2 scores and writes output.csv.
prediction = HousePricePrediction()

R2 score for training data:  0.8501342012982918
R2 score for testing data:  0.8498746666722312


In [None]:
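# Pasted R2 scores kept for reference (presumably from another run/configuration);
# written as comments so this cell no longer raises a SyntaxError on "Run All".
# R2 score for training data:  0.8657467374426706
# R2 score for testing data:  0.876627500886147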

In [6]:
# Ten features with the largest absolute fitted weights (coef_() sorts by |weight|).
prediction.coef_().head(10)

Unnamed: 0,feature_name,weight
0,1stFlrSF,0.277823
1,Utilities,0.119632
2,Condition2,-0.116174
3,2ndFlrSF,0.113374
4,OverallQual,0.102108
5,Neighborhood,0.092377
6,RoofMatl,0.087095
7,KitchenAbvGr,-0.059544
8,OverallCond,0.053805
9,TotRmsAbvGrd,0.052968


In [7]:
# Intercept of the fitted model; the target was min-max scaled before fitting.
prediction.intercept_()

array([-0.26273581])

In [8]:
# Hyper-parameter grids for GridSearchCV, keyed by a short model name.
# Each inner dict maps a constructor argument to the candidate values to try.
model_params = {
    'Ridge': {
        'alpha': [1.0, 10.0, 100.0],
        'tol': [0.001, 0.1, 1.0, 10],
        'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        'random_state': [42],
    },
    'SVR': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3, 4, 5, 6],
        'gamma': ['scale', 'auto'],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'epsilon': [0.1, 0.3, 0.5, 0.7, 0.9],
        'C': [0.01, 0.1, 1, 10],
    },
    'GBR': {
        'loss': ['squared_error', 'huber'],
        'learning_rate': [0.05, 0.1, 0.5],
        'n_estimators': [200, 500, 800],
        'max_features': ['sqrt', 'log2'],
        'alpha': [0.1, 0.5, 0.9],
        'random_state': [42],
    },
}

In [11]:
class HousePricePrediction():
    """Grid-searched Ridge regression for the house-price data in train.csv / test.csv.

    Constructing an instance reads both CSV files and, unless ``run_pipeline``
    is False, immediately runs :meth:`pipeline`: a ``GridSearchCV`` over
    ``params`` is fitted, the best parameters and R2 scores are printed, and
    predictions for the test file are written to ``self.saved_file_name``.

    Parameters
    ----------
    params : dict or None
        Parameter grid for ``GridSearchCV``. ``None`` (default) uses the
        notebook-level ``model_params['Ridge']``.
    train_path, test_path : str
        Locations of the training and test CSV files.
    run_pipeline : bool
        When True (default) the whole pipeline runs from the constructor.
    """

    # Year columns that are replaced by "years before the sale".
    YEAR_COLS = ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt')
    # Year that ``YrSold`` is measured back from.
    REFERENCE_YEAR = 2023
    # Placeholder written into missing categorical cells.
    MISSING_CATEGORY = 'Not present'

    def __init__(self, params=None, train_path='train.csv', test_path='test.csv',
                 run_pipeline=True):
        # Basic config
        self.train = pd.read_csv(train_path)
        self.test = pd.read_csv(test_path)
        # Column groups are derived from the training frame, so they also
        # contain 'Id' and the target.
        self.numerical_features   = list(self.train.select_dtypes(exclude='object').columns)
        self.categorical_features = list(self.train.select_dtypes(include='object').columns)
        self.dropping_cols = ['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
        self.target = 'SalePrice'

        # Training test split
        self.test_size = 0.15

        # Choosing Model
        self.model = Ridge()

        # GridSearchCV parameters
        self.cv = 5
        self.params = model_params['Ridge'] if params is None else params

        # For prediction and saving data
        self.predicted_data = pd.DataFrame()
        self.saved_file_name = 'output.csv'

        # Start training your model
        if run_pipeline:
            self.pipeline()

    def preprocess(self, df):
        """Return a cleaned copy of ``df``; the input frame is left untouched.

        1. Each column in ``YEAR_COLS`` becomes ``YrSold - column``.
        2. ``YrSold`` becomes ``REFERENCE_YEAR - YrSold``.
        3. Missing values are filled: 0 for numerical features and
           ``MISSING_CATEGORY`` for categorical features. The fill mapping is
           kept on ``self.mapper_na_to_val``.
        """
        # Work on a copy: mutating the caller's frame made a second call
        # compute ages from already-transformed columns.
        df = df.copy()

        year_sold = df['YrSold']
        for col in self.YEAR_COLS:
            df[col] = year_sold - df[col]
        df['YrSold'] = self.REFERENCE_YEAR - year_sold

        self.mapper_na_to_val = {col: 0 for col in self.numerical_features}
        self.mapper_na_to_val.update(
            {col: self.MISSING_CATEGORY for col in self.categorical_features}
        )

        # Keys that are not columns of ``df`` (e.g. the target on the test
        # frame) are ignored by fillna.
        return df.fillna(value=self.mapper_na_to_val)

    def encoding(self, df):
        """Target-encode the categorical columns of a preprocessed training frame.

        Every category is replaced by the mean target value of its rows. The
        per-column mappings are stored on ``self.categorical_mapper`` so that
        :meth:`prediction` can reuse them; each mapping also gets a
        ``MISSING_CATEGORY`` entry (the overall target median) when that
        placeholder did not occur in the column. Returns an encoded copy.
        """
        df = df.copy()
        # Also used by prediction() for categories never seen during training.
        self.unseen_category_value = df[self.target].median()

        self.categorical_mapper = {}
        for col in self.categorical_features:
            mapping = df.groupby(col)[self.target].mean().to_dict()
            mapping.setdefault(self.MISSING_CATEGORY, self.unseen_category_value)
            self.categorical_mapper[col] = mapping
            df[col] = df[col].map(mapping)
        return df

    def training(self, df):
        """Scale to [0, 1], run the grid search and print the best parameters and R2 scores.

        ``df`` must be the output of :meth:`encoding`. The target and
        ``self.dropping_cols`` are removed from the feature matrix; the
        remaining column order is stored on ``self.cols`` and the fitted
        search on ``self.grid_search``.

        NOTE(review): the target encoding and both scalers are fitted on the
        full training frame before the hold-out split, so the printed
        "testing" R2 is an optimistic estimate.
        """
        self.scaler_X, self.scaler_Y = MinMaxScaler(), MinMaxScaler()

        y = df[[self.target]]
        X = df.drop(columns=[self.target] + self.dropping_cols)
        self.cols = X.columns

        X, y = self.scaler_X.fit_transform(X), self.scaler_Y.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=self.test_size, random_state=42)

        self.grid_search = GridSearchCV(self.model, self.params, cv=self.cv)
        self.grid_search.fit(X_train, y_train)
        print("Best parameters: ")
        print(self.grid_search.best_params_)
        print("R2 score for training data: ", self.grid_search.score(X_train, y_train))
        print("R2 score for testing data: ", self.grid_search.score(X_test, y_test))

    def _encode_features(self, df):
        """Preprocess ``df`` and apply the stored target encoding, in fitted column order."""
        features = self.preprocess(df)
        for col, mapping in self.categorical_mapper.items():
            # Categories that never occurred in training would otherwise stay
            # strings and break the scaler; fall back to the overall value.
            features[col] = features[col].map(mapping).fillna(self.unseen_category_value)
        # Same columns, same order as the frame the scaler was fitted on.
        return features[list(self.cols)]

    def prediction(self, df):
        """Predict the target for ``df`` and write ``Id`` + prediction to CSV.

        The result is stored on ``self.predicted_data`` and saved to
        ``self.saved_file_name``. ``df`` itself is not modified.
        """
        features = self._encode_features(df)

        scaled = np.asarray(self.grid_search.predict(self.scaler_X.transform(features)))
        # inverse_transform needs a 2-D array; the column assignment a 1-D one.
        prices = self.scaler_Y.inverse_transform(scaled.reshape(-1, 1)).ravel()

        self.predicted_data = pd.DataFrame({'Id': df['Id']})
        self.predicted_data[self.target] = prices
        self.predicted_data.to_csv(self.saved_file_name,index=False)

    def pipeline(self):
        """Fit on ``self.train`` and write predictions for ``self.test``."""
        self.training(self.encoding(self.preprocess(self.train)))
        self.prediction(self.test)

    def coef_(self):
        """Return the best estimator's weights sorted by absolute value, largest first."""
        weights = pd.DataFrame(
            zip(self.cols, self.grid_search.best_estimator_.coef_.flatten()),
            columns=['feature_name', 'weight'],
        )
        return weights.sort_values(
            'weight', ascending=False, key=lambda w: w.abs()
        ).reset_index(drop=True)

    def intercept_(self):
        """Return the best estimator's intercept (in scaled target units)."""
        return self.grid_search.best_estimator_.intercept_

In [12]:
# Uses the grid-search variant of the class defined in the previous cell; the
# constructor runs the search, prints the best parameters and R2 scores, and
# writes output.csv.
prediction = HousePricePrediction()

Best parameters: 
{'alpha': 1.0, 'random_state': 42, 'solver': 'saga', 'tol': 0.1}
R2 score for training data:  0.8625711656151438
R2 score for testing data:  0.8615511038267716
