# Regression Model to predict sales price with Lot

In [1]:
# load Library

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing


%matplotlib inline

In [2]:
Lot_Sales_DF = pd.read_csv('C:Data/Lot_Sales_DF.csv',
                               sep=',',
                               header=0,
                               index_col=0,
                               )

### Basic Linear Regression

In [3]:
# First do a Linear Regression 

from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
                             Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
reg.score(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
          Lot_Sales_DF.dropna().loc[:,'SalePrice'])



0.24397679484649226

#### Quantile Transformation

In [4]:
# transform the distribution to analyse with a correlation matrix
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal',
                                                         random_state=0)

# Trasform the DF to a array 
Lot_Array = Lot_Sales_DF.dropna().drop('SalePrice', axis=1).values

# quatile trasnformation of the Lot values
Lot_Array = quantile_transformer.fit_transform(Lot_Array)

#Transform the Array in a DF
Lot_DF = pd.DataFrame(Lot_Array,
                      columns=Lot_Sales_DF.dropna().drop('SalePrice', axis=1).columns)

# Linear Regression after quantiletrasnformer

reg_quant = LinearRegression().fit(Lot_DF,
                             Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
reg_quant.score(Lot_DF,
                Lot_Sales_DF.dropna().loc[:,'SalePrice'])



0.2774542075733244

###### Small increase 

#### yeo-johnson transformation

In [5]:
# transform the distribution to analyse with a correlation matrix
pt = preprocessing.PowerTransformer(method='yeo-johnson',
                                    standardize=True)
# Trasform the DF to a array 
Lot_Array = Lot_Sales_DF.dropna().drop('SalePrice', axis=1).values

# quatile trasnformation of the Lot values
Lot_Array = pt.fit_transform(Lot_Array)

#Transform the Array in a DF
Lot_DF = pd.DataFrame(Lot_Array,
                      columns=Lot_Sales_DF.dropna().drop('SalePrice', axis=1).columns)

# Linear Regression after quantiletrasnformer

reg_yeo = LinearRegression().fit(Lot_DF,
                             Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
reg_yeo.score(Lot_DF,
              Lot_Sales_DF.dropna().loc[:,'SalePrice'])



0.24873286150900875

### Ridge regression 

In [6]:
# First Ridge regression  

from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0).fit(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
                             Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
ridge.score(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
            Lot_Sales_DF.dropna().loc[:,'SalePrice'])

0.24384153493139193

In [7]:
# Ridge regression find best alpha 

from sklearn.linear_model import RidgeCV

ridgeCV = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 1e+2]).fit(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
                                                    Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
ridgeCV.score(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
              Lot_Sales_DF.dropna().loc[:,'SalePrice'])

0.24384153492321814

### ridge with quant transformation

In [8]:
# transform the distribution to analyse with a correlation matrix
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal',
                                                         random_state=0)

# Trasform the DF to a array 
Lot_Array = Lot_Sales_DF.dropna().drop('SalePrice', axis=1).values

# quatile trasnformation of the Lot values
Lot_Array = quantile_transformer.fit_transform(Lot_Array)

#Transform the Array in a DF
Lot_DF = pd.DataFrame(Lot_Array,
                      columns=Lot_Sales_DF.dropna().drop('SalePrice', axis=1).columns)

# Ridge CV 
ridgeCV_quant = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 1e+2],
                        gcv_mode='auto',
                        normalize=False,
                        scoring='r2').fit(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
                                                    Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
ridgeCV_quant.score(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
                    Lot_Sales_DF.dropna().loc[:,'SalePrice'])

0.24394276876737553

#### SGDRegressor model

In [25]:
# First Ridge regression  

from sklearn.linear_model import SGDRegressor
SGDReg = SGDRegressor(penalty='elasticnet',
                      alpha=0.01,
                      l1_ratio=0.25,
                      tol=1e-4,
                      random_state=1).fit(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
                                          Lot_Sales_DF.dropna().loc[:,'SalePrice'])

# score of the model
SGDReg.score(Lot_Sales_DF.dropna().drop('SalePrice', axis=1),
            Lot_Sales_DF.dropna().loc[:,'SalePrice'])

-1.9299701119527393e+23

### Well looks like it is not this one XD

In [10]:
from sklearn.metrics import SCORERS

sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [21]:
SGDReg.get_params


<bound method BaseEstimator.get_params of SGDRegressor(alpha=0.01, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.25,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='elasticnet', power_t=0.25,
             random_state=None, shuffle=True, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)>