#### Import the models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

#### Import the data

In [2]:
housing_df = pd.read_csv('Housing.csv')
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [3]:
housing_df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished


#### Prepare the data

In [4]:
# Let us write a custom function to encode the categorical attributes.
# This will do Label Binarizer if 2 unique value is present else do dummy transformation.
def convert_to_le(df, cols):
    """
    param: df: Pass the dataframe to transform
    param: cols: Pass the list of columns to transform
    """
    dummy_li = []
    for col_val in cols:
        if df[col_val].nunique() == 2:
            df[col_val] = LabelBinarizer().fit_transform(df[col_val])
        else:
            dummy_li.append(col_val)
    try:
        df = pd.get_dummies(data=df, columns=dummy_li, drop_first=True)
        return df
    except Exception as e:
        print(f'No dummy to convert. {e}')
        return df

In [5]:
#Get categorical columns.
cat_cols = housing_df.select_dtypes(include='object').columns.values

#Transform the data
housing_df_le = convert_to_le(housing_df, cat_cols)
housing_df_le.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0


#### Split the data into train and test and scale it

In [6]:
X = housing_df_le.drop('price',axis=1) # Input
Y = housing_df_le['price'] # Output

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2) # 20% data for testing

In [8]:
# Perform the Min Max scaler on the independent variable
scl = MinMaxScaler()
X_train = scl.fit_transform(X_train) # Fit and transform the train data
X_test = scl.transform(X_test) # Only transform the test data

#### Apply the Statsmodel OLS method

In [9]:
# Linear regression from Statsmodel
X_train_sm = sm.add_constant(X_train) # Add the intercept
sm_linear_model = sm.OLS(Y_train, X_train_sm).fit()

In [10]:
sm_linear_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.68
Model:,OLS,Adj. R-squared:,0.67
Method:,Least Squares,F-statistic:,68.87
Date:,"Tue, 02 Mar 2021",Prob (F-statistic):,1.3599999999999999e-95
Time:,20:40:39,Log-Likelihood:,-6642.3
No. Observations:,436,AIC:,13310.0
Df Residuals:,422,BIC:,13370.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.106e+06,2.23e+05,9.455,0.000,1.67e+06,2.54e+06
x1,3.295e+06,3.71e+05,8.875,0.000,2.57e+06,4.02e+06
x2,5.061e+05,3.78e+05,1.339,0.181,-2.37e+05,1.25e+06
x3,1.874e+06,2.22e+05,8.430,0.000,1.44e+06,2.31e+06
x4,1.386e+06,2.04e+05,6.781,0.000,9.84e+05,1.79e+06
x5,4.214e+05,1.49e+05,2.820,0.005,1.28e+05,7.15e+05
x6,2.538e+05,1.38e+05,1.840,0.066,-1.73e+04,5.25e+05
x7,3.142e+05,1.17e+05,2.685,0.008,8.42e+04,5.44e+05
x8,1.021e+06,2.28e+05,4.477,0.000,5.73e+05,1.47e+06

0,1,2,3
Omnibus:,55.737,Durbin-Watson:,2.158
Prob(Omnibus):,0.0,Jarque-Bera (JB):,115.99
Skew:,0.705,Prob(JB):,6.5e-26
Kurtosis:,5.097,Cond. No.,14.4


In [11]:
#Do the prediction on Statsmodel 
X_test_sm = sm.add_constant(X_test)
y_pred_sm = sm_linear_model.predict(X_test_sm)

In [12]:
# Get the R2 score.
r2_score(Y_test, y_pred_sm)

0.67734417357427

#### Apply the sklearn Linear Regression

In [13]:
sk_linear_model = LinearRegression()

In [14]:
sk_linear_model.fit(X_train, Y_train)

LinearRegression()

In [15]:
y_pred_sk = sk_linear_model.predict(X_test)

In [16]:
r2_score(Y_test, y_pred_sk)

0.6773441735742696

#### Model interpretation

In [17]:
# Get the intercept of the model
sk_linear_model.intercept_

# Get the slopes for all variables.
slope_val = [ '%.2f' % elem for elem in sk_linear_model.coef_ ]

In [18]:
slopes =  [f"{str(slope_val)+'*'+col_name}" for slope_val, col_name in zip(slope_val, X.columns.values)]
linear_regression_equation = ' + '.join(slopes) + " + " +str(sk_linear_model.intercept_)

In [19]:
print(linear_regression_equation)

3295125.11*area + 506109.44*bedrooms + 1874073.79*bathrooms + 1385770.25*stories + 421368.74*mainroad + 253819.75*guestroom + 314233.13*basement + 1021073.02*hotwaterheating + 794828.15*airconditioning + 833078.62*parking + 617166.58*prefarea + -47533.58*furnishingstatus_semi-furnished + -431147.75*furnishingstatus_unfurnished + 2105949.00546309
