# Multiple Linear Regression

In [134]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [135]:
data = pd.read_csv("../../../Data/Dataset_zero_numeric_missing.csv",sep=";")

In [136]:
data.shape

(9445, 11)

### Get feature categories

In [137]:
def categorisation(data,target="price"):
    distance = []
    numerique = []
    categorique = []
    for col in data.columns:
        if data[col].dtypes == np.object:
            try:
                if str(type(eval(data[col][data[col].first_valid_index()]))) == "<class 'list'>":
                    distance.append(col)
                else:
                    categorique.append(col)
            except:
                categorique.append(col)
                
            
        else :
            if col != target:
                numerique.append(col)
                
    return dict({"numerique":numerique,"categorique":categorique,"distance":distance})

feat_cat = categorisation(data)

feat_cat

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data[col].dtypes == np.object:


{'numerique': ['bathrooms', 'bedrooms', 'sampling'],
 'categorique': ['name', 'province', 'city', 'address', 'type_of_property'],
 'distance': ['education', 'transport_and_public_services']}

### Dropping name, address and city because there is high cardinality (cf EDA)

In [138]:
data= data.drop(['name','address','city'],axis=1)
data.columns

Index(['price', 'province', 'type_of_property', 'bathrooms', 'bedrooms',
       'education', 'transport_and_public_services', 'sampling'],
      dtype='object')

### Transform distance features to lenght of list

In [139]:
class transfo_distance:
    def __init__(self,data,features=None):
        self.data = data
        self.method = None
        ### Get features
        self.distance=[]
        if features is None:
            for col in data.columns:
                if data[col].dtypes == np.object:
                    try:
                        if str(type(eval(data[col][data[col].first_valid_index()]))) == "<class 'list'>":
                            self.distance.append(col)
                    except:
                        pass
        else:
            distance=features
        
        ###Transformations : "mean", "median", "max", "min", "len"
    def transformation(self,method):
        self.method = method
        for feat in self.distance:
            serie = []
            for i in data[feat]:
                if str(i) != "nan":
                    serie.append(self.checker(i))
                else :
                    serie.append(np.nan)
            self.data[feat] = serie
            
    def checker(self,value):
        if self.method=="mean":
            return (np.array(eval(value))).mean()
        elif self.method =="median":
            return (np.array(eval(value))).mediane()
        elif self.method =="min":
            return (np.array(eval(value))).min()
        elif self.method =="max":
            return (np.array(eval(value))).max()
        elif self.method =="len":
            return float(len(eval(value)))
        else:
            raise('choose correct method : "mean", "median", "max", "min", "len"')
    
    def get_data(self,method="mean"):
        self.transformation(method)
        return self.data

In [140]:
trans_dist = transfo_distance(data)
data = trans_dist.get_data(method="len")
data.shape

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data[col].dtypes == np.object:


(9445, 8)

### Check length of distances vs price

In [None]:
def pairplot(x, y, **kwargs):
    ax = plt.gca()
    ts = pd.DataFrame({'time': x, 'val': y})
    ts = ts.groupby('time').mean()
    ts.plot(ax=ax)
    plt.xticks(rotation=90)
    
feats = feat_cat["distance"]
f = pd.melt(data, id_vars=['price'], value_vars=feats)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=6)
g = g.map(pairplot, "value", "price")

In [141]:
data= data[~(data['type_of_property'] == 'Vacant Land / Plot')]
data.shape

(9444, 8)

In [None]:
feat_cat = categorisation(data)
categoric_f = feat_cat['categorique']

def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    x=plt.xticks(rotation=90)
f = pd.melt(data, id_vars=['price'], value_vars=categoric_f)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(boxplot, "value", "price")

### Transform categorical data using dummies

In [142]:
data= pd.get_dummies(data)

data.columns

Index(['price', 'bathrooms', 'bedrooms', 'education',
       'transport_and_public_services', 'sampling', 'province_eastern-cape',
       'province_free-state', 'province_gauteng', 'province_kwazulu-natal',
       'province_limpopo', 'province_mpumalanga', 'province_north-west',
       'province_northern-cape', 'province_western-cape',
       'type_of_property_Apartment / Flat', 'type_of_property_House',
       'type_of_property_Townhouse'],
      dtype='object')

### Transform categorical data using label encoding

In [None]:
def label_encoding(data,labels):
    for label in labels:
        values= data[label].unique()
        for value in values:
            code= len(data[data[label]==value])
            data.loc[data[label]==value,label]=code
    return data

feat_cat = categorisation(data)
categoric_f = feat_cat['categorique']

data= label_encoding(data,feat_cat['categorique'])

data.head()


### Target encoding to transform categorical data

In [None]:
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'price']].groupby(feature).mean()['price']
    #print(frame[[feature, 'a']].groupby(feature).mean()['a'])
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    #print(ordering)
    ordering = ordering['ordering'].to_dict()
    #print(ordering)
    
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

In [None]:
qual_encoded = []
for q in feat_cat["categorique"]:  
    encode(data, q)
    qual_encoded.append(q+'_E')
print(qual_encoded)

### Sampling

In [143]:
def split_data(data,target,col="sampling",features_to_keep=None):
    if features_to_keep is not None :
        features_to_keep = features_to_keep + [target]
        data = data[features_to_keep]
        
    unique = data["sampling"].unique()
    for i, v in enumerate(unique):
        if v == 1 :
            data_train = data[data[col]==v]
            y_train = data_train[target]
            X_train = data_train.drop([target,"sampling"],axis=1)
            print("train shape :",X_train.shape)
        elif v == 2 :
            data_val = data[data[col]==v]
            y_val = data_val[target]
            X_val = data_val.drop([target,"sampling"],axis=1)
            print("val shape :",X_val.shape)
        elif v == 3 :
            data_test = data[data[col]==v]
            y_test = data_test[target]
            X_test = data_test.drop([target,"sampling"],axis=1)
            print("test shape :",X_test.shape)
    
            
    return X_train, np.log(y_train), X_val, np.log(y_val), X_test, np.log(y_test)

In [144]:
X_train, y_train, X_val, y_val, X_test, y_test = split_data(data,"price")
data.shape

train shape : (6571, 16)
val shape : (1465, 16)
test shape : (1408, 16)


(9444, 18)

## Using Multipe Linear Regression 

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [145]:
MLR = linear_model.LinearRegression()
MLR.fit(X_train,y_train)

LinearRegression()

In [146]:
y_prediction =  MLR.predict(X_val)
y_prediction

array([13.37328227, 14.41968068, 15.50667346, ..., 14.17716729,
       15.34933505, 14.24985358])

In [147]:
rmse = np.sqrt(mean_squared_error(y_val, y_prediction))
print("The mean squared error (MSE) on test set: {:.4f}".format(rmse))

score=r2_score(y_val,y_prediction)
print('r2 socre is ',score)

The mean squared error (MSE) on test set: 0.4786
r2 socre is  0.6389365779172504
