# DATA PREPARATION

## Step 1 - Loading the Required Libraries and Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, SGDClassifier, Ridge, Lasso, ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.datasets import load_digits

## Step 2 - Reading the Data and performing Basic Data Checks

In [2]:
df = pd.read_csv('data_cleaning_analysis.csv')
print(df.shape)
df.describe()

(21613, 24)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,month,day,year_rate
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,...,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,...,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652,2014.322954,6.574423,15.688197,0.677046
std,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,...,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631,0.467616,3.115308,8.635063,0.467616
min,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,...,0.0,98001.0,47.1559,-122.519,399.0,651.0,2014.0,1.0,1.0,0.0
25%,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,...,0.0,98033.0,47.471,-122.328,1490.0,5100.0,2014.0,4.0,8.0,0.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,...,0.0,98065.0,47.5718,-122.23,1840.0,7620.0,2014.0,6.0,16.0,1.0
75%,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,...,0.0,98118.0,47.678,-122.125,2360.0,10083.0,2015.0,9.0,23.0,1.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,...,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0,2015.0,12.0,31.0,1.0


#### Ne garder que les paramètres demandés à l'utilisateur

In [3]:
df = df.drop(['date', 'nb_days', 'year_rate', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'year', 'month', 'day'], axis=1)

In [4]:
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,yr_renovated,zipcode
0,221900.0,3,1.00,1180,5650,1.0,0,0,3,7,1180,1955,0,98178
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,1951,1991,98125
2,180000.0,2,1.00,770,10000,1.0,0,0,3,6,770,1933,0,98028
3,604000.0,4,3.00,1960,5000,1.0,0,0,5,7,1050,1965,0,98136
4,510000.0,3,2.00,1680,8080,1.0,0,0,3,8,1680,1987,0,98074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,360000.0,3,2.50,1530,1131,3.0,0,0,3,8,1530,2009,0,98103
21609,400000.0,4,2.50,2310,5813,2.0,0,0,3,8,2310,2014,0,98146
21610,402101.0,2,0.75,1020,1350,2.0,0,0,3,7,1020,2009,0,98144
21611,400000.0,3,2.50,1600,2388,2.0,0,0,3,8,1600,2004,0,98027


## Step 3 - Creating the Training and Test Datasets  

### Diviser data set en training set et test set

In [5]:
# Séparer la target (i.e. price) des autres valeurs (X)
X = df.drop('price', axis=1)
y = df.price

In [6]:
# Split le dataset en spécifiant le pourcentage de data à garder dans le test dataset, ici 20%
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.20)

## Step 4 - Creating Arrays for the Features and the Response Variable

### Convertir les variables catégorielles en objets

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         21613 non-null  float64
 1   bedrooms      21613 non-null  int64  
 2   bathrooms     21613 non-null  float64
 3   sqft_living   21613 non-null  int64  
 4   sqft_lot      21613 non-null  int64  
 5   floors        21613 non-null  float64
 6   waterfront    21613 non-null  int64  
 7   view          21613 non-null  int64  
 8   condition     21613 non-null  int64  
 9   grade         21613 non-null  int64  
 10  sqft_above    21613 non-null  int64  
 11  yr_built      21613 non-null  int64  
 12  yr_renovated  21613 non-null  int64  
 13  zipcode       21613 non-null  int64  
dtypes: float64(3), int64(11)
memory usage: 2.3 MB


In [8]:
# Conversion zipcode en str
df['zipcode'] = df['zipcode'].astype(str)

In [9]:
# Conversion view en str
df['view'] = df['view'].astype(str)

In [10]:
# Conversion condition en str
df['condition'] = df['condition'].astype(str)

In [11]:
# Conversion grade en str
df['grade'] = df['grade'].astype(str)

In [12]:
# Conversion bedrooms en str
df['bedrooms'] = df['bedrooms'].astype(str)

In [13]:
# Conversion bathrooms en str
df['bathrooms'] = df['bathrooms'].astype(str)

In [14]:
# Conversion waterfront en str
df['waterfront'] = df['waterfront'].astype(str)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         21613 non-null  float64
 1   bedrooms      21613 non-null  object 
 2   bathrooms     21613 non-null  object 
 3   sqft_living   21613 non-null  int64  
 4   sqft_lot      21613 non-null  int64  
 5   floors        21613 non-null  float64
 6   waterfront    21613 non-null  object 
 7   view          21613 non-null  object 
 8   condition     21613 non-null  object 
 9   grade         21613 non-null  object 
 10  sqft_above    21613 non-null  int64  
 11  yr_built      21613 non-null  int64  
 12  yr_renovated  21613 non-null  int64  
 13  zipcode       21613 non-null  object 
dtypes: float64(2), int64(5), object(7)
memory usage: 2.3+ MB


### Sélection des colonnes à encoder et à standardiser

In [16]:
# Sélection des colonnes numériques
num_col = list(X.select_dtypes(include=[float, int]).columns)

# Sélection des colonnes catégorielles
cat_col = list(X.select_dtypes(include=[object]).columns)

In [17]:
num_col

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'yr_built',
 'yr_renovated',
 'zipcode']

In [19]:
cat_col

[]

### Encodage de chaque classe de la variable y en une valeur numérique

In [None]:
# ou preprocessing
my_col_trans = ColumnTransformer([
    ("one_hot", OneHotEncoder(), cat_col),
    ("scaling", StandardScaler(), num_col)
])

#### Pour les variables qui ont un lien hiérarchique

In [None]:
# For the exercise purpose... Because it fits strings better
encoder = LabelEncoder()

# Fonctionne avec le y et non le X càd plusieurs variables. Dans ce cas, utiliser OrdinalEncoding(X)
encoder.fit(y)

In [None]:
encoder.classes_

In [None]:
encoder.transform(y)
# ou directement encoder.fit_transform(y)

In [None]:
# Pour décoder les données
encoder.inverse_transform(np.array([0,0,2,1]))

#### Pour les variables catégorielles, i.e. qui n'ont pas de lien hiérarchique

In [None]:
# Créer une matrix compressée
encoder = LabelBinarizer(sparse_output=True)
encoder.fit_transform(y)

In [None]:
my_pipe=make_pipeline(my_col_trans, LinearRegression(), PolynomialFeatures())

## Step 5 - Build, Predict and Evaluate the Regression Model

## Normalisation

Mettre les valeurs quantitatives sur la même échelle, ce qui facilite l'apprentissage des modèles de Machine Learning

In [None]:
# Transformer chaque variable pour qu'elle soit comprise entre 0 et 1
scaler = MinMaxScaler()
scaler.fit_transform(X)

In [None]:
# Transformer les variables pour qu'elles aient une moyenne égale à 0 et un écart type égal à 1
scaler = StandardScaler()
scaler.fit_transform(X)

!!! Attention aux outliers. Si présents, utiliser Robuste 

In [None]:
scaler = RobustScaler()
scaler.fit_transform(X)

## Feature Engineering - Création de polynômes

In [None]:

# X_poly = PolynomialFeatures(3).fit_transform(X)

# model = LinearRegression().fit(X_poly, y)
# y_pred =model.predict(X_poly)

# plt.scatter(X, y)
# plt.plot(X, y_pred, c='r', lw=3)


# Normaliser variables après avoir utilisé les polynomialFeatures

## Linear Regression

In [None]:
# Instantiate the algorithm 
lr = LinearRegression()

# Fits the model on the training set
lr.fit(X_train, y_train)

#### Make the predictions

In [None]:
#predicts on the training set
pred_train_lr= lr.predict(X_train)

#prints the evaluation metrics - RMSE and R-squared - on the training set
print(np.sqrt(mean_squared_error(y_train,pred_train_lr)))
print(r2_score(y_train, pred_train_lr))

#predicts on the training set
pred_test_lr= lr.predict(X_test)

#prints the evaluation metrics - RMSE and R-squared - on the training set
print(np.sqrt(mean_squared_error(y_test,pred_test_lr))) 
print(r2_score(y_test, pred_test_lr))

The above output shows that the RMSE, one of the two evaluation metrics, is 194361 thousand for train data and 180996 thousand for test data. On the other hand, R-squared value is 72 percent for train data and 71.8 percent for test data, which is a good performance. 

## Ridge Regression

In [None]:
rr = Ridge(alpha=0.01)

rr.fit(X_train, y_train) 

pred_train_rr= rr.predict(X_train)

print(np.sqrt(mean_squared_error(y_train,pred_train_rr)))

print(r2_score(y_train, pred_train_rr))


pred_test_rr= rr.predict(X_test)

print(np.sqrt(mean_squared_error(y_test,pred_test_rr))) 

print(r2_score(y_test, pred_test_rr))

The above output shows that the RMSE and R-squared values for the Ridge Regression model on the training data is 975 thousand and 86.7 percent, respectively. For the test data, the result for these metrics is 1017 thousand and 84 percent, respectively.


## Lasso Regression

In [None]:
model_lasso = Lasso(alpha=0.01)

model_lasso.fit(X_train, y_train) 

pred_train_lasso= model_lasso.predict(X_train)

print(np.sqrt(mean_squared_error(y_train,pred_train_lasso)))

print(r2_score(y_train, pred_train_lasso))


pred_test_lasso= model_lasso.predict(X_test)

print(np.sqrt(mean_squared_error(y_test,pred_test_lasso))) 

print(r2_score(y_test, pred_test_lasso))

The above output shows that the RMSE and R-squared values for the Lasso Regression model on the training data is 971 thousand and 86.7 percent, respectively.

The results for these metrics on the test data is 1019 thousand and 84 percent, respectively. Lasso Regression can also be used for feature selection because the coeﬃcients of less important features are reduced to zero. 

## ElasticNet Regression

In [None]:
model_enet = ElasticNet(alpha = 0.01)

model_enet.fit(X_train, y_train) 

pred_train_enet= model_enet.predict(X_train)

print(np.sqrt(mean_squared_error(y_train,pred_train_enet)))

print(r2_score(y_train, pred_train_enet))


pred_test_enet= model_enet.predict(X_test)

print(np.sqrt(mean_squared_error(y_test,pred_test_enet)))

print(r2_score(y_test, pred_test_enet))

The above output shows that the RMSE and R-squared value for the ElasticNet Regression model on the training data is 1352 thousand and 74 percent, respectively. The results for these metrics on the test data is 1379 thousand and 71 percent, respectively. 

## Création de la pipeline (ou chaîne de transformation)

Avantages:  
1. simple à utiliser  
2. sécurisé (évite d'avoir des fuites de données ou des données mal transformées)  
3. permet de faire des cross-validation

In [None]:
model = make_pipeline(StandardScaler(), SGDClassifier())

model.fit(X_train, y_train)
model.predict(X_test)

In [None]:
# Trouver les meilleurs paramètres de la pipeline
model = make_pipeline(PolynomialFeatures(), StandardScaler(), SGDClassifier(random_state=0))
params = {
    'polynomialfeatures__degree' : [2, 3, 4],
    'sgdclassifier__penalty' : ['l1', 'l2']
}

grid = GridSearchCV(model, param_grid=params, cv=4)

grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
grid.score(X_test, y_test)

In [None]:
my_pipe.fit(X_train, y_train)

## Courbes d'apprentissage

In [None]:
#Avec l'estimator

X, y = load_digits(return_X_y=True)
estimator = SVC(gamma=0.001)

train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(estimator, X, y, cv=30,return_times=True)

plt.plot(train_sizes,np.mean(train_scores,axis=1))

In [None]:
# Avec Ridge Regression (rr)

X, y = load_digits(return_X_y=True)
estimator = SVC(gamma=0.001)

train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(rr, X, y, cv=30,return_times=True)

plt.plot(train_sizes,np.mean(train_scores,axis=1))

## Résultat

In [None]:
my_pipe.score(X_test, y_test)

 The performance of the models is summarized below:  
 
    Linear Regression Model: Test set RMSE of 1019 thousand and R-square of 83.96 percent.  

    Ridge Regression Model: Test set RMSE of 1017 thousand and R-square of 84.02 percent.  

    Lasso Regression Model: Test set RMSE of 1019 thousand and R-square of 83.96 percent.  
    
    ElasticNet Regression Model: Test set RMSE of 1379 thousand and R-square of 70.62 percent.  

The ElasticNet Regression model is performing the worst. All the other regression models are performing better with a decent R-squared and stable RMSE values. The most ideal result would be an RMSE value of zero and R-squared value of 1, but that's almost impossible in real economic datasets.  

There are other iterations that can be done to improve model performance. We have assigned the value of alpha to be 0.01, but this can be altered by hyper parameter tuning to arrive at the optimal alpha value. Cross-validation can also be tried along with feature selection techniques. However, that is not covered in this guide which was aimed at enabling individuals to understand and implement the various Linear Regression models using the scikit-learn library. 