In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV,learning_curve


In [87]:
X_train = pd.read_csv('csv/X_train.csv')
X_test = pd.read_csv('csv/X_test.csv')
y_train = pd.read_csv('csv/y_train.csv')
y_test = pd.read_csv('csv/y_test.csv') 

# Algorithme de régression linéaire Ridge

### Recherche du meilleur alpha

In [88]:

modelRid = make_pipeline( PolynomialFeatures(degree=2,interaction_only=False),GridSearchCV(Ridge(),param_grid={'alpha': [0,0.01,0.05,0.1,1,10,100,300,400,500,1000,10000]},
                                 scoring='r2',
                                 refit=True))

for exp in range(2):
    affinage = modelRid.fit(X_train,y_train)[1].best_params_["alpha"]
    liste_param = [param for param in np.arange(affinage - (affinage/10**(exp+1)),affinage + (affinage/10**(exp+1)), 10**(-exp))]
    modelRid = make_pipeline( PolynomialFeatures(degree=2,interaction_only=False),GridSearchCV(Ridge(),param_grid={'alpha': liste_param},
                                 scoring='r2',
                                 refit=True))

best_alpha = modelRid.fit(X_train,y_train)[1].best_params_["alpha"]
best_alpha


1109.90999999998

In [89]:
# best_alpha=1109.90999999998

# Utilisation du model Ridge

In [90]:
modelRid = make_pipeline( PolynomialFeatures(degree=2),Ridge(alpha=best_alpha))

## Courbe d'apprentissage

In [91]:
# X = pd.concat([X_train,X_test])
# y = pd.concat([y_train,y_test])
# train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(modelRid,X,y, cv=30,return_times=True)

# plt.plot(train_sizes,np.mean(train_scores,axis=1),'r')

# plt.show()


## Prédiction

In [92]:
modelRid.fit(X_train,y_train)
modelRid.score(X_train,y_train)

0.8632839633237167

In [93]:
modelRid.score(X_test,y_test)

0.8449631816176739

In [94]:
y_prediction = modelRid.predict(X_test)
df_prediction = pd.DataFrame(y_prediction,columns=["prediction"])

In [95]:
df_prediction["resultat"] = y_test['price']

In [96]:
df_prediction["diff"]= ((df_prediction["resultat"] - df_prediction["prediction"])**2)**(1/2)

In [97]:
df_prediction.describe()

Unnamed: 0,prediction,resultat,diff
count,4320.0,4320.0,4320.0
mean,537589.0,541675.7,92260.24
std,326928.8,363904.1,109622.1
min,45493.44,84000.0,1.447045
25%,354349.3,325000.0,29819.32
50%,458377.3,455000.0,64024.53
75%,621981.0,651325.0,115667.0
max,5363575.0,7700000.0,2336425.0


In [119]:
df_prediction[df_prediction["diff"] < 10000].sort_v

Unnamed: 0,prediction,resultat,diff
0,3.386592e+05,338900.0,240.763990
7,5.487137e+05,549995.0,1281.321680
21,1.063835e+06,1070000.0,6164.915900
26,3.378044e+05,335000.0,2804.353923
35,6.285770e+05,635000.0,6422.975397
...,...,...,...
4179,2.293868e+05,233000.0,3613.202142
4248,4.569766e+05,450000.0,6976.606961
4270,3.742651e+05,383000.0,8734.867254
4303,1.616883e+06,1610000.0,6883.054712


In [99]:
# modelLasso = make_pipeline( PolynomialFeatures(2),GridSearchCV(Lasso(),param_grid={'alpha': [0.0001,0.001,0.01,0.05,0.1,0.5,1],"tol":[0.0001,0.001,0.01,0.05,0.1,0.5,1]},
#                                                                 cv=2,refit=True))
# modelLasso.fit(X_train,y_train)

In [100]:
# modelLasso.score(X_test,y_test)

In [101]:
# prediction = modelLasso.predict(X_test)
# prediction

In [102]:
# modelEN = make_pipeline( PolynomialFeatures(2),GridSearchCV(ElasticNet(),param_grid={'alpha': [0.0001,0.001,0.01,0.05,0.1,0.5,1],"tol":[0.0001,0.001,0.01,0.05,0.1,0.5,1]},
#                                                                 cv=2,refit=True))
# modelEN.fit(X_train,y_train)
# print("")

In [103]:
# modelEN.score(X_test,y_test)

In [104]:
# prediction = modelEN.predict(X_test)
# prediction

# LinearRegression

In [110]:
modelLR = make_pipeline( PolynomialFeatures(2),LinearRegression())
modelLR.fit(X_train,y_train)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])

In [111]:
modelLR.score(X_test,y_test)

0.7866050697059064

In [112]:
y_lr_prediction = modelLR.predict(X_test)
y_lr_prediction

array([[319818.6171875 ],
       [433889.87109375],
       [489079.6015625 ],
       ...,
       [178478.3984375 ],
       [434084.765625  ],
       [426530.53515625]])

# Methode naif

In [108]:
modelDummy = DummyRegressor()
modelDummy.fit(X_train,y_train)
modelDummy.score(X_test,y_test)

-2.5762973467680084e-05

In [109]:
prediction = modelDummy.predict(X_test)
prediction

array([539828.85766381, 539828.85766381, 539828.85766381, ...,
       539828.85766381, 539828.85766381, 539828.85766381])