In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV,learning_curve


In [26]:
# Read the four CSV files (features and targets for the train and test sets)
# into DataFrames. Paths are relative to the notebook's working directory.
X_train, X_test = pd.read_csv('csv/X_train.csv'), pd.read_csv('csv/X_test.csv')
y_train, y_test = pd.read_csv('csv/y_train.csv'), pd.read_csv('csv/y_test.csv')

# Algorithme de régression linéaire Ridge

### Recherche du meilleur alpha

In [27]:

def _ridge_alpha_search(alphas):
    """Build an unfitted pipeline that grid-searches the Ridge ``alpha``.

    The pipeline expands the inputs with degree-2 polynomial features and then
    runs a cross-validated ``GridSearchCV`` (scored with R², refit on the best
    candidate) over the given ``alphas``.

    Parameters
    ----------
    alphas : sequence of float
        Non-empty list of candidate regularisation strengths.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Step 0 is ``PolynomialFeatures``, step 1 is the ``GridSearchCV``.
    """
    return make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=False),
        GridSearchCV(
            Ridge(),
            param_grid={'alpha': alphas},
            scoring='r2',
            refit=True,
        ),
    )


# Coarse pass over a hand-picked grid spanning several orders of magnitude.
modelRid = _ridge_alpha_search([0,0.01,0.05,0.1,1,10,100,300,400,500,1000,10000])

# Two refinement passes around the current best alpha:
#   exp = 0 -> window of +/-10 % with a step of 1
#   exp = 1 -> window of +/-1 %  with a step of 0.1
for exp in range(2):
    # Index 1 of the fitted pipeline is the GridSearchCV step.
    affinage = modelRid.fit(X_train, y_train)[1].best_params_["alpha"]
    half_window = affinage / 10**(exp + 1)
    candidates = np.arange(affinage - half_window, affinage + half_window, 10**(-exp))
    # Always keep the incumbent alpha in the refined grid. The step is absolute
    # while the window is relative, so for a small alpha np.arange yields a
    # single value that is not the incumbent, and for alpha == 0 it yields an
    # empty grid (which GridSearchCV rejects). Including the incumbent keeps
    # the grid non-empty and guarantees a pass never selects a worse alpha.
    liste_param = np.unique(np.append(candidates, affinage)).tolist()
    modelRid = _ridge_alpha_search(liste_param)

# NOTE(review): the windows are narrow (+/-10 %, then +/-1 %), so the selected
# alpha can end up on a window edge; compare best_alpha with the bounds of
# liste_param before trusting it as the optimum.
best_alpha = modelRid.fit(X_train, y_train)[1].best_params_["alpha"]
best_alpha


1109.90999999998

In [28]:
# best_alpha=1109.90999999998

# Utilisation du modèle Ridge

In [29]:
# Final model: degree-2 polynomial expansion followed by a Ridge regressor
# that uses the alpha stored in best_alpha.
modelRid = make_pipeline(
    PolynomialFeatures(degree=2),
    Ridge(alpha=best_alpha),
)

## Courbe d'apprentissage

In [30]:
# X = pd.concat([X_train,X_test])
# y = pd.concat([y_train,y_test])
# train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(modelRid,X,y, cv=30,return_times=True)

# plt.plot(train_sizes,np.mean(train_scores,axis=1),'r')

# plt.show()


## Prédiction

In [31]:
# Fit the Ridge pipeline on the training data and display its score on that
# same data (fit returns the fitted pipeline, so the calls can be chained).
modelRid.fit(X_train, y_train).score(X_train, y_train)

0.8632839633237167

In [32]:
# Score of the fitted Ridge pipeline on X_test / y_test (R² for scikit-learn
# regressors), to compare against the training score.
modelRid.score(X_test,y_test)

0.8449631816176739

In [33]:
# Predict with the fitted Ridge pipeline on the test features, then wrap the
# predictions in a DataFrame with a single "prediction" column.
y_prediction = modelRid.predict(X_test)
df_prediction = pd.DataFrame(data=y_prediction, columns=["prediction"])

In [34]:
# Attach the 'price' column of y_test as "resultat", next to the predictions
# (pandas aligns the two on their row index).
df_prediction = df_prediction.assign(resultat=y_test['price'])

In [35]:
# Absolute error per row between the "resultat" and "prediction" columns.
# Series.abs() replaces the former sqrt-of-square form: it is exact, cannot
# overflow on the intermediate square, and states the intent directly.
df_prediction["diff"] = (df_prediction["resultat"] - df_prediction["prediction"]).abs()

In [36]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# prediction, resultat and diff columns.
df_prediction.describe()

Unnamed: 0,prediction,resultat,diff
count,4320.0,4320.0,4320.0
mean,537589.0,541675.7,92260.24
std,326928.8,363904.1,109622.1
min,45493.44,84000.0,1.447045
25%,354349.3,325000.0,29819.32
50%,458377.3,455000.0,64024.53
75%,621981.0,651325.0,115667.0
max,5363575.0,7700000.0,2336425.0


In [37]:
# Display the prediction frame; the notebook output truncates it to its first
# and last rows.
# NOTE(review): df_prediction.head() would keep the saved output smaller.
df_prediction

Unnamed: 0,prediction,resultat,diff
0,3.386592e+05,338900.0,240.763990
1,4.385706e+05,543000.0,104429.376796
2,4.902482e+05,565000.0,74751.845608
3,1.232500e+06,880000.0,352500.389527
4,5.730274e+05,562000.0,11027.422898
...,...,...,...
4315,5.714627e+05,677900.0,106437.292984
4316,1.778800e+05,192950.0,15069.973109
4317,2.214689e+05,330000.0,108531.144843
4318,4.150892e+05,379900.0,35189.238884


In [38]:
# modelLasso = make_pipeline( PolynomialFeatures(2),GridSearchCV(Lasso(),param_grid={'alpha': [0.0001,0.001,0.01,0.05,0.1,0.5,1],"tol":[0.0001,0.001,0.01,0.05,0.1,0.5,1]},
#                                                                 cv=2,refit=True))
# modelLasso.fit(X_train,y_train)

In [39]:
# modelLasso.score(X_test,y_test)

In [40]:
# prediction = modelLasso.predict(X_test)
# prediction

In [41]:
# modelEN = make_pipeline( PolynomialFeatures(2),GridSearchCV(ElasticNet(),param_grid={'alpha': [0.0001,0.001,0.01,0.05,0.1,0.5,1],"tol":[0.0001,0.001,0.01,0.05,0.1,0.5,1]},
#                                                                 cv=2,refit=True))
# modelEN.fit(X_train,y_train)
# print("")

In [42]:
# modelEN.score(X_test,y_test)

In [43]:
# prediction = modelEN.predict(X_test)
# prediction

# LinearRegression

In [44]:
# Reference model: ordinary least squares on degree-1 polynomial features
# (the original inputs plus the bias column PolynomialFeatures adds).
modelLR = make_pipeline(
    PolynomialFeatures(degree=1),
    LinearRegression(),
)
# Kept as a bare expression so the fitted pipeline is displayed by the cell.
modelLR.fit(X_train, y_train)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=1)),
                ('linearregression', LinearRegression())])

In [45]:
# Score of the linear-regression pipeline on X_test / y_test (R² for
# scikit-learn regressors).
modelLR.score(X_test,y_test)

0.7798584064022618

In [46]:
# Predictions of the linear-regression pipeline for X_test; the bare name on
# the last line displays the resulting array.
y_lr_prediction = modelLR.predict(X_test)
y_lr_prediction

array([[205908.7705559 ],
       [435973.10619321],
       [483549.11303339],
       ...,
       [201365.3557494 ],
       [470367.89682547],
       [445531.55070209]])

# Méthode naïve

In [47]:
# Naive baseline: a DummyRegressor with its default settings, fitted on the
# training data (fit returns the estimator itself, so it can be chained).
modelDummy = DummyRegressor().fit(X_train, y_train)
# Score of the baseline on X_test / y_test, as a floor for the real models.
modelDummy.score(X_test, y_test)

-2.5762973467680084e-05

In [48]:
# Predictions of the dummy baseline for X_test; the bare name on the last
# line displays the resulting array.
prediction = modelDummy.predict(X_test)
prediction

array([539828.85766381, 539828.85766381, 539828.85766381, ...,
       539828.85766381, 539828.85766381, 539828.85766381])