In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GridSearchCV,learning_curve


In [2]:
# Load the pre-split feature tables and target tables from the csv/ directory.
X_train, X_test = pd.read_csv('csv/X_train.csv'), pd.read_csv('csv/X_test.csv')
y_train, y_test = pd.read_csv('csv/y_train.csv'), pd.read_csv('csv/y_test.csv')

# Algorithme de régression linéaire Ridge

### Recherche du meilleur alpha

In [3]:

def _build_ridge_search(alphas):
    """Build the degree-2 polynomial pipeline whose last step grid-searches Ridge's alpha.

    Parameters
    ----------
    alphas : sequence of float
        Candidate regularisation strengths handed to the grid search.

    Returns
    -------
    Pipeline
        ``PolynomialFeatures(degree=2, interaction_only=False)`` followed by a
        ``GridSearchCV`` over ``Ridge`` scored with ``'r2'`` and ``refit=True``.
    """
    return make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=False),
        GridSearchCV(Ridge(), param_grid={'alpha': list(alphas)}, scoring='r2', refit=True),
    )


# Coarse pass over a wide grid of candidate alphas.
modelRid = _build_ridge_search([0, 0.01, 0.05, 0.1, 1, 10, 100, 300, 400, 500, 1000, 10000])

# Two refinement passes centred on the current best alpha:
#   exp=0 -> window of +/- alpha/10,  scanned with step 1
#   exp=1 -> window of +/- alpha/100, scanned with step 0.1
for exp in range(2):
    # Index [1] is the GridSearchCV step of the pipeline (fit() returns the pipeline itself).
    affinage = modelRid.fit(X_train, y_train)[1].best_params_["alpha"]
    half_width = affinage / 10**(exp + 1)
    liste_param = list(np.arange(affinage - half_width, affinage + half_width, 10**(-exp)))
    if not liste_param:
        # A best alpha of 0 gives a zero-width window, hence an empty grid, which
        # GridSearchCV refuses; keep the current best as the only candidate instead.
        liste_param = [affinage]
    modelRid = _build_ridge_search(liste_param)

# NOTE(review): np.arange excludes the upper bound, and the recorded result (1109.91) is the
# last point of the final window, so the optimum may lie above the scanned range — consider
# widening the window when the best value sits on its edge.
best_alpha = modelRid.fit(X_train, y_train)[1].best_params_["alpha"]
best_alpha


1109.90999999998

In [4]:
# Pinned alpha: same value as the search output shown above
# (presumably hard-coded so the search cell need not be re-run — confirm).
best_alpha = 1109.90999999998

# Utilisation du modèle Ridge

In [5]:
# Final Ridge model: degree-2 polynomial expansion, then Ridge at the selected alpha.
modelRid = make_pipeline(
    PolynomialFeatures(degree=2),
    Ridge(alpha=best_alpha),
)

## Courbe d'apprentissage

In [6]:
# Disabled learning-curve experiment, kept for reference. If re-enabled it would
# run learning_curve with cv=30 on the concatenated train+test data and plot only
# the mean training score (red line) against the training-set size.
# X = pd.concat([X_train,X_test])
# y = pd.concat([y_train,y_test])
# train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(modelRid,X,y, cv=30,return_times=True)

# plt.plot(train_sizes,np.mean(train_scores,axis=1),'r')

# plt.show()


## Prédiction

In [7]:
# Fit the Ridge pipeline on the training split and display its score on that same split
# (fit() returns the fitted pipeline, so the two calls can be chained).
modelRid.fit(X_train, y_train).score(X_train, y_train)

0.7604421854481236

In [8]:
# Score of the fitted Ridge pipeline on the held-out split.
modelRid.score(X_test, y_test)

0.7550741828118339

In [9]:
# Predict on the held-out features and wrap the result in a one-column frame.
y_prediction = modelRid.predict(X_test)
df_prediction = pd.DataFrame(data=y_prediction, columns=["prediction"])

In [10]:
# Attach the true prices next to the predictions (aligned on the row index).
df_prediction = df_prediction.assign(resultat=y_test['price'])

In [11]:
# Absolute error per row. Series.abs() replaces the former sqrt-of-square round trip,
# which computed the same quantity with extra floating-point work.
df_prediction["diff"] = (df_prediction["resultat"] - df_prediction["prediction"]).abs()

In [12]:
# Summary statistics for every column of the prediction frame.
df_prediction.describe()

Unnamed: 0,prediction,resultat,diff
count,4320.0,4320.0,4320.0
mean,538966.9,541675.7,127921.9
std,317321.1,363904.1,126754.9
min,83471.18,84000.0,35.79502
25%,360206.7,325000.0,49996.58
50%,445573.9,455000.0,98895.76
75%,609505.3,651325.0,161836.3
max,6212519.0,7700000.0,1487481.0


In [13]:
# Display the prediction frame (the rendered output below is truncated in the middle).
df_prediction

Unnamed: 0,prediction,resultat,diff
0,3.688672e+05,338900.0,29967.174306
1,4.362454e+05,543000.0,106754.645173
2,4.411790e+05,565000.0,123821.043477
3,1.099474e+06,880000.0,219473.521872
4,6.113648e+05,562000.0,49364.775793
...,...,...,...
4315,4.852960e+05,677900.0,192603.980864
4316,3.516158e+05,192950.0,158665.835381
4317,4.276354e+05,330000.0,97635.404842
4318,3.752362e+05,379900.0,4663.802787


In [14]:
# Disabled Lasso experiment, kept for reference: degree-2 polynomial features followed by
# a 2-fold grid search over Lasso's `alpha` and `tol`.
# modelLasso = make_pipeline( PolynomialFeatures(2),GridSearchCV(Lasso(),param_grid={'alpha': [0.0001,0.001,0.01,0.05,0.1,0.5,1],"tol":[0.0001,0.001,0.01,0.05,0.1,0.5,1]},
#                                                                 cv=2,refit=True))
# modelLasso.fit(X_train,y_train)

In [15]:
# modelLasso.score(X_test,y_test)

In [16]:
# prediction = modelLasso.predict(X_test)
# prediction

In [17]:
# Disabled ElasticNet experiment, kept for reference: degree-2 polynomial features followed
# by a 2-fold grid search over ElasticNet's `alpha` and `tol`.
# The trailing print("") presumably only hides the fitted-pipeline repr — confirm.
# modelEN = make_pipeline( PolynomialFeatures(2),GridSearchCV(ElasticNet(),param_grid={'alpha': [0.0001,0.001,0.01,0.05,0.1,0.5,1],"tol":[0.0001,0.001,0.01,0.05,0.1,0.5,1]},
#                                                                 cv=2,refit=True))
# modelEN.fit(X_train,y_train)
# print("")

In [18]:
# modelEN.score(X_test,y_test)

In [19]:
# prediction = modelEN.predict(X_test)
# prediction

# LinearRegression

In [20]:
# Baseline: degree-1 PolynomialFeatures step followed by ordinary LinearRegression,
# fitted on the training split; the fitted pipeline is shown as the cell output.
modelLR = make_pipeline(PolynomialFeatures(degree=1), LinearRegression()).fit(X_train, y_train)
modelLR

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=1)),
                ('linearregression', LinearRegression())])

In [21]:
# Score of the linear baseline on the held-out split.
modelLR.score(X_test, y_test)

0.7138792459008029

In [22]:
# Predictions of the linear baseline on the held-out features, displayed as the cell output.
y_lr_prediction = modelLR.predict(X_test)
y_lr_prediction

array([[334941.02072795],
       [437327.21115209],
       [430469.05269508],
       ...,
       [410063.0346221 ],
       [393577.59207034],
       [337899.1070047 ]])

# Méthode naïve

In [23]:
# Naive reference model: DummyRegressor with its default settings, fitted on the
# training split and scored on the held-out split.
modelDummy = DummyRegressor().fit(X_train, y_train)
modelDummy.score(X_test, y_test)

-2.5762973467680084e-05

In [24]:
# Predictions of the dummy model on the held-out features (the output below shows a single repeated value).
prediction = modelDummy.predict(X_test)
prediction

array([539828.85766381, 539828.85766381, 539828.85766381, ...,
       539828.85766381, 539828.85766381, 539828.85766381])