In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Data
# Execute base.ipynb in this namespace. `donnees` (used on the next line) is
# not defined anywhere in this notebook, so it presumably comes from that run.
%run base.ipynb
# Feature table: the columns at positions 1..2313 of `donnees`.
# NOTE(review): 2314 is a hard-coded bound — presumably position 0 is an
# identifier and the target columns start at 2314; confirm against base.ipynb.
df = donnees.iloc[:,1:2314]

In [None]:
%%time

# Paramètres optimaux pour 100 valeurs de alpha

y = donnees["Household possessions: Bicycle"]
X =np.array(df)
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
param_grid = [
        {'alpha': np.linspace(0,1,100), 'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
]


grid_search = GridSearchCV(ElasticNet(), param_grid, cv=3,
                               scoring='r2',
                               return_train_score=True)
grid_search.fit(x_train, y_train)

print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.score(x_test, y_test))


#------------------------------------------------------
alpha = grid_search.best_estimator_.alpha
l1_ratio = grid_search.best_estimator_.l1_ratio

regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

In [None]:
# Observed vs. predicted bicycle ownership for every row of `donnees`.
# .copy() makes my_df100 an independent frame: adding a column to a bare
# column selection can trigger pandas' SettingWithCopyWarning, which the
# global warnings.filterwarnings("ignore") at the top would hide.
my_df100 = donnees[["id_district", "Household possessions: Bicycle"]].copy()
# NOTE(review): X contains the rows the model was trained on, so this
# correlation is not an out-of-sample measure.
my_df100["Household possessions: Bicycle predicted"] = regr.predict(X)
my_df100[["Household possessions: Bicycle", "Household possessions: Bicycle predicted"]].corr()

In [None]:
# Export the names of the features with a non-zero coefficient.
# `serie` is whatever the most recently executed modelling cell left behind;
# in top-to-bottom order that is the 100-alpha grid-search refit.
cdr_velo = pd.DataFrame({"cdr_velo": serie.index.tolist()})
cdr_velo.to_csv("../resultats_tableaux/cdr_velo.csv", index=False)

In [None]:
%%time
r2_train = []
r2_test = []
l_alpha = []
l_serie = []
l_taille = []
l_var = []
cv = KFold(3, random_state=0, shuffle=True)
df = base_cdr.drop(columns="id_district")

v_y = "Household possessions: Bicycle"
y = donnees[v_y]
sc = StandardScaler()
X =np.array(df)
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
for alpha in np.linspace(0,1,100):
    regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=0.1)
    regr.fit(x_train, y_train)
    coef = pd.Series(regr.coef_, index = donnees.columns[1:2314])
    serie = pd.Series(coef[coef!=0])
    r2_train.append(100*regr.score(x_train, y_train))
    y_pred = regr.predict(x_test)
    y_true = y_test
    r2_test.append(100*r2_score(y_true, y_pred))
    l_alpha.append(alpha)
    l_serie.append(serie)
    l_taille.append(len(serie))
    l_var.append(v_y)
        
df_rsquare100 = pd.DataFrame({
        'VariableDemographique':l_var, 
        'r2_train':r2_train, 
        'r2_test':r2_test, 
        'alpha':l_alpha,
        'nb_variables': l_taille})

In [None]:
# Keep the bicycle rows with a non-negative test R2, then show the five best
# rows by test R2.
Bicycle100 = (
    df_rsquare100
    .loc[lambda t: t["VariableDemographique"] == "Household possessions: Bicycle"]
    .loc[lambda t: t["r2_test"] >= 0]
)
Bicycle100.sort_values(by="r2_test", ascending=False).head()

In [None]:
# Train (red) and test (blue) R2, in percent, against the number of retained
# variables for the 100-alpha sweep.
Bicycle100 = Bicycle100.sort_values(by="nb_variables")
fig, ax = plt.subplots()
Bicycle100.plot(
    x='nb_variables', y='r2_train', ax=ax, color="red",
    ylabel="R2 score", ylim=(0, 100), xlim=(0, 2500),
)
Bicycle100.plot(x='nb_variables', y='r2_test', ax=ax, color="blue")
# Guide lines. ymax / xmax are written as fractions of the 0-100 and 0-2500
# limits requested above.
# NOTE(review): 1763, 93.62, 81.84 and 1544 are hard-coded — presumably read
# off a previous run of the sweep; they are not recomputed from Bicycle100.
ax.axvline(x=1763, ymax=0.9362, ls='--', lw=0.75, c="gray")
ax.axhline(y=93.62, xmax=1763/2500, ls='--', lw=0.75, c="gray")
ax.axhline(y=81.84, xmax=1763/2500, ls='--', lw=0.75, c="green")
ax.axvline(x=1544, ls='--', lw=0.75, c="y")

In [None]:
# Write the current `fig` to disk (relative path; the directory must exist).
fig.savefig(fname="../resultats_graphiques/velo3.png")

In [None]:
# Refit at the alpha that gave the highest test R2 in the 100-alpha sweep.
# NOTE(review): alpha is chosen on the test split, so the test R2 printed
# below is not an independent estimate.
r2_test_max = Bicycle100["r2_test"].max()

# First row (in Bicycle100's current order) that reaches the maximum.
alpha = Bicycle100.loc[Bicycle100["r2_test"] == r2_test_max, "alpha"].iloc[0]
l1_ratio = grid_search.best_params_["l1_ratio"]

regr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=0)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100 * regr.score(x_train, y_train))

y_true = y_test
y_pred = regr.predict(x_test)
print("\n r2_test : ")
print(100 * r2_score(y_true, y_pred))

# Label each coefficient with its feature name and keep the non-zero ones.
coef = pd.Series(regr.coef_, index=df.columns)
serie = coef[coef != 0]
print("Nombre de variables sélectionnées :")
print(len(serie))

In [None]:
# Observed vs. predicted bicycle ownership for every row of `donnees`, using
# the model currently held in `regr`.
# .copy() makes my_df100b an independent frame: adding a column to a bare
# column selection can trigger pandas' SettingWithCopyWarning, which the
# global warnings.filterwarnings("ignore") at the top would hide.
my_df100b = donnees[["id_district", "Household possessions: Bicycle"]].copy()
# NOTE(review): X contains the rows the model was trained on, so this
# correlation is not an out-of-sample measure.
my_df100b["Household possessions: Bicycle predicted"] = regr.predict(X)
my_df100b[["Household possessions: Bicycle", "Household possessions: Bicycle predicted"]].corr()

In [None]:
%%time

# Paramètres optimaux pour 20 valeurs de alpha

y = donnees["Household possessions: Bicycle"]
X =np.array(df)
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
param_grid = [
        {'alpha': np.linspace(0,1,20), 'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
]


grid_search = GridSearchCV(ElasticNet(), param_grid, cv=3,
                               scoring='r2',
                               return_train_score=True)
grid_search.fit(x_train, y_train)

print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.score(x_test, y_test))



#------------------------------------------------------------
cv = KFold(3, random_state=0, shuffle=True)
alpha = grid_search.best_estimator_.alpha
l1_ratio = grid_search.best_estimator_.l1_ratio

regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

# print("Liste des variables sélectionnées :")
# print(list(serie.index))

In [None]:
# Observed vs. predicted bicycle ownership for every row of `donnees`, using
# the model currently held in `regr`.
# .copy() makes my_df20 an independent frame: adding a column to a bare
# column selection can trigger pandas' SettingWithCopyWarning, which the
# global warnings.filterwarnings("ignore") at the top would hide.
my_df20 = donnees[["id_district", "Household possessions: Bicycle"]].copy()
# NOTE(review): X contains the rows the model was trained on, so this
# correlation is not an out-of-sample measure.
my_df20["Household possessions: Bicycle predicted"] = regr.predict(X)
# The correlation matrix is computed once; the original repeated this
# statement and discarded the first result.
my_df20[["Household possessions: Bicycle", "Household possessions: Bicycle predicted"]].corr()

In [None]:
%%time
r2_train = []
r2_test = []
l_alpha = []
l_serie = []
l_taille = []
l_var = []
cv = KFold(3, random_state=0, shuffle=True)
df = base_cdr.drop(columns="id_district")

v_y = "Household possessions: Bicycle"
y = donnees[v_y]
sc = StandardScaler()
X =np.array(df)
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
for alpha in np.linspace(0,1,20):
    regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
    regr.fit(x_train, y_train)
    coef = pd.Series(regr.coef_, index = donnees.columns[1:2314])
    serie = pd.Series(coef[coef!=0])
    #r2_train.append(100*cross_val_score(regr, x_train, y_train, cv=cv).mean())
    r2_train.append(100*regr.score(x_train, y_train))
    y_pred = regr.predict(x_test)
    y_true = y_test
    r2_test.append(100*r2_score(y_true, y_pred))
    l_alpha.append(alpha)
    l_serie.append(serie)
    l_taille.append(len(serie))
    l_var.append(v_y)
        
df_rsquare = pd.DataFrame({
        'VariableDemographique':l_var, 
        'r2_train':r2_train, 
        'r2_test':r2_test, 
        'alpha':l_alpha,
        'nb_variables': l_taille})

In [None]:
# Train (red) and test (blue) R2, in percent, against the number of retained
# variables for the 20-alpha sweep; rows with a negative test R2 are dropped.
Bicycle = (
    df_rsquare
    .loc[lambda t: t["VariableDemographique"] == "Household possessions: Bicycle"]
    .loc[lambda t: t["r2_test"] >= 0]
    .sort_values(by="nb_variables")
)
fig, ax = plt.subplots()
Bicycle.plot(x='nb_variables', y='r2_train', ax=ax, color="red", ylabel="R2 score")
Bicycle.plot(x='nb_variables', y='r2_test', ax=ax, color="blue")

In [None]:
# Write the current `fig` to disk (relative path; the directory must exist).
fig.savefig(fname="../resultats_graphiques/velo.png")

In [None]:
# Display the filtered sweep table (rows sorted by nb_variables in the cell
# that built it).
Bicycle

In [None]:
# Same curves as the previous figure, on fixed axes and with guide lines.
Bicycle = (
    df_rsquare
    .loc[lambda t: t["VariableDemographique"] == "Household possessions: Bicycle"]
    .loc[lambda t: t["r2_test"] >= 0]
    .sort_values(by="nb_variables")
)
fig, ax = plt.subplots()
Bicycle.plot(
    x='nb_variables', y='r2_train', ax=ax, color="red",
    ylabel="R2 score", ylim=(0, 100), xlim=(0, 2500),
)
Bicycle.plot(x='nb_variables', y='r2_test', ax=ax, color="blue")
# Guide lines. ymax / xmax are written as fractions of the 0-100 and 0-2500
# limits requested above.
# NOTE(review): 1165, 81.34, 79.25 and 1548 are hard-coded — presumably read
# off a previous run of the sweep; they are not recomputed from Bicycle.
ax.axvline(x=1165, ymax=0.8134, ls='--', lw=0.75, c="gray")
ax.axhline(y=81.34, xmax=1165/2500, ls='--', lw=0.75, c="gray")
ax.axhline(y=79.25, xmax=1165/2500, ls='--', lw=0.75, c="green")
ax.axvline(x=1548, ls='--', lw=0.75, c="y")

In [None]:
# Write the current `fig` to disk (relative path; the directory must exist).
fig.savefig(fname="../resultats_graphiques/velo2.png")

In [None]:
# Alpha of the first sweep row that kept exactly 1165 variables.
# NOTE(review): 1165 is hard-coded; this raises IndexError when no row of
# `Bicycle` has that exact count.
alpha1 = Bicycle.loc[Bicycle["nb_variables"] == 1165, "alpha"].iloc[0]
alpha1

In [None]:
%%time

df = donnees.iloc[:,1:2314]

y = donnees["Household possessions: Bicycle"]
sc = StandardScaler()
X =np.array(df)
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
l1_ratio = grid_search.best_estimator_.l1_ratio


regr = ElasticNet(random_state=0,alpha=alpha1, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

# print("Liste des variables sélectionnées :")
# print(list(serie.index))

In [None]:
# Observed vs. predicted bicycle ownership for every row of `donnees`, using
# the model currently held in `regr`.
# .copy() makes my_df an independent frame: adding a column to a bare column
# selection can trigger pandas' SettingWithCopyWarning, which the global
# warnings.filterwarnings("ignore") at the top would hide.
my_df = donnees[["id_district", "Household possessions: Bicycle"]].copy()
# NOTE(review): X contains the rows the model was trained on, so this
# correlation is not an out-of-sample measure.
my_df["Household possessions: Bicycle predicted"] = regr.predict(X)
my_df[["Household possessions: Bicycle", "Household possessions: Bicycle predicted"]].corr()