In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
# Blanket filter: every Python warning raised after this line is silenced for
# the rest of the session, whatever its category or origin. Any diagnostic a
# later cell would normally emit as a warning is therefore invisible.
warnings.filterwarnings("ignore")

In [None]:
# Data
# Runs base.ipynb inside this kernel. `donnees` is not defined anywhere in this
# notebook, so it is presumably created by that run -- TODO confirm.
%run base.ipynb
# Feature table: columns at positions 1..2313 of `donnees` (positional slice,
# end-exclusive). NOTE(review): the bounds are hard-coded; confirm they still
# match the layout of `donnees` and that the target column used later
# ("Has electricity: Yes") does not fall inside this slice.
df = donnees.iloc[:,1:2314]

In [None]:
%%time

y = donnees["Has electricity: Yes"]
X =np.array(df)
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
param_grid = [
        {'alpha': np.linspace(0,1,26), 'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
]


grid_search = GridSearchCV(ElasticNet(), param_grid, cv=3,
                               scoring='r2',
                               return_train_score=True)
grid_search.fit(x_train, y_train)

print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.score(x_test, y_test))

In [None]:
%%time

alpha = grid_search.best_estimator_.alpha
l1_ratio = grid_search.best_estimator_.l1_ratio

regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

# print("Liste des variables sélectionnées :")
# print(list(serie.index))

In [None]:
# Observed vs. predicted target over ALL rows (X contains the training rows as
# well as the held-out ones, so this correlation is not an out-of-sample figure).
# .copy() makes my_df an independent frame: adding a column to the result of a
# column selection is the pattern behind pandas' SettingWithCopyWarning.
my_df = donnees[["id_district", "Has electricity: Yes"]].copy()
my_df["Has electricity: Yes predicted"] = regr.predict(X)
my_df[["Has electricity: Yes", "Has electricity: Yes predicted"]].corr()

In [None]:
# Export the index labels of `serie` (the variables with a non-zero
# coefficient) as a one-column CSV, without the row index.
cdr_electricite = pd.DataFrame({"cdr_electricite": list(serie.index)})
cdr_electricite.to_csv("../resultats_tableaux/cdr_electricite.csv", index=False)

In [None]:
%%time
r2_train = []
r2_test = []
l_alpha = []
l_serie = []
l_taille = []
l_var = []

for alpha in np.linspace(0,1,26):
    regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
    regr.fit(x_train, y_train)
    coef = pd.Series(regr.coef_, index = donnees.columns[1:2314])
    serie = pd.Series(coef[coef!=0])
    r2_train.append(100*regr.score(x_train, y_train))
    y_pred = regr.predict(x_test)
    y_true = y_test
    r2_test.append(100*r2_score(y_true, y_pred))
    l_alpha.append(alpha)
    l_serie.append(serie)
    l_taille.append(len(serie))
     #l_var.append(v_y)
        
df_rsquare = pd.DataFrame({
        #'VariableDemographique':l_var, 
        'r2_train':r2_train, 
        'r2_test':r2_test, 
        'alpha':l_alpha,
        'nb_variables': l_taille})

df_rsquare.sort_values(by="r2_test", ascending=False).head()

In [None]:
# Train/test R2 as a function of the number of retained variables.
# Rows with a negative test R2 are dropped; the rest are ordered by model size
# so the curves are drawn left to right.
electricity = (
    df_rsquare[df_rsquare["r2_test"] >= 0]
    .sort_values(by="nb_variables")
)

fig, ax = plt.subplots()
electricity.plot(
    x='nb_variables', y='r2_train', color="red", ax=ax,
    ylabel="R2 score", xlim=(0, 160), ylim=(0, 100),
)
electricity.plot(x='nb_variables', y='r2_test', color="blue", ax=ax)

# Guide lines. ymax / xmax are fractions of the axis length, so with the limits
# set above 0.6970 corresponds to y = 69.70 and 23/160 to x = 23.
# NOTE(review): 23, 69.70 and 63.27 look like the size and train/test R2 of the
# selected model copied from an earlier run, and x=74 is unexplained -- confirm
# and update if the results change.
guide_style = dict(ls='--', lw=0.75)
ax.axvline(x=23, ymax=0.6970, c="gray", **guide_style)
ax.axhline(y=69.70, xmax=23/160, c="gray", **guide_style)
ax.axhline(y=63.27, xmax=23/160, c="green", **guide_style)
ax.axvline(x=74, c="y", **guide_style)

In [None]:
# Write the figure held in `fig` to disk; the format (PNG) follows from the
# file extension. NOTE(review): assumes ../resultats_graphiques/ already exists.
fig.savefig("../resultats_graphiques/electricite.png")