In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Data
# Run base.ipynb; `donnees` is not defined anywhere in this notebook, so it is
# presumably created by that run — TODO confirm.
%run base.ipynb
# Feature frame: the columns of `donnees` at positions 1 to 2313 (position 0 is left out).
# NOTE(review): verify that the target column used below is not inside this slice.
df = donnees.iloc[:,1:2314]

In [None]:
# Show the (rows, columns) dimensions of the feature frame.
df.shape

In [None]:
%%time

y = np.log(donnees["Household possessions: Refrigerator"])
X =np.array(df)
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
param_grid = [
        {'alpha': np.linspace(0,1,100), 'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
]


grid_search = GridSearchCV(ElasticNet(), param_grid, cv=3,
                               scoring='r2',
                               return_train_score=True)
grid_search.fit(x_train, y_train)

print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.score(x_test, y_test))

In [None]:
%%time

alpha = grid_search.best_estimator_.alpha
l1_ratio = grid_search.best_estimator_.l1_ratio


regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

# print("Liste des variables sélectionnées :")
# print(list(serie.index))

In [None]:
# Observed values next to model predictions for every row.
# .copy() makes my_df an independent frame, so adding a column below does not
# write to a slice of `donnees`.
my_df = donnees[["id_district", "Household possessions: Refrigerator"]].copy()
# `regr` was fitted on the log of the target, so its output is exponentiated to
# bring the predictions back to the scale of the observed column.
my_df["Household possessions: Refrigerator predicted"] = np.exp(regr.predict(X))
# Correlation between observed and predicted values, both on the original scale.
my_df[["Household possessions: Refrigerator", "Household possessions: Refrigerator predicted"]].corr()

In [None]:
%%time
r2_train = []
r2_test = []
l_alpha = []
l_serie = []
l_taille = []
l_var = []

for alpha in np.linspace(0,1,100):
    regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
    regr.fit(x_train, y_train)
    coef = pd.Series(regr.coef_, index = donnees.columns[1:2314])
    serie = pd.Series(coef[coef!=0])
    r2_train.append(100*regr.score(x_train, y_train))
    y_pred = regr.predict(x_test)
    y_true = y_test
    r2_test.append(100*r2_score(y_true, y_pred))
    l_alpha.append(alpha)
    l_serie.append(serie)
    l_taille.append(len(serie))
     #l_var.append(v_y)
        
df_rsquare = pd.DataFrame({
        #'VariableDemographique':l_var, 
        'r2_train':r2_train, 
        'r2_test':r2_test, 
        'alpha':l_alpha,
        'nb_variables': l_taille})

df_rsquare.sort_values(by="r2_test", ascending=False).head()

In [None]:
# Keep the sweep rows whose test R² is non-negative, ordered by model size.
Refrigerator = (
    df_rsquare
    .loc[df_rsquare["r2_test"] >= 0]
    .sort_values(by="nb_variables")
)

# Train (red) and test (blue) R² against the number of selected variables.
fig, ax = plt.subplots()
Refrigerator.plot(
    x='nb_variables', y='r2_train', ax=ax, color="red",
    ylabel="R2 score", xlim=(0, 120), ylim=(0, 100),
)
Refrigerator.plot(x='nb_variables', y='r2_test', ax=ax, color="blue")

# Dashed guide lines at hard-coded positions. With the axis limits above,
# ymax=0.6404 stops the vertical line at y=64.04 and xmax=18/120 stops the
# horizontal lines at x=18.
guide = dict(ls='--', lw=0.75)
ax.axvline(x=18, ymax=0.6404, c="gray", **guide)
ax.axhline(y=64.04, xmax=18/120, c="gray", **guide)
ax.axhline(y=59.67, xmax=18/120, c="green", **guide)
ax.axvline(x=52, c="y", **guide)

In [None]:
# Save the figure held in `fig` as a PNG at this relative path.
fig.savefig("../resultats_graphiques/refrigerateur.png")

In [None]:
%%time

r2_test_max=Refrigerator["r2_test"].max()

alpha = Refrigerator[Refrigerator["r2_test"]==r2_test_max]["alpha"].values[0]
l1_ratio = grid_search.best_estimator_.l1_ratio


regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

# print("Liste des variables sélectionnées :")
# print(list(serie.index))

In [None]:
# Observed values next to the predictions of the refitted model, for every row.
# .copy() makes my_df an independent frame, so adding a column below does not
# write to a slice of `donnees`.
my_df = donnees[["id_district", "Household possessions: Refrigerator"]].copy()
# `regr` was fitted on the log of the target, so its output is exponentiated to
# bring the predictions back to the scale of the observed column.
my_df["Household possessions: Refrigerator predicted"] = np.exp(regr.predict(X))
# Correlation between observed and predicted values, both on the original scale.
my_df[["Household possessions: Refrigerator", "Household possessions: Refrigerator predicted"]].corr()

In [None]:
# Export the names of the variables with non-zero coefficients (the index of
# `serie`) as a one-column CSV, without the row index.
cdr_refrigerator = pd.DataFrame({"cdr_refrigerator": serie.index.tolist()})
cdr_refrigerator.to_csv("../resultats_tableaux/cdr_refrigerator.csv", index=False)

In [None]:
# Histogram of the observed refrigerator-possession values (no bin count given, so matplotlib's default is used).
plt.hist(my_df["Household possessions: Refrigerator"])

In [None]:
# Histogram of the natural log of the same column, i.e. the transform applied to the regression target.
plt.hist(np.log(my_df["Household possessions: Refrigerator"]))

In [None]:
%%time

y = np.log(donnees["Household possessions: Refrigerator"])
X =np.array(df)
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
param_grid = [
        {'alpha': np.linspace(0,1,100), 'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
]


grid_search = GridSearchCV(ElasticNet(), param_grid, cv=3,
                               scoring='r2',
                               return_train_score=True)
grid_search.fit(x_train, y_train)

print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.score(x_test, y_test))

In [None]:
%%time

cv = KFold(3, random_state=0, shuffle=True)
df = donnees.iloc[:,1:2314]

y = np.log(donnees["Household possessions: Refrigerator"])
sc = StandardScaler()
X =np.array(df)
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
alpha = grid_search.best_estimator_.alpha
l1_ratio = grid_search.best_estimator_.l1_ratio


regr = ElasticNet(random_state=0,alpha=alpha, l1_ratio=l1_ratio)
regr.fit(x_train, y_train)

print("r2_train : ")
print(100*regr.score(x_train, y_train))

y_pred = regr.predict(x_test)
y_true = y_test
print("\n r2_test : ")
print(100*r2_score(y_true, y_pred))

coef = pd.Series(regr.coef_, index = df.columns)
serie = pd.Series(coef[coef!=0])
print("Nombre de variables sélectionnées :")
print(len(serie))

# print("Liste des variables sélectionnées :")
# print(list(serie.index))

In [None]:
# Observed values next to the model's log-scale predictions for every row.
# .copy() makes my_df an independent frame, so adding columns to it does not
# write to a slice of `donnees`.
my_df = donnees[["id_district", "Household possessions: Refrigerator"]].copy()
# `regr` was fitted on the log of the target, hence the "Log" prefix of this column.
my_df["Log Household possessions: Refrigerator predicted"] = regr.predict(X)
my_df

In [None]:
# Correlation matrix (pandas default method) between the observed values and the log-scale predictions.
# NOTE(review): the two columns are on different scales (raw vs log).
my_df[["Household possessions: Refrigerator", "Log Household possessions: Refrigerator predicted"]].corr()

In [None]:
# Back-transform: exponentiate the log-scale predictions to get predictions on the original scale of the target.
my_df["Household possessions: Refrigerator predicted"] = np.exp(my_df["Log Household possessions: Refrigerator predicted"])

In [None]:
# Display the frame, which now also holds the back-transformed prediction column.
my_df

In [None]:
# Correlation between the observed values and the back-transformed (exponentiated)
# predictions, so that both columns are on the original scale of the target.
my_df[["Household possessions: Refrigerator", "Household possessions: Refrigerator predicted"]].corr()

In [None]:
# Side-by-side histograms of the observed target: raw values on the left,
# natural log on the right.
observed = my_df["Household possessions: Refrigerator"]
hist_fig, (ax_raw, ax_log) = plt.subplots(1, 2)

# Left panel: raw values.
ax_raw.hist(observed)
ax_raw.set_title("Refrigerator")
ax_raw.set_xlabel('X-axis ')
ax_raw.set_ylabel('Y-axis ')

# Right panel: log-transformed values (no y-axis label on this panel).
ax_log.hist(np.log(observed))
ax_log.set_title("Log Refrigerator")
ax_log.set_xlabel('X-axis ')

plt.show()

In [None]:
%%time

y = np.log(donnees["Household possessions: Refrigerator"])
X =np.array(df)
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 0)
param_grid = [
        {'alpha': np.linspace(0,1,100), 'l1_ratio': [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
]


grid_search = GridSearchCV(ElasticNet(), param_grid, cv=3,
                               scoring='r2',
                               return_train_score=True)
grid_search.fit(x_train, y_train)

print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.score(x_test, y_test))