# Second prediction notebook

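Stability over time of the customer-segmentation clusterings (RFM, MSDQ, MSDQ + localisation): data drift between monthly snapshots and choice of a model-maintenance rate.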

* [Imports](#imports)
* [Data loading](#data-loading)
* [Feature selection](#feature-selection)
    * [Selection](#selection-pipeline)

<a name="imports"></a>
## Imports

In [None]:
!pip install git+https://github.com/Xmaster6y/ML-Engineer@develop 

In [None]:
import os
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from time import time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
from sklearn.impute import KNNImputer

from sklearn.dummy import DummyRegressor

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
import helpers

In [None]:
# Display the names exposed by the `helpers` module imported in the previous cell.
dir(helpers)

<a name="data-loading"></a>
## Data loading

In [None]:
data_sub_path = 'data_sub.zip'
if not os.path.exists(data_sub_path):
    !wget "https://drive.google.com/uc?export=download&id=1iLaCRkLt5VwdkJeIFzuy4ZfjgYqJKpDw" -q --show-progress -O "$data_sub_path"
    !unzip $data_sub_path

In [None]:
# Load the `data_sub_00.csv` snapshot and print its columns, dtypes and non-null counts.
# (Plain string literal: the path has no placeholder, so no f-string is needed.)
df = pd.read_csv("data_sub_00.csv")
df.info()

## Data Drift

### Univariate

In [None]:
def _load_labelled(csv_path, label):
    """Read one CSV, store `label` in a "year" column on every row, print the shape, return the frame."""
    frame = pd.read_csv(csv_path)
    frame["year"] = label
    print(frame.shape)
    return frame


# Three snapshots used for the drift plots below, each tagged with its label.
df_0 = _load_labelled("data_sub_00.csv", "1 year")
df_5 = _load_labelled("data_sub_05.csv", "1.5 years")
df_11 = _load_labelled("data_sub_11.csv", "2 years")

In [None]:
# Overlay the log-scaled density (with KDE) of "recency" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["recency"], stat="density", log_scale=True, kde=True, label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the log-scaled density (with KDE) of "amount" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["amount"], stat="density", log_scale=True, kde=True, label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the density (with KDE, linear scale) of "delay" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["delay"], stat="density", kde=True, label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the percentage histograms of "quantity" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["quantity"], stat="percent", label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the percentage histograms of "frequency" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["frequency"], stat="percent", label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the percentage histograms of "mean_satisfaction" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["mean_satisfaction"], stat="percent", label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the percentage histograms of "least_satisfaction" for the `df_0` and `df_11` snapshots.
for snapshot, tag in ((df_0, "1 year"), (df_11, "2 years")):
    sns.histplot(snapshot["least_satisfaction"], stat="percent", label=tag)
plt.legend(loc="best")

In [None]:
# Overlay the percentage histograms of "localisation" for the `df_0` and `df_11` snapshots.
sns.histplot(df_0["localisation"], stat="percent", label="1 year")
# Label spelled "2 years", as in every other histogram of this section.
sns.histplot(df_11["localisation"], stat="percent", label="2 years")
plt.legend(loc="best")
# Tilt the tick labels of the x axis by 45 degrees.
plt.xticks(rotation=45)

### Bi-variate

In [None]:
# "amount" vs "delay" for both snapshots; `df_11` is drawn first so `df_0` is plotted on top of it.
# `data=` is passed by keyword so the call does not depend on the positional order of
# seaborn's arguments, and the legend label uses the same "2 years" spelling as the histograms.
sns.scatterplot(data=df_11, x="amount", y="delay", label="2 years")
sns.scatterplot(data=df_0, x="amount", y="delay", label="1 year")
plt.legend(loc="best")

In [None]:
# "amount" vs "least_satisfaction" for both snapshots; `df_11` is drawn first so `df_0` is plotted on top.
# `data=` is passed by keyword so the call does not depend on the positional order of
# seaborn's arguments, and the legend label uses the same "2 years" spelling as the histograms.
sns.scatterplot(data=df_11, x="amount", y="least_satisfaction", label="2 years")
sns.scatterplot(data=df_0, x="amount", y="least_satisfaction", label="1 year")
plt.legend(loc="best")

## RFM

### Feature selection

In [None]:
# RFM segmentation setup: the three input columns, their standardisation, and the cluster count.
rfm_cols = ["recency", "frequency", "amount"]

# Single-step pipeline: standard scaling of the selected columns.
rfm_tr = Pipeline(steps=[("scaler", StandardScaler())])

# Only the columns listed in `rfm_cols` are kept and transformed.
rfm_prep = ColumnTransformer(transformers=[("rfm_tr", rfm_tr, rfm_cols)])

# Number of KMeans clusters for this section.
# NOTE: `N_C` is reassigned by the later sections of this notebook.
N_C = 3

### Different models

In [None]:
# Simulate several maintenance (re-fit) rates for the RFM segmentation. For each rate `j`,
# the reference model is re-fitted every `j` months and compared, month by month, with a
# model freshly fitted on that month's data (adjusted Rand score between the two labelings).
# NOTE: `N_C` is read from the notebook namespace and is reassigned by later sections;
# run the RFM configuration cell before this one.
rates = [2, 4, 9, 12]
# Read each monthly snapshot once up front rather than once per rate.
monthly_frames = [pd.read_csv(f"data_sub_{i:02d}.csv") for i in range(0, 12)]
to_plot = []
for j in rates:
    scores = []
    for i, df in enumerate(monthly_frames):
        # Month 0 always satisfies i % j == 0, so the reference model is fitted here
        # before its first use; no separate fit is needed ahead of the loop.
        if i % j == 0:
            X_rfm_0 = rfm_prep.fit_transform(df)
            base_model = KMeans(n_clusters=N_C, random_state=42, n_init=3)
            base_model.fit(X_rfm_0)
        # Scale with the preprocessor as fitted at the last maintenance month.
        X_rfm = rfm_prep.transform(df)
        model = KMeans(n_clusters=N_C, random_state=42, n_init=3)
        y_model = model.fit_predict(X_rfm)
        y_base = base_model.predict(X_rfm)
        scores.append(adjusted_rand_score(y_model, y_base))

    # One list of 12 monthly scores per maintenance rate, in the order of `rates`.
    to_plot.append(scores)

    

In [None]:
# One curve per maintenance rate: monthly adjusted Rand score between the periodically
# re-fitted model and the model fitted on the current month.
for r, s in zip(rates, to_plot):
    plt.plot(s, label=f"maintenance rate {r} months")
plt.xlabel("Month")
plt.ylabel("Adjusted Rand score")
plt.legend(loc="best")
# Render the figure (the argument-less `plt.plot()` used before drew nothing).
plt.show()

In [None]:
# Agreement between a reference RFM model (fitted once on `data_sub_00.csv`) and models
# re-fitted on each month's data under several KMeans seeds. For every month, the mean and
# standard deviation over seeds of the (adjusted Rand, adjusted mutual information) scores
# are stored in `mean_scores` and `std_scores`.
df = pd.read_csv("data_sub_00.csv")
X_rfm_0 = rfm_prep.fit_transform(df)
base_model = KMeans(n_clusters=N_C, random_state=42, n_init=3).fit(X_rfm_0)

seeds = [1, 2, 3, 4, 5]
mean_scores, std_scores = [], []
for month in range(12):
    df = pd.read_csv(f"data_sub_{month:02d}.csv")
    # The preprocessor is not re-fitted here: every month uses the scaling fitted above.
    X_rfm = rfm_prep.transform(df)
    y_base = base_model.predict(X_rfm)

    per_seed = []
    for seed in seeds:
        y_model = KMeans(n_clusters=N_C, random_state=seed, n_init=3).fit_predict(X_rfm)
        per_seed.append((
            adjusted_rand_score(y_model, y_base),
            adjusted_mutual_info_score(y_model, y_base),
        ))

    # Regroup the per-seed pairs into one tuple of values per metric.
    ari_values, ami_values = zip(*per_seed)
    mean_scores.append((np.mean(ari_values), np.mean(ami_values)))
    std_scores.append((np.std(ari_values), np.std(ami_values)))

In [None]:
# Split the per-month (adjusted Rand, adjusted mutual information) pairs into one tuple per metric.
ars, amis = map(tuple, zip(*mean_scores))
std_ars, std_amis = map(tuple, zip(*std_scores))

In [None]:
# Mean score per month with its standard deviation over seeds as error bars, for both metrics.
x = list(range(0, 12))
plt.errorbar(x=x, y=ars, yerr=std_ars, label="ARS")
plt.errorbar(x=x, y=amis, yerr=std_amis, label="AMIS")
plt.legend(loc="best")
# Same axis labels as the equivalent figures of the other segmentations in this notebook.
plt.ylabel("Score")
plt.xlabel("Month")
# Render the figure (the argument-less `plt.plot()` used before drew nothing).
plt.show()

## MSDQ

### Feature selection

In [None]:
# MSDQ segmentation setup: the four input columns, their standardisation, and the cluster count.
rdq_cols = ["amount", "least_satisfaction", "delay", "quantity"]

# Single-step pipeline: standard scaling of the selected columns.
rdq_tr = Pipeline(steps=[("scaler", StandardScaler())])

# Only the columns listed in `rdq_cols` are kept and transformed.
rdq_prep = ColumnTransformer(transformers=[("rdq_tr", rdq_tr, rdq_cols)])

# Number of KMeans clusters for this section (overrides the value set for the RFM section).
N_C = 4

### Different models

In [None]:
# Simulate several maintenance (re-fit) rates for the MSDQ segmentation. For each rate `j`,
# the reference model is re-fitted every `j` months and compared, month by month, with a
# model freshly fitted on that month's data (adjusted Rand score between the two labelings).
# NOTE: `N_C` is read from the notebook namespace and is reassigned by other sections;
# run the MSDQ configuration cell before this one.
rates = [2, 4, 9, 12]
# Read each monthly snapshot once up front rather than once per rate.
monthly_frames = [pd.read_csv(f"data_sub_{i:02d}.csv") for i in range(0, 12)]
to_plot = []
for j in rates:
    scores = []
    for i, df in enumerate(monthly_frames):
        # Month 0 always satisfies i % j == 0, so the reference model is fitted here
        # before its first use; no separate fit is needed ahead of the loop.
        if i % j == 0:
            X_rdq_0 = rdq_prep.fit_transform(df)
            base_model = KMeans(n_clusters=N_C, random_state=42, n_init='auto')
            base_model.fit(X_rdq_0)
        # Scale with the preprocessor as fitted at the last maintenance month.
        X_rdq = rdq_prep.transform(df)
        model = KMeans(n_clusters=N_C, random_state=42, n_init='auto')
        y_model = model.fit_predict(X_rdq)
        y_base = base_model.predict(X_rdq)
        scores.append(adjusted_rand_score(y_model, y_base))

    # One list of 12 monthly scores per maintenance rate, in the order of `rates`.
    to_plot.append(scores)

    

In [None]:
# One curve per maintenance rate: monthly adjusted Rand score between the periodically
# re-fitted model and the model fitted on the current month.
for r, s in zip(rates, to_plot):
    plt.plot(s, label=f"maintenance rate {r} months")
plt.xlabel("Month")
plt.ylabel("Adjusted Rand score")
plt.legend(loc="best")
# Render the figure (the argument-less `plt.plot()` used before drew nothing).
plt.show()

In [None]:
# Agreement between a reference MSDQ model (fitted once on `data_sub_00.csv`) and models
# re-fitted on each month's data under several KMeans seeds. For every month, the mean and
# standard deviation over seeds of the (adjusted Rand, adjusted mutual information) scores
# are stored in `mean_scores` and `std_scores`.
df = pd.read_csv("data_sub_00.csv")
X_rdq_0 = rdq_prep.fit_transform(df)
base_model = KMeans(n_clusters=N_C, random_state=42, n_init=3).fit(X_rdq_0)

seeds = [1, 2, 3, 4, 5]
mean_scores, std_scores = [], []
for month in range(12):
    df = pd.read_csv(f"data_sub_{month:02d}.csv")
    # The preprocessor is not re-fitted here: every month uses the scaling fitted above.
    X_rdq = rdq_prep.transform(df)
    y_base = base_model.predict(X_rdq)

    per_seed = []
    for seed in seeds:
        y_model = KMeans(n_clusters=N_C, random_state=seed, n_init=3).fit_predict(X_rdq)
        per_seed.append((
            adjusted_rand_score(y_model, y_base),
            adjusted_mutual_info_score(y_model, y_base),
        ))

    # Regroup the per-seed pairs into one tuple of values per metric.
    ari_values, ami_values = zip(*per_seed)
    mean_scores.append((np.mean(ari_values), np.mean(ami_values)))
    std_scores.append((np.std(ari_values), np.std(ami_values)))

In [None]:
# Split the per-month (adjusted Rand, adjusted mutual information) pairs into one tuple per metric.
ars, amis = map(tuple, zip(*mean_scores))
std_ars, std_amis = map(tuple, zip(*std_scores))

In [None]:
# Mean score per month with its standard deviation over seeds as error bars, for both metrics.
x = list(range(0, 12))
plt.errorbar(x=x, y=ars, yerr=std_ars, label="ARS")
plt.errorbar(x=x, y=amis, yerr=std_amis, label="AMIS")
# Dashed marker at month 3. `axvline` spans the full height of the axes, so the
# y-limits do not have to be saved before drawing and restored afterwards.
plt.axvline(3, color='k', linestyle="--", label="Advised")
plt.legend(loc="best")
plt.ylabel("Score")
plt.xlabel("Month")
# Render the figure (the argument-less `plt.plot()` used before drew nothing).
plt.show()

## MSDQ - loc

### Feature selection

In [None]:
# MSDQ + localisation segmentation setup: numeric columns are standardised,
# the categorical "localisation" column is one-hot encoded.
loc_cat_cols = [
    "localisation"
]

loc_num_cols = [
    "amount",
    "least_satisfaction",
    "delay",
    "quantity"
    ]

loc_cols = loc_num_cols + loc_cat_cols

num_tr = Pipeline([
    ("scaler", StandardScaler())
    ])
# The preprocessor is fitted on one monthly snapshot and then applied to other months.
# With the default handle_unknown="error", a localisation value absent from the fitting
# month makes `transform` raise; "ignore" encodes such rows as all zeros instead.
cat_tr = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])
loc_prep = ColumnTransformer([
    ("num", num_tr, loc_num_cols),
    ("cat", cat_tr, loc_cat_cols),
    ])
# Number of KMeans clusters for this section.
N_C=4

### Different models

In [None]:
# Simulate several maintenance (re-fit) rates for the MSDQ + localisation segmentation.
# For each rate `j`, the reference model is re-fitted every `j` months and compared, month
# by month, with a model freshly fitted on that month's data (adjusted Rand score).
# NOTE: `N_C` is read from the notebook namespace and is reassigned by other sections;
# run this section's configuration cell before this one.
rates = [2, 4, 9, 12]
# Read each monthly snapshot once up front rather than once per rate.
monthly_frames = [pd.read_csv(f"data_sub_{i:02d}.csv") for i in range(0, 12)]
to_plot = []
for j in rates:
    scores = []
    for i, df in enumerate(monthly_frames):
        # Month 0 always satisfies i % j == 0, so the reference model is fitted here
        # before its first use; no separate fit is needed ahead of the loop.
        if i % j == 0:
            X_rdq_0 = loc_prep.fit_transform(df)
            base_model = KMeans(n_clusters=N_C, random_state=42, n_init=3)
            base_model.fit(X_rdq_0)
        # Transform with the preprocessor as fitted at the last maintenance month.
        X_rdq = loc_prep.transform(df)
        model = KMeans(n_clusters=N_C, random_state=42, n_init=3)
        y_model = model.fit_predict(X_rdq)
        y_base = base_model.predict(X_rdq)
        scores.append(adjusted_rand_score(y_model, y_base))

    # One list of 12 monthly scores per maintenance rate, in the order of `rates`.
    to_plot.append(scores)

    

In [None]:
# One curve per maintenance rate: monthly adjusted Rand score between the periodically
# re-fitted model and the model fitted on the current month.
for r, s in zip(rates, to_plot):
    plt.plot(s, label=f"maintenance rate {r} months")
plt.xlabel("Month")
plt.ylabel("Adjusted Rand score")
plt.legend(loc="best")
# Render the figure (the argument-less `plt.plot()` used before drew nothing).
plt.show()

In [None]:
# Agreement between a reference MSDQ + localisation model (fitted once on `data_sub_00.csv`)
# and models re-fitted on each month's data under several KMeans seeds. For every month, the
# mean and standard deviation over seeds of the (adjusted Rand, adjusted mutual information)
# scores are stored in `mean_scores` and `std_scores`.
df = pd.read_csv("data_sub_00.csv")
X_rdq_0 = loc_prep.fit_transform(df)
base_model = KMeans(n_clusters=N_C, random_state=42, n_init=3).fit(X_rdq_0)

seeds = [1, 2, 3, 4, 5]
mean_scores, std_scores = [], []
for month in range(12):
    df = pd.read_csv(f"data_sub_{month:02d}.csv")
    # The preprocessor is not re-fitted here: every month uses the transform fitted above.
    X_rdq = loc_prep.transform(df)
    y_base = base_model.predict(X_rdq)

    per_seed = []
    for seed in seeds:
        y_model = KMeans(n_clusters=N_C, random_state=seed, n_init=3).fit_predict(X_rdq)
        per_seed.append((
            adjusted_rand_score(y_model, y_base),
            adjusted_mutual_info_score(y_model, y_base),
        ))

    # Regroup the per-seed pairs into one tuple of values per metric.
    ari_values, ami_values = zip(*per_seed)
    mean_scores.append((np.mean(ari_values), np.mean(ami_values)))
    std_scores.append((np.std(ari_values), np.std(ami_values)))

In [None]:
# Split the per-month (adjusted Rand, adjusted mutual information) pairs into one tuple per metric.
ars, amis = map(tuple, zip(*mean_scores))
std_ars, std_amis = map(tuple, zip(*std_scores))

In [None]:
# Mean score per month with its standard deviation over seeds as error bars, for both metrics.
x = list(range(0, 12))
plt.errorbar(x=x, y=ars, yerr=std_ars, label="ARS")
plt.errorbar(x=x, y=amis, yerr=std_amis, label="AMIS")
# Dashed marker at month 3. `axvline` spans the full height of the axes, so the
# y-limits do not have to be saved before drawing and restored afterwards.
plt.axvline(3, color='k', linestyle="--", label="Advised")
plt.legend(loc="best")
plt.ylabel("Score")
plt.xlabel("Month")
# Render the figure (the argument-less `plt.plot()` used before drew nothing).
plt.show()