# Construction d'un modèle baseline
- On va créer un premier modèle très rapide pour voir les premières performances que l'on peut espérer

### Chargement des données

In [None]:
import pandas as pd
from src.data_eng.prepare_db import *

# Load the prepared dataset with the project helper (presumably provided by the
# `src.data_eng.prepare_db` star import) and report how many rows were read.
dataset_file = "telco_prepared.csv"
full_dataset = load_dataset(dataset_file)
n_rows = len(full_dataset)
print("Le dataset contient", n_rows, "lignes")

Liste des colonnes:

In [None]:
# Show the column labels of the loaded dataset.
column_labels = full_dataset.columns
print(column_labels)

### Définition des types de variables
- Features en entrée par type

In [None]:
# Columns kept for the baseline: the input features plus the 'Churn Label' target.
usefull_columns = ['Gender', 'Senior Citizen',
                   'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
                   'Multiple Lines', 'Internet Service', 'Online Security',
                   'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
                   'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
                   'Monthly Charges', 'Total Charges', 'CLTV', 'Churn Label', 'zip_code_twofirst_digit']
# Take an explicit copy: a later cell assigns into a column of `full_dataset`, and
# doing that on a bare column selection triggers pandas' SettingWithCopyWarning.
full_dataset = full_dataset[usefull_columns].copy()
features = full_dataset.columns
# First-pass typing based on the pandas dtype only: 'object' columns are treated as
# categorical, every other dtype as continuous.
continuous_features = [col for col in features if full_dataset[col].dtype != 'object']
print("Variables continues:", continuous_features)
categorical_features = [col for col in features if full_dataset[col].dtype == 'object']
print("Variables categoriques:", categorical_features)
# Sanity check: print the columns that ended up in neither group (dates or others).
# The difference must be taken FROM the full feature set; the reverse direction is
# empty by construction and can never reveal a forgotten column.
print("Toutes les variables sont prises en compte:",
      set(features).difference(categorical_features, continuous_features))

On réattribue les bons typages

In [None]:
# Manual typing of the input features, overriding the dtype-based first pass.
# The target column is deliberately absent from both lists.
continuous_features = [
    'Tenure Months',
    'Monthly Charges',
    'Total Charges',
    'CLTV',
]
categorical_features = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
    'zip_code_twofirst_digit',
]

Réencodage de la cible

In [None]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
# Any other value is turned into NaN by Series.map, so the printed unique values
# double as a quick check that only 0 and 1 remain.
target_column = "Churn Label"
churn_codes = {"Yes": 1, "No": 0}
encoded_target = full_dataset[target_column].map(churn_codes)
full_dataset[target_column] = encoded_target
print(full_dataset[target_column].unique())


### On encode les variables catégoriques avec du one-hot encoding

In [None]:
# One-hot encode the categorical columns. drop_first=True removes the first level
# of each variable (k-1 dummies for k levels).
full_dataset_encoded = pd.get_dummies(
    data=full_dataset,
    columns=categorical_features,
    drop_first=True,
)

### On divise le dataset en train/test/valid

In [None]:
# Cut the encoded dataset into three parts with the project helper.
# NOTE(review): 0.7 / 0.85 presumably are cumulative cut points (70% / 15% / 15%),
# and the helper presumably returns the parts in (train, test, valid) order -
# confirm against split_dataset.
split_bounds = (0.7, 0.85)
train, test, valid = split_dataset(full_dataset_encoded, *split_bounds)

### Gérons les valeurs manquantes, qui ne sont pas gérées par la régression logistique

- Attention à n'ajuster (fit) l'imputation que sur le train, pour ne pas tricher sur la moyenne

In [None]:
# numpy is imported explicitly: `np` is not imported anywhere else in the visible
# notebook, so this cell otherwise depends on the star import leaking it.
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation of the missing "Total Charges" values.
# The imputer is fitted on the training split only, so the mean used to fill the
# test and validation splits carries no information from those splits.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(train[["Total Charges"]])
# Apply the same fitted imputer to every split, in place.
for split in (train, test, valid):
    split["Total Charges"] = imp.transform(split[["Total Charges"]])


In [None]:
 # fonction custom mais on peut utiliser sklearn
def _split_features_target(frame):
    """Return ``(features, target)`` for *frame*, leaving *frame* itself unmodified."""
    return frame.drop(columns=[target_column]), frame[target_column]


# Separate the target column from the inputs in each split.
train, train_targets = _split_features_target(train)
test, test_targets = _split_features_target(test)
valid, valid_targets = _split_features_target(valid)
print("train:", len(train), "test", len(test), "valid", len(valid))

### Apprentissage de la régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with scikit-learn's default settings,
# trained on the training split only.
final_model = LogisticRegression()
final_model.fit(X=train, y=train_targets)


### Performance du modèle en validation

In [None]:
from sklearn.metrics import auc, classification_report, roc_curve

# Evaluate the fitted model on the validation split.
# The label says "validation" because the data printed is `valid_targets`.
print("Distribution de la cible en validation :", valid_targets.value_counts())
prediction = final_model.predict(valid)
disp = classification_report(valid_targets, prediction)
print("Performance en validation:\n", disp)
# The ROC curve must be built from continuous scores, not from hard 0/1 predictions:
# with hard labels the curve has a single operating point and the AUC is understated.
# Column 1 of predict_proba is the probability of class 1 (classes_ is sorted: 0, 1).
positive_scores = final_model.predict_proba(valid)[:, 1]
fpr, tpr, thresholds = roc_curve(valid_targets, positive_scores, pos_label=1)
print("AUC:", auc(fpr, tpr))

### Performance du modèle sur train
Permet de vérifier l'overfitting (sur-apprentissage)

In [None]:
# Same metrics on the training split, to compare with the held-out figures and
# spot overfitting.
prediction = final_model.predict(train)
disp = classification_report(train_targets, prediction)
print("Performance en train:\n", disp)
# The ROC curve must be built from continuous scores, not from hard 0/1 predictions:
# with hard labels the curve has a single operating point and the AUC is understated.
# Column 1 of predict_proba is the probability of class 1 (classes_ is sorted: 0, 1).
positive_scores = final_model.predict_proba(train)[:, 1]
fpr, tpr, thresholds = roc_curve(train_targets, positive_scores, pos_label=1)
print("AUC:", auc(fpr, tpr))