# Consignes

- élaborer deux modèles: l'un à l'aide du package random forest et l'autre à l'aide de XGboost.
    - vous pouvez utiliser un gridsearch si vous savez ce que c'est mais ce n'est pas obligatoire.
- comparer ensuite votre meilleur modèle pour les trois algortihmes étudiés (logistic regression, random forest, Xgboost) à l'aide des courbes ROC et lift
- Tracer la learning curve pour chacuns de ces modèles. 
- étudier l'importance des features de ces trois modèles à l'aide de la librairie SHAP


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score,precision_score, roc_auc_score  

  from pandas import MultiIndex, Int64Index


In [3]:
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

In [4]:
df = pd.read_csv("data/intermediate/Telco_post_analysis.csv")

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df["Churn"]= le.fit_transform(df["Churn"])

In [6]:
df["Churn"] = LabelEncoder().fit_transform(df["Churn"])
for col in df.drop("customerID",axis=1).columns:
    df[col][(df[col]=="No") | (df[col]=="Male") | (df[col]=="DSL") | (df[col]=="Only Internet")] = 0
    df[col][(df[col]=="Yes") | (df[col]=="Female") | (df[col]=="Fiber optic") | (df[col]=="Phone and Internet")] = 1
    df[col][(df[col]=="No phone service") | (df[col]=="No internet service") | (df[col]=="Only Phone")] = -1
    df[col][df[col] == "Month-to-month"] = 1
    df[col][df[col] == "One year"] = 12
    df[col][df[col] == "Two year"] = 24
    df[col][df[col] == "Mailed check"] = 0
    df[col][df[col] == "Electronic check"] = 1
    df[col][df[col] == "Credit card (automatic)"] = 2
    df[col][df[col] == "Bank transfer (automatic)"] = 3
    df[col] = df[col].astype(float)

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,nbr_option_internet,SeniorCitizen,gender_Male,Partner_Yes,Dependents_Yes,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Service_Only Phone,Service_Phone and Internet
0,7590-VHVEG,1,29.85,29.85,0,1,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
1,5575-GNVDE,34,56.95,1889.5,0,2,0,1,0,0,...,0,0,1,0,0,0,0,1,0,1
2,3668-QPYBK,2,53.85,108.15,1,2,0,1,0,0,...,0,0,0,0,1,0,0,1,0,1
3,7795-CFOCW,45,42.3,1840.75,0,3,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,9237-HQITU,2,70.7,151.65,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1


In [None]:
features_standard = df.drop(["customerID","Churn"], axis=1).columns
standard_pipeline = make_pipeline(StandardScaler())

preprocessor = make_column_transformer((standard_pipeline,features_standard))

In [None]:
dt = make_pipeline(preprocessor, GaussianNB(priors=[0.8,0.2]))
rf = make_pipeline(preprocessor, SVC(random_state=1, probability=True, class_weight={0:1, 1:4}, kernel="linear", C= 0.01))
xgb = make_pipeline(preprocessor, SVC(random_state=1, probability=True, class_weight={0:1, 1:4}, kernel="linear", C= 0.01))

In [7]:
X = df.drop(["customerID", "Churn"], axis=1)
y = df.Churn

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

In [9]:
def eval_metrics(actual, pred):
    accuracy = accuracy_score(actual, pred)
    f1 = f1_score(actual, pred)
    recall = recall_score(actual, pred)
    precision = precision_score(actual, pred)
    auc = roc_auc_score(actual, pred)
    return accuracy, f1, recall, precision, auc

In [10]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_decision_tree = {"class_weight":[{0: 1, 1: 1},{0: 1, 1: 3},{0: 1, 1: 1.5}],"max_depth":[None,3,5,10,20,30],
                       "min_samples_split":[2,3,4,5,6,7,8,9],"max_features":["auto","log2"], "criterion": ["gini","entropy"]}
grid_decision = GridSearchCV(DecisionTreeClassifier(random_state=1),param_decision_tree, scoring='roc_auc',cv=5)
grid_decision.fit(X_train, y_train)
grid_decision.best_params_

In [None]:
param_random_forest = {"class_weight":[{0: 1, 1: 1},{0: 1, 1: 3},{0: 1, 1: 1.5}],"n_estimators":[100,150,200], "criterion": ["gini","entropy"],
                       "max_depth":[None,1,10,15,20,30], "min_samples_split":[2,3,4,5,6,7,8,9],"max_features":["auto","log2"]}
grid_random_forest = GridSearchCV(RandomForestClassifier(random_state=1),param_decision_tree,scoring='roc_auc')
grid_random_forest.fit(X_train, y_train)
grid_random_forest.best_params_

In [None]:
mlflow.sklearn.autolog()

with mlflow.start_run(experiment_id = 1):
    dt = DecisionTreeClassifier(class_weight={0: 1, 1: 1},max_depth= 5, max_features= 'auto', min_samples_split= 9,random_state=1)
    model = dt.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    accuracy, f1, recall, precision, auc = eval_metrics(y_test, y_pred)
    print("  accuracy: %s" % accuracy)
    print("  f1: %s" % f1)
    print("  recall: %s" % recall) 
    print("  precision: %s" % precision) 
    print("  auc: %s" % auc) 

    mlflow.log_param("Variables", X_test.columns)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(dt, "model", registered_model_name="decision_tree")
    else:
        mlflow.sklearn.log_model(dt, "model") 

In [1]:
with mlflow.start_run(experiment_id = 1):
    rfc = RandomForestClassifier(class_weight={0:1,1:3},max_depth= 10, max_features= 'auto', min_samples_split= 8,criterion= 'entropy')
    model = rfc.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    accuracy, f1, recall, precision, auc = eval_metrics(y_test, y_pred)
    print("  accuracy: %s" % accuracy)
    print("  f1: %s" % f1)
    print("  recall: %s" % recall) 
    print("  precision: %s" % precision) 
    print("  auc: %s" % auc) 

    mlflow.log_param("Variables", X_test.columns)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(rfc, "model", registered_model_name="random_forest_classifier")
    else:
        mlflow.sklearn.log_model(rfc, "model") 

NameError: name 'mlflow' is not defined

In [22]:
with mlflow.start_run(experiment_id = 1):
    xg = XGBClassifier(use_label_encoder=False,max_depth=4,eval_metrics="auc")
    model = xg.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    accuracy, f1, recall, precision, auc = eval_metrics(y_test, y_pred)
    print("  accuracy: %s" % accuracy)
    print("  f1: %s" % f1)
    print("  recall: %s" % recall) 
    print("  precision: %s" % precision) 
    print("  auc: %s" % auc) 

    mlflow.log_param("Variables", X_test.columns)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(xg, "model", registered_model_name="xgboost")
    else:
        mlflow.sklearn.log_model(xg, "model") 

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "eval_metrics" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


  accuracy: 0.783226723525231
  f1: 0.552129221732746
  recall: 0.5136612021857924
  precision: 0.5968253968253968
  auc: 0.6958315617076898


