## Preparation de donnees

In [None]:

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import StandardScaler , OneHotEncoder , LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

data_csv = pd.read_csv('../data/data.csv')

def prepare_data(data):
    data = data.copy()
    data = data.drop(columns=['EmployeeCount','StandardHours','EmployeeNumber','Over18','YearsSinceLastPromotion','TrainingTimesLastYear','PercentSalaryHike','NumCompaniesWorked','MonthlyRate','HourlyRate','DistanceFromHome','DailyRate'])
    return data


## Préprocessing

In [2]:
prepared_data = prepare_data(data_csv)
print(prepared_data.columns)

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'OverTime', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsWithCurrManager'],
      dtype='object')


### Split Data

In [3]:
X = prepared_data.drop(columns=['Attrition'])
y = prepared_data['Attrition']


X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 ,random_state=0)

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

### Normalisation

In [4]:
cols_num = ['Age','JobLevel', 'MonthlyIncome', 'StockOptionLevel', 'TotalWorkingYears',
        'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager']

cols_cat = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'OverTime']

transformer = ColumnTransformer([
    ('num', StandardScaler() , cols_num),
    ('cat', OneHotEncoder(), cols_cat)
])

### Pipeline

In [5]:
def make_pipeline(model):
    return Pipeline(steps=[
    ('transformer', transformer),
    ('smote', SMOTE(random_state=42)),  # SMOTE après transformation !
    ('model', model)
])
 

### Optimisation : Grid Search

In [6]:
params_grids = [{
                'model': RandomForestClassifier(),
                'param_grid':{
                        'model__n_estimators': [100, 200, 500],        # nombre d'arbres dans la forêt
                        'model__max_depth': [None, 10, 20, 30],       # profondeur maximale des arbres
                        'model__min_samples_split': [2, 5, 10],       # nombre minimum d'échantillons pour splitter un noeud
                        'model__min_samples_leaf': [1, 2, 4],
                            }
                } ,
                {
                    'model': LogisticRegression() ,
                    'param_grid':{
                        'model__C': [0.01, 0.1, 1, 10],
                        'model__solver': ['liblinear'],
                        'model__max_iter': [200, 500]
                                }
                }]

### Entrainement et Evaluation

#### matrice de confusion

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve , auc , confusion_matrix

def plot_confusion_matrix(cm):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Prédit")
    plt.ylabel("Réel")
    plt.show()


#### roc_curve

In [None]:


def roc_curv_(model):
    y_proba = model.predict_proba(y_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0,1], [0,1], linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()

In [None]:


target_names = ['No', 'Yes']

for item in params_grids:
    pipe = make_pipeline(item['model'])
    
    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=item['param_grid'],
        cv=5,
        scoring='f1_macro',
        n_jobs=-1
    )
    
    grid_search.fit(X_train , y_train)
    
    best_model = grid_search.best_estimator_
    
    y_predict = best_model.predict(X_test)
    
    model = item['model']
    
    # print(grid_search.best_score_)
    print(f'classification report : \n {classification_report(y_test , y_predict , target_names=target_names)}')



classification report : 
               precision    recall  f1-score   support

          No       0.89      0.93      0.91       245
         Yes       0.54      0.41      0.47        49

    accuracy                           0.84       294
   macro avg       0.71      0.67      0.69       294
weighted avg       0.83      0.84      0.83       294

classification report : 
               precision    recall  f1-score   support

          No       0.92      0.69      0.79       245
         Yes       0.31      0.69      0.43        49

    accuracy                           0.69       294
   macro avg       0.62      0.69      0.61       294
weighted avg       0.82      0.69      0.73       294



#### choix du model "Logistic Regression"

In [17]:
import joblib

model = LogisticRegression()

param_grid={
            'model__C': [0.01, 0.1, 1, 10],
            'model__solver': ['liblinear'],
            'model__max_iter': [200, 500]
            }

pipe = make_pipeline(model)
    
grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=5,
        scoring='f1_macro',
        n_jobs=-1
    )
    
grid_search.fit(X_train , y_train)
    
best_model = grid_search.best_estimator_
    
joblib.dump(best_model , '../models/logistic_regression_model.pkl')




['../models/logistic_regression_model.pkl']