## Preparation de donnees

In [1]:
import pandas as pd

data_csv = pd.read_csv('../data/data.csv')

def prepare_data(data):
    data = data.copy()
    data = data.drop(columns=['EmployeeCount','StandardHours','EmployeeNumber','Over18','YearsSinceLastPromotion','TrainingTimesLastYear','PercentSalaryHike','NumCompaniesWorked','MonthlyRate','HourlyRate','DistanceFromHome','DailyRate'])
    return data
    

## Préprocessing

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , OneHotEncoder , LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

prepared_data = prepare_data(data_csv)
print(prepared_data.columns)


Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'OverTime', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsWithCurrManager'],
      dtype='object')


### Split Data

In [3]:
X = prepared_data.drop(columns=['Attrition'])
y = prepared_data['Attrition']


X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 ,random_state=0)

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

### Normalisation

In [4]:
cols_num = ['Age','JobLevel', 'MonthlyIncome', 'StockOptionLevel', 'TotalWorkingYears',
        'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager']

cols_cat = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'OverTime']

transformer = ColumnTransformer([
    ('num', StandardScaler() , cols_num),
    ('cat', OneHotEncoder(), cols_cat)
])


### Pipeline

In [5]:

def make_pipeline(model):
    return Pipeline([
        ('transformer', transformer),
        ('model',model)
    ])
    


### Grid Search

In [6]:
params_grids = [{
                'model': RandomForestClassifier(),
                'param_grid':{
                        'model__n_estimators': [100, 200, 500],        # nombre d'arbres dans la forêt
                        'model__max_depth': [None, 10, 20, 30],       # profondeur maximale des arbres
                        'model__min_samples_split': [2, 5, 10],       # nombre minimum d'échantillons pour splitter un noeud
                        'model__min_samples_leaf': [1, 2, 4],
                            }
                } ,
                {
                    'model': LogisticRegression() ,
                    'param_grid':{
                        'model__C': [0.01, 0.1, 1, 10],
                        'model__solver': ['liblinear'],
                        'model__max_iter': [200, 500]
                                }
                }]
    

### Entrainement Model

In [9]:
target_names = ['No', 'Yes']

for item in params_grids:
    pipe = make_pipeline(item['model'])
    
    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=item['param_grid'],
        cv=5,
        scoring='f1',
        n_jobs=-1
    )
    
    grid_search.fit(X_train , y_train)
    
    best_model = grid_search.best_estimator_
    
    y_predict = best_model.predict(X_test)
    
    
    model = item['model']
    
    # print(grid_search.best_score_)
    print(f'model : {model} \n classification report : \n {classification_report(y_test , y_predict , target_names=target_names)}')
    

model : RandomForestClassifier() 
 classification report : 
               precision    recall  f1-score   support

          No       0.88      0.98      0.93       245
         Yes       0.79      0.31      0.44        49

    accuracy                           0.87       294
   macro avg       0.83      0.64      0.68       294
weighted avg       0.86      0.87      0.85       294

model : LogisticRegression() 
 classification report : 
               precision    recall  f1-score   support

          No       0.88      0.98      0.93       245
         Yes       0.79      0.31      0.44        49

    accuracy                           0.87       294
   macro avg       0.83      0.64      0.68       294
weighted avg       0.86      0.87      0.85       294

