In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split , GridSearchCV

from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Load the raw dataset from the CSV file stored next to the notebook folder.
data_csv = pd.read_csv(filepath_or_buffer='../data/data.csv')

def prepare_data(data):
    """Return a copy of ``data`` without the columns excluded from modelling.

    The frame passed in is not modified; the result is a new object.
    pandas raises ``KeyError`` if any of the listed columns is missing.
    """
    excluded_columns = [
        'EmployeeCount',
        'StandardHours',
        'EmployeeNumber',
        'Over18',
        'YearsSinceLastPromotion',
        'TrainingTimesLastYear',
        'PercentSalaryHike',
        'NumCompaniesWorked',
        'MonthlyRate',
        'HourlyRate',
        'DistanceFromHome',
        'DailyRate',
    ]
    return data.copy().drop(columns=excluded_columns)



# --- Train and evaluate a logistic-regression attrition model with SMOTE ---

# Drop the columns that are not used for modelling and show what is left.
prepared_data = prepare_data(data_csv)
print(prepared_data.columns)

# Separate the features from the 'Attrition' target column.
X = prepared_data.drop(columns=['Attrition'])
y = prepared_data['Attrition']

# Hold out 20% of the rows for the final evaluation.
# NOTE(review): the split is not stratified; consider passing `stratify=y`
# so both parts keep the same class ratio (re-run the cell afterwards, the
# reported metrics will change).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Columns that get standardised.
cols_num = ['Age', 'JobLevel', 'MonthlyIncome', 'StockOptionLevel', 'TotalWorkingYears',
            'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager']

# Columns that get one-hot encoded.
cols_cat = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
            'MaritalStatus', 'OverTime']

# NOTE(review): ColumnTransformer drops every column that is not listed
# (default remainder='drop'). Any feature of X that is in neither list above
# never reaches the model -- confirm that this is intended.
transformer = ColumnTransformer([
    ('num', StandardScaler(), cols_num),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as all zeros instead of raising, so a rare level that only shows up in
    # a validation fold or in the test set cannot break scoring/prediction.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cols_cat),
])

# imblearn's Pipeline is used (not sklearn's) because it supports sampler
# steps; the sampler is applied only while fitting, never when predicting.
pipeline_smote = Pipeline(steps=[
    ('transformer', transformer),
    ('smote', SMOTE(random_state=42)),  # SMOTE runs after the transformation
    ('model', LogisticRegression(max_iter=1000)),
])

# Hyper-parameters explored by the grid search (regularisation strength only).
param_grid = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__solver': ['liblinear'],
}

# 5-fold cross-validated search, selecting the candidate with the best
# macro-averaged F1 score; all CPU cores are used.
grid_search_smote = GridSearchCV(
    pipeline_smote,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search_smote.fit(X_train, y_train)
best_model_smote = grid_search_smote.best_estimator_

# Evaluate the refitted best pipeline on the held-out test set.
y_pred = best_model_smote.predict(X_test)
print(classification_report(y_test, y_pred))


Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'OverTime', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsWithCurrManager'],
      dtype='object')
Fitting 5 folds for each of 4 candidates, totalling 20 fits
              precision    recall  f1-score   support

          No       0.92      0.69      0.79       245
         Yes       0.31      0.69      0.43        49

    accuracy                           0.69       294
   macro avg       0.62      0.69      0.61       294
weighted avg       0.82      0.69      0.73       294

