In [1]:
import numpy as np
import pandas as pd

# **Reading the files**

In [2]:
# Load the two provided splits; both paths are relative to the notebook's working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
# Stack the two splits row-wise into one frame with a fresh 0..n-1 index,
# so exploration and preprocessing below are applied to all rows at once.
df = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# **Exploring the dataframe**

In [4]:
# Preview the first five rows of the combined frame.
df.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


**Information about DataFrame**

In [5]:
# One-shot summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74498 entries, 0 to 74497
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               74498 non-null  int64 
 1   Age                       74498 non-null  int64 
 2   Gender                    74498 non-null  object
 3   Years at Company          74498 non-null  int64 
 4   Job Role                  74498 non-null  object
 5   Monthly Income            74498 non-null  int64 
 6   Work-Life Balance         74498 non-null  object
 7   Job Satisfaction          74498 non-null  object
 8   Performance Rating        74498 non-null  object
 9   Number of Promotions      74498 non-null  int64 
 10  Overtime                  74498 non-null  object
 11  Distance from Home        74498 non-null  int64 
 12  Education Level           74498 non-null  object
 13  Marital Status            74498 non-null  object
 14  Number of Dependents  

**Descriptive statistics of DataFrame**

In [6]:
# Summary statistics for the numeric columns only (describe() skips object columns by default).
df.describe()

Unnamed: 0,Employee ID,Age,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Company Tenure
count,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0
mean,37249.5,38.529746,15.721603,7299.379514,0.832935,49.991584,1.650326,55.727456
std,21505.864514,12.083456,11.223744,2152.508566,0.995289,28.513611,1.553633,25.399349
min,1.0,18.0,1.0,1226.0,0.0,1.0,0.0,2.0
25%,18625.25,28.0,7.0,5652.0,0.0,25.0,0.0,36.0
50%,37249.5,39.0,13.0,7348.0,1.0,50.0,1.0,56.0
75%,55873.75,49.0,23.0,8876.0,2.0,75.0,3.0,76.0
max,74498.0,59.0,51.0,16149.0,4.0,99.0,6.0,128.0


**Shape of DataFrame**

In [7]:
# (rows, columns) of the combined frame.
df.shape

(74498, 24)

**Columns in DataFrame**

In [8]:
# Full list of column labels (the head() preview elides the middle columns).
df.columns

Index(['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role',
       'Monthly Income', 'Work-Life Balance', 'Job Satisfaction',
       'Performance Rating', 'Number of Promotions', 'Overtime',
       'Distance from Home', 'Education Level', 'Marital Status',
       'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure',
       'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities',
       'Company Reputation', 'Employee Recognition', 'Attrition'],
      dtype='object')

**Data types of columns**

In [9]:
# Per-column dtype; the `object` columns hold the string-valued categories.
df.dtypes

Employee ID                  int64
Age                          int64
Gender                      object
Years at Company             int64
Job Role                    object
Monthly Income               int64
Work-Life Balance           object
Job Satisfaction            object
Performance Rating          object
Number of Promotions         int64
Overtime                    object
Distance from Home           int64
Education Level             object
Marital Status              object
Number of Dependents         int64
Job Level                   object
Company Size                object
Company Tenure               int64
Remote Work                 object
Leadership Opportunities    object
Innovation Opportunities    object
Company Reputation          object
Employee Recognition        object
Attrition                   object
dtype: object

**Number of null values in each column**

In [10]:
# Count of missing values in each column.
df.isnull().sum()

Employee ID                 0
Age                         0
Gender                      0
Years at Company            0
Job Role                    0
Monthly Income              0
Work-Life Balance           0
Job Satisfaction            0
Performance Rating          0
Number of Promotions        0
Overtime                    0
Distance from Home          0
Education Level             0
Marital Status              0
Number of Dependents        0
Job Level                   0
Company Size                0
Company Tenure              0
Remote Work                 0
Leadership Opportunities    0
Innovation Opportunities    0
Company Reputation          0
Employee Recognition        0
Attrition                   0
dtype: int64

**Number of unique values in each column**

In [11]:
# Number of distinct values per column; separates low-cardinality categoricals
# from identifier-like or continuous columns.
df.nunique()

Employee ID                 74498
Age                            42
Gender                          2
Years at Company               51
Job Role                        5
Monthly Income               9842
Work-Life Balance               4
Job Satisfaction                4
Performance Rating              4
Number of Promotions            5
Overtime                        2
Distance from Home             99
Education Level                 5
Marital Status                  3
Number of Dependents            7
Job Level                       3
Company Size                    3
Company Tenure                127
Remote Work                     2
Leadership Opportunities        2
Innovation Opportunities        2
Company Reputation              4
Employee Recognition            4
Attrition                       2
dtype: int64

**Number of duplicate rows**

In [12]:
# 'Employee ID' takes a different value on every row (74498 unique values over
# 74498 rows), so a full-row comparison can never flag a duplicate. Compare the
# remaining columns instead to detect repeated employee records.
df.drop(columns=["Employee ID"]).duplicated().sum()

0

# **Data Preprocessing for Machine Learning**

In [13]:
# Separate predictors from the label. 'Employee ID' is a row identifier and is
# dropped together with the target column; y keeps the raw 'Attrition' labels.
X = df.drop(columns=["Employee ID", "Attrition"])
y = df["Attrition"]

**Handling Categorical Features in X**

In [14]:
def encode_categorical_columns(df):
    """One-hot encode the categorical columns of ``df``.

    Delegates to ``pd.get_dummies`` with ``drop_first=True``, so the first
    level of every encoded column is dropped. Columns that ``get_dummies``
    does not treat as categorical are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the pass-through columns and the indicator columns.
    """
    return pd.get_dummies(df, drop_first=True)

# Replace the string-valued feature columns in X with one-hot indicator columns.
X = encode_categorical_columns(X)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier, StackingClassifier

In [16]:
def apply_models(X, y):
    """Train and evaluate a set of baseline classifiers on a hold-out split.

    Steps:
      1. Label-encode ``y``.
      2. Split into 80% train / 20% test (``random_state=42``).
      3. Oversample the training portion with SMOTE (``random_state=42``).
      4. Standardise features with a scaler fitted on the resampled training
         data only, then apply it to the test data.
      5. Fit each model and print its accuracy, confusion matrix and
         classification report, using the original label names.

    Parameters
    ----------
    X : array-like
        Feature matrix; must already be numeric (categoricals encoded).
    y : array-like
        Target labels, one per row of ``X``.

    Returns
    -------
    dict
        ``{model_name: (accuracy, weighted_f1)}`` for every fitted model.
    """
    le = LabelEncoder()
    y = le.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Oversample the training split only, so the test split keeps its
    # original class distribution.
    # NOTE(review): SMOTE runs here on the *unscaled* features; since it builds
    # synthetic rows from nearest neighbours, large-range columns presumably
    # dominate the neighbour search. Consider scaling before resampling --
    # not changed here because it would alter the reported metrics.
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Fit the scaler on training data only to avoid leaking test statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Tree-based models are seeded so repeated runs give the same metrics.
    models = {
        'LogisticRegression': OneVsRestClassifier(LogisticRegression()),
        'DecisionTree': OneVsRestClassifier(DecisionTreeClassifier(random_state=42)),
        'RandomForest': OneVsRestClassifier(RandomForestClassifier(random_state=42)),
        'KNeighbors': OneVsRestClassifier(KNeighborsClassifier()),
        "SVC": OneVsRestClassifier(SVC())
    }

    # The test labels do not change between models; decode them once.
    y_test_orig = le.inverse_transform(y_test)
    model_performance = {}

    for model_name, model in models.items():
        print(f"\nClassification with {model_name}:\n{'-' * 30}")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Report with the original string labels rather than encoded integers.
        y_pred_orig = le.inverse_transform(y_pred)

        accuracy = accuracy_score(y_test_orig, y_pred_orig)
        f1 = f1_score(y_test_orig, y_pred_orig, average='weighted')
        model_performance[model_name] = (accuracy, f1)

        print("\nAccuracy\n", accuracy)
        print("\nConfusion Matrix\n", confusion_matrix(y_test_orig, y_pred_orig))
        print("\nClassification Report\n", classification_report(y_test_orig, y_pred_orig))

    return model_performance

In [17]:
# Fit every baseline model on the encoded features and print its evaluation metrics.
apply_models(X, y)


[1mClassification with LogisticRegression:[0m
------------------------------
\Accuracy
 0.752751677852349

Confusion Matrix
 [[5353 1743]
 [1941 5863]]

Classification Report
               precision    recall  f1-score   support

        Left       0.73      0.75      0.74      7096
      Stayed       0.77      0.75      0.76      7804

    accuracy                           0.75     14900
   macro avg       0.75      0.75      0.75     14900
weighted avg       0.75      0.75      0.75     14900


[1mClassification with DecisionTree:[0m
------------------------------
\Accuracy
 0.6731543624161074

Confusion Matrix
 [[4709 2387]
 [2483 5321]]

Classification Report
               precision    recall  f1-score   support

        Left       0.65      0.66      0.66      7096
      Stayed       0.69      0.68      0.69      7804

    accuracy                           0.67     14900
   macro avg       0.67      0.67      0.67     14900
weighted avg       0.67      0.67      0.67     