In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [24]:
# loading the dataset
df = pd.read_csv(r'Employee dataset.csv')

In [25]:
#dropping the id column
df.drop('employee_id',axis = 1,inplace = True)

In [26]:
# Accessing first 5 rows
df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [27]:
#accessing last 5 rows
df.tail()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs,awards_won?,avg_training_score,is_promoted
54803,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0
54807,HR,region_22,Bachelor's,m,other,1,27,1.0,5,0,0,49,0


In [28]:
# finding the null values
df.isna().sum()

department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs                       0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [29]:
df.columns

Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs', 'awards_won?', 'avg_training_score', 'is_promoted'],
      dtype='object')

In [30]:
# filling the null values

df['previous_year_rating'].fillna( df['previous_year_rating'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['previous_year_rating'].fillna( df['previous_year_rating'].median(), inplace=True)


In [31]:
df['education'].fillna( df['education'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['education'].fillna( df['education'].mode()[0], inplace=True)


In [32]:
df.isna().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs                    0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [33]:
#finding the datatypes
df.dtypes

department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs                      int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [34]:
# using the labelEncoder to convert into same datatype
le = LabelEncoder()
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = le.fit_transform(df[i])
print(df)

       department  region  education  gender  recruitment_channel  \
0               7      31          2       0                    2   
1               4      14          0       1                    0   
2               7      10          0       1                    2   
3               7      15          0       1                    0   
4               8      18          0       1                    0   
...           ...     ...        ...     ...                  ...   
54803           8       5          0       1                    2   
54804           4      19          2       0                    0   
54805           0       0          0       1                    0   
54806           7      33          0       1                    2   
54807           2      14          0       1                    0   

       no_of_trainings  age  previous_year_rating  length_of_service  KPIs  \
0                    1   35                   5.0                  8     1   
1              

In [35]:
df.dtypes

department                int32
region                    int32
education                 int32
gender                    int32
recruitment_channel       int32
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs                      int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [36]:
# splitting the data
x = df.drop('is_promoted',axis = 1)
y = df['is_promoted']

In [37]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=70)

In [38]:
x_train

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs,awards_won?,avg_training_score
43527,5,32,2,0,0,1,40,5.0,3,1,0,69
5935,7,14,2,1,2,1,40,1.0,5,0,0,47
4253,7,11,0,0,2,1,49,4.0,17,0,0,47
27610,2,11,0,1,0,1,28,3.0,1,0,0,47
12293,7,2,0,1,2,1,44,3.0,7,1,0,55
...,...,...,...,...,...,...,...,...,...,...,...,...
21563,4,15,0,1,0,2,36,3.0,5,1,0,68
25916,8,16,0,0,2,1,25,1.0,2,0,0,84
44824,8,15,0,1,0,1,29,5.0,2,1,0,82
21618,2,11,2,1,2,2,40,3.0,2,0,0,47


In [39]:
x_test

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs,awards_won?,avg_training_score
49537,7,11,2,1,2,1,57,5.0,13,0,0,50
12688,2,11,2,1,2,1,47,2.0,4,0,0,49
14321,7,24,2,1,2,1,35,5.0,8,0,0,48
51045,1,14,2,1,0,2,40,4.0,3,1,0,61
19325,8,11,0,0,2,1,32,4.0,7,0,0,77
...,...,...,...,...,...,...,...,...,...,...,...,...
53732,4,24,0,1,0,1,42,4.0,16,0,0,59
24484,7,11,0,1,0,1,27,3.0,1,0,0,51
35025,4,13,0,1,0,1,36,5.0,8,1,0,57
32216,5,11,2,1,0,1,47,3.0,13,0,0,72


In [40]:
y_train

43527    1
5935     0
4253     0
27610    0
12293    0
        ..
21563    0
25916    0
44824    0
21618    0
23886    0
Name: is_promoted, Length: 38365, dtype: int64

In [41]:
y_test

49537    0
12688    0
14321    0
51045    0
19325    0
        ..
53732    0
24484    0
35025    0
32216    0
27809    0
Name: is_promoted, Length: 16443, dtype: int64

In [42]:
from sklearn.naive_bayes import GaussianNB
g=GaussianNB()
g.fit(x_train,y_train)
y_pred = g.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print(acc)

0.9113300492610837


In [43]:
confusion_matrix(y_test,y_pred)

array([[14836,   209],
       [ 1249,   149]], dtype=int64)

In [44]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95     15045
           1       0.42      0.11      0.17      1398

    accuracy                           0.91     16443
   macro avg       0.67      0.55      0.56     16443
weighted avg       0.88      0.91      0.89     16443



In [45]:
from xgboost import XGBClassifier
xg  = XGBClassifier()
xg.fit(x_train,y_train)

In [46]:
y_pred = xg.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print(acc)

0.9402177218269172


In [47]:
confusion_matrix(y_test,y_pred)

array([[14963,    82],
       [  901,   497]], dtype=int64)

In [48]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.97     15045
           1       0.86      0.36      0.50      1398

    accuracy                           0.94     16443
   macro avg       0.90      0.68      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [49]:
from sklearn.svm import SVC
s = SVC()
s.fit(x_train,y_train)

In [50]:
y_pred = s.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print(acc)

0.9149790184272943


In [51]:
confusion_matrix(y_test,y_pred)

array([[15045,     0],
       [ 1398,     0]], dtype=int64)

In [52]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.96     15045
           1       0.00      0.00      0.00      1398

    accuracy                           0.91     16443
   macro avg       0.46      0.50      0.48     16443
weighted avg       0.84      0.91      0.87     16443



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
from sklearn.ensemble import VotingClassifier
naive_bayes_classifier = GaussianNB()
svc_classifier = SVC(probability=True)
v = VotingClassifier(estimators=[
                ('naive_bayes', naive_bayes_classifier),
                ('svc', svc_classifier)
            ], voting='soft')
     

In [54]:
v.fit(x_train,y_train)

In [None]:
y_pred = v.predict(x_test)
acc = accuracy_score(y_test,y_pred)
print(acc)

0.9166210545520891


In [None]:
# prediction
abc = [[7,	11,	0	,0	,2,	1,	49,	4.0,	17	,0,	0,	47]]
a = v.predict(abc)
print(a)


[0]




In [None]:
if a == 0:
    print('not promoted')
else:
    print('promoted')

not promoted
