# HR Job Change Problem

In [1]:
import pandas as pd
import os

In [2]:
os.chdir('E:\SkillEnable Class\Python\Datasets')

In [3]:
# Read and access data
job = pd.read_csv('HR Job Change.csv')

In [4]:
job.head()

Unnamed: 0,enrollee_id,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,8949,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,,1,36,1
1,29725,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15.0,Pvt Ltd,>4,47,0
2,11561,0.624,,No relevent experience,Full time course,Graduate,STEM,5.0,,never,83,0
3,33241,0.789,,No relevent experience,,Graduate,Business Degree,0.0,Pvt Ltd,never,52,1
4,666,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,20.0,Funded Startup,4,8,0


In [5]:
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city_development_index  19158 non-null  float64
 2   gender                  14650 non-null  object 
 3   relevent_experience     19158 non-null  object 
 4   enrolled_university     18772 non-null  object 
 5   education_level         18698 non-null  object 
 6   major_discipline        16345 non-null  object 
 7   experience              19093 non-null  float64
 8   company_type            13018 non-null  object 
 9   last_new_job            18735 non-null  object 
 10  training_hours          19158 non-null  int64  
 11  target                  19158 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 1.8+ MB


In [6]:
# Cleaning the data / filling the null values
job.isnull().sum()

enrollee_id                  0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [7]:
# Checking the skewness
job['experience'].skew()

0.33882560477060786

In [8]:
job['experience'].fillna(job['experience'].mean(),inplace = True)

In [9]:
job['gender'].fillna('Male', inplace = True)

In [10]:
job['enrolled_university'].value_counts()

no_enrollment       13817
Full time course     3757
Part time course     1198
Name: enrolled_university, dtype: int64

In [11]:
job['gender'].value_counts()

Male      17729
Female     1238
Other       191
Name: gender, dtype: int64

In [12]:
job['enrolled_university'].fillna('no_enrollment', inplace =True)

In [13]:
job['education_level'].value_counts()

Graduate          11598
Masters            4361
High School        2017
Phd                 414
Primary School      308
Name: education_level, dtype: int64

In [14]:
job['education_level'].fillna('Graduate', inplace =True)

In [15]:
job['company_type'].value_counts()

Pvt Ltd                9817
Funded Startup         1001
Public Sector           955
Early Stage Startup     603
NGO                     521
Other                   121
Name: company_type, dtype: int64

In [16]:
job['company_type'].fillna('Pvt Ltd', inplace =True)

In [17]:
job['last_new_job'].value_counts()

1        8040
>4       3290
2        2900
never    2452
4        1029
3        1024
Name: last_new_job, dtype: int64

In [18]:
job['last_new_job'].fillna('1', inplace =True)

In [19]:
job['major_discipline'].fillna('Unknown', inplace = True)

In [20]:
cat_col = ['gender', 'relevent_experience', 'enrolled_university', 'education_level','major_discipline','company_type', 'last_new_job']

In [21]:
job_dummy = pd.get_dummies(job, columns=cat_col,drop_first=True)

In [22]:
job_dummy

Unnamed: 0,enrollee_id,city_development_index,experience,training_hours,target,gender_Male,gender_Other,relevent_experience_No relevent experience,enrolled_university_Part time course,enrolled_university_no_enrollment,...,company_type_Funded Startup,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
0,8949,0.920,20.0,36,1,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,29725,0.776,15.0,47,0,1,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0
2,11561,0.624,5.0,83,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,33241,0.789,0.0,52,1,1,0,1,0,1,...,0,0,0,0,1,0,0,0,0,1
4,666,0.767,20.0,8,0,1,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,0.878,14.0,42,1,1,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
19154,31398,0.920,14.0,52,1,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
19155,24576,0.920,20.0,44,0,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
19156,5756,0.802,0.0,97,0,1,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0


In [23]:
# Selecting the variables
Y = job_dummy[['target']]
X = job_dummy.drop(columns=['target','enrollee_id'])

In [24]:
# Splitting the data into train and test data sets.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [25]:
len(X_train), len(X_test), len(Y_train), len(Y_test)

(15326, 3832, 15326, 3832)

In [26]:
from sklearn.svm import SVC
model = SVC(random_state = 42)

In [27]:
# Training the model using train data
svc_model = model.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [28]:
Y_test

Unnamed: 0,target
16144,0
13661,0
344,0
1034,0
8109,0
...,...
13955,0
13463,0
18772,0
12809,0


In [29]:
# Predicting the values using test data
Y_test['Prediction'] = svc_model.predict(X_test)

In [30]:
Y_test

Unnamed: 0,target,Prediction
16144,0,0
13661,0,0
344,0,0
1034,0,0
8109,0,0
...,...,...
13955,0,0
13463,0,0
18772,0,0
12809,0,0


In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [32]:
print(confusion_matrix(Y_test['target'], Y_test['Prediction']))

[[2880    0]
 [ 952    0]]


In [33]:
print(accuracy_score(Y_test['target'], Y_test['Prediction']))

0.7515657620041754
