## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import RobustScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Perceptron,SGDClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
from sklearn.model_selection import cross_val_score,KFold,train_test_split,StratifiedKFold
sns.set(color_codes=True)
%matplotlib inline

## Dataset Attributes

* `enrollee_id`: Unique ID for the enrollee
* `city`: City code
* `city_development_index`: Development index of the city (scaled)
* `gender`: Gender of the enrollee
* `relevent_experience`: Relevant experience of the enrollee
* `enrolled_university`: Type of university course enrolled in, if any
* `education_level`: Education level of the enrollee
* `major_discipline`: Education major discipline of the enrollee
* `experience`: Enrollee's total experience in years
* `company_size`: Number of employees in the current employer's company
* `company_type`: Type of current employer
* `last_new_job`: Difference in years between previous job and current job
* `training_hours`: Training hours completed
* `target`: 0 – Not looking for a job change, 1 – Looking for a job change

In [None]:
# Read the training split into `df` and the test split into `test`.
df = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')

**Viewing Training Dataset**

In [None]:
# Show the first rows of the training frame.
df.head()

**Viewing Test Dataset**

In [None]:
# Show the first rows of the test frame.
test.head()

In [None]:
# Print column dtypes and non-null counts for the training frame.
df.info()

## Describing the Data

In [None]:
# Summary statistics of the training frame, transposed so each described column is a row.
df.describe().T

## Shape of Dataset

In [None]:
# (rows, columns) of the training frame and of the test frame.
df.shape,test.shape

## Unique Values

In [None]:
# Report the number of distinct values per training column (NaN counts as one value).
for column in df.columns:
    distinct_count = df[column].nunique(dropna=False)
    print('Unique Values in {} is {}'.format(column, distinct_count))

## Looking at Missing Values

In [None]:
# Number of missing values in each training column.
df.isnull().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

**We will handle the missing values later, during preprocessing.**

## Removing Duplicate values

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# Shape of the training frame after duplicate removal.
df.shape

There are no rows with identical values, so no duplicates were removed.

**Analysing Target on the Basis of Gender**

In [None]:
# Count each target value within every gender group
# (rows whose gender is missing are left out by groupby's default dropna behaviour).
df.groupby('gender')['target'].value_counts()

There are more males who are not looking for a job change than any other group.

# Data Visualization

In [None]:
# Pie chart of the share of rows per target value.
px.pie(data_frame=df,names = 'target',title='Visualising Targets')

In [None]:
# Bar chart of the number of rows per target value.
sns.countplot(x='target', data=df)

**From the two plots above we can see that the data is imbalanced: approximately 75% of the employees are not looking for a job change.**

This problem can be addressed using over-sampling or under-sampling.
I highly recommend reading this blog post.

[Sampling](https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/)

**Gender**

In [None]:
# Row counts per gender, split by target value.
px.histogram(df, x='gender', color='target')

The data is heavily skewed towards male enrollees compared with the other genders. The plot also shows that most of the males are **Not Looking for Job Change**.

**Relevent Experience**

In [None]:
# Row counts per relevant-experience category, split by target value.
px.histogram(df, x='relevent_experience', color='target')

In [None]:
# Pie chart of the share of rows per relevant-experience category.
# The wedge labels are taken from the value_counts index so every label is
# guaranteed to match its wedge, whatever order the counts come out in
# (hard-coded labels would be silently swapped if the ordering changed).
relevant_experience_counts = df.relevent_experience.value_counts()
plt.pie(x=relevant_experience_counts, labels=relevant_experience_counts.index,
        shadow=True, autopct='%1.1f%%');

72% of the employees have relevant experience, and around 11k employees are not looking for a job change.

**Enrolled University**

In [None]:
# Row counts per enrolled-university category.
px.histogram(df, x='enrolled_university', color_discrete_sequence=['indianred'])

**Education Level**

In [None]:
# Row counts per education level, split by relevant-experience category.
px.histogram(df, x='education_level', color='relevent_experience')

Graduates with relevant experience form the largest group.
This suggests that the company gives fewer chances to freshers.

In [None]:
# Row counts per experience value, split by target value.
px.histogram(df, x='experience', color='target')

In [None]:
# Row counts per company type.
px.histogram(df, x='company_type')

In [None]:
# Row counts per company-size bucket.
px.histogram(df, x='company_size')

In [None]:
# Row counts per major discipline.
px.histogram(df, x='major_discipline')

**Training Hour Distribution**

In [None]:
# Distribution of training hours: density-normalised histogram with a KDE curve.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement for this kind of plot.
sns.histplot(df.training_hours, kde=True, stat='density');

In [None]:
# Horizontal box plot of training hours.
px.box(df, x='training_hours', orientation='h')

The training-hours column is right-skewed and has **outliers**.

## Feature Engineering

In [None]:
# Single LabelEncoder instance that the encoding cells below refit for each column.
lb = LabelEncoder()

In [None]:
def _city_code(label):
    """Return the integer that follows the first underscore in a city label."""
    return int(label.split('_')[1])


# Replace the textual city labels with their numeric code in both frames.
df.city = df.city.map(_city_code)
test.city = test.city.map(_city_code)

In [None]:
# Number of training rows with a missing gender.
df.gender.isnull().sum()

In [None]:
# Replace missing gender values in the training frame with the literal 'Male'.
# NOTE(review): presumably chosen because it is the most frequent category — confirm.
df.gender = df.gender.fillna('Male')

In [None]:
# Missing values per training column after filling gender.
df.isnull().sum()

In [None]:
# Impute the remaining categorical gaps in the training frame.
#  - mode fill: missing values take the column's most frequent value;
#  - ffill / bfill: missing values copy the previous / next row's value.
# Series.ffill() and Series.bfill() replace the deprecated
# fillna(method='ffill' / 'bfill') spelling and behave the same.
# A gap at the very first row (ffill) or very last row (bfill) stays NaN here.
df.enrolled_university = df.enrolled_university.fillna(df.enrolled_university.mode()[0])
df.education_level = df.education_level.ffill()
df.major_discipline = df.major_discipline.fillna(df.major_discipline.mode()[0])
df.experience = df.experience.ffill()
df.company_size = df.company_size.ffill()
df.company_type = df.company_type.fillna(df.company_type.mode()[0])
df.last_new_job = df.last_new_job.bfill()

In [None]:
# Discard any training rows that still contain a missing value.
df = df.dropna(axis=0)

In [None]:
# Impute missing values in the test frame with the same strategy per column as
# used for the training frame (constant / mode / neighbouring-row fill).
# Test rows cannot be dropped, so every directional fill is followed by a fill
# in the opposite direction: this closes a gap at the first row (which ffill
# cannot reach) or at the last row (which bfill cannot reach).
# Series.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
test.gender = test.gender.fillna('Male')
test.enrolled_university = test.enrolled_university.fillna(test.enrolled_university.mode()[0])
test.education_level = test.education_level.ffill().bfill()
test.major_discipline = test.major_discipline.fillna(test.major_discipline.mode()[0])
test.experience = test.experience.ffill().bfill()
test.company_size = test.company_size.ffill().bfill()
test.company_type = test.company_type.fillna(test.company_type.mode()[0])
test.last_new_job = test.last_new_job.bfill().ffill()

In [None]:
# Missing values per training column after imputation and row removal.
df.isnull().sum()

In [None]:
# Missing values per test column after imputation.
test.isnull().sum()

Therefore, there are now no missing values in the training and test datasets.

In [None]:
# Encode gender as integers. The encoder is fitted on the training column only
# and that same mapping is applied to the test column, so a category always
# gets the same code in both frames. (Refitting on the test column could
# renumber the categories; transform raises ValueError on a category that
# was not present in training instead of silently re-numbering.)
df.gender = lb.fit_transform(df['gender'])
test.gender = lb.transform(test['gender'])

In [None]:
# Encode relevent_experience as integers. Fit on the training column, then
# reuse that mapping for the test column so both frames share one encoding
# (transform raises ValueError on a category unseen in training).
df.relevent_experience = lb.fit_transform(df['relevent_experience'])
test.relevent_experience = lb.transform(test['relevent_experience'])

In [None]:
# Encode enrolled_university as integers. Fit on the training column, then
# reuse that mapping for the test column so both frames share one encoding
# (transform raises ValueError on a category unseen in training).
df.enrolled_university = lb.fit_transform(df['enrolled_university'])
test.enrolled_university = lb.transform(test['enrolled_university'])

In [None]:
# Preview the training frame after label encoding.
df.head()

**To avoid the dummy variable trap we will use ```pd.get_dummies``` with ```drop_first=True```**

### What is Dummy Variable Trap?

The Dummy variable trap is a scenario where there are attributes which are highly correlated (Multicollinear) and one variable predicts the value of others. When we use one hot encoding for handling the categorical data, then one dummy variable (attribute) can be predicted with the help of other dummy variables. Hence, one dummy variable is highly correlated with other dummy variables. Using all dummy variables in models lead to dummy variable trap. So, the models should be designed excluding one dummy variable.

**For Example –**

Let’s consider the case of gender having two values male (0 or 1) and female (1 or 0). Including both the dummy variable can cause redundancy because if a person is not male in such case that person is a female, hence, we don’t need to use both the variables in models. This will protect us from dummy variable trap.

In [None]:
# One-hot encode the nominal columns of the training frame; drop_first removes
# the first dummy of every column so the remaining dummies are not redundant.
train_nominal_columns = [
    'gender',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'company_type',
]
df = pd.get_dummies(df, columns=train_nominal_columns, drop_first=True)

In [None]:
# One-hot encode the same nominal columns in the test frame.
# get_dummies only creates columns for categories that occur in the frame it is
# given, and drop_first removes whichever category sorts first in *that* frame,
# so encoding `test` independently can yield a different column set or order
# than the training frame. Encode every category here and then align to the
# training feature columns (all training columns except the label): same
# columns, same order, 0 for dummies the test data did not produce, and
# columns unknown to the training frame discarded.
test = pd.get_dummies(test,columns=['gender','enrolled_university','education_level','major_discipline','company_type'],drop_first=False)
test = test.reindex(columns=df.columns.drop('target'), fill_value=0)

In [None]:
# Ordinal encoding for years of experience: '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21.
experience = {str(years): years for years in range(1, 21)}
experience['<1'] = 0
experience['>20'] = 21

test.experience = test.experience.map(experience)
df.experience = df.experience.map(experience)

In [None]:
# Ordinal encoding for company size: each bucket maps to its position in this ordered list.
company_size_buckets = ['<10', '10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+']
company_size = {bucket: rank for rank, bucket in enumerate(company_size_buckets)}

test.company_size = test.company_size.map(company_size)
df.company_size = df.company_size.map(company_size)

In [None]:
# Ordinal encoding for last_new_job: each label maps to its position in this ordered list.
last_new_job_labels = ['never', '1', '2', '3', '4', '>4']
last_new_job = {label: rank for rank, label in enumerate(last_new_job_labels)}

test.last_new_job = test.last_new_job.map(last_new_job)
df.last_new_job = df.last_new_job.map(last_new_job)

## Splitting The DataSet

In [None]:
# Separate the label from the features, then hold out 20% of the rows,
# stratified on the label so both parts keep the same class ratio.
Y = df['target']
X = df.drop(columns='target')
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [None]:
# Shapes of the training/hold-out feature matrices and label vectors.
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

## Scaling The Data

**Scaling**

1. StandardScaler standardizes each feature by removing its mean and scaling it to unit variance (mean = 0, standard deviation = 1). It does not make the data normally distributed.

2. MinMaxScaler rescales each feature to a fixed range: [0, 1] by default, or another range such as [-1, 1] if requested through `feature_range`. When large outliers are present, this scaling compresses all the inliers into a very narrow range (for example [0, 0.005]).
StandardScaler is affected by outliers as well: they influence the empirical mean and standard deviation, so it cannot guarantee balanced feature scales, and the range of the inlier values shrinks.

3. RobustScaler centers and scales with the median and the interquartile range, so outliers have little influence on the scaling statistics (the outliers themselves stay in the data). It can be followed by StandardScaler or MinMaxScaler, as is done below.

***How RobustScaler works:***

```
sklearn.preprocessing.RobustScaler(
with_centering=True,
with_scaling=True,
quantile_range=(25.0, 75.0),
copy=True,
)
```

It scales features using statistics that are robust to outliers. This method removes the median and scales the data according to the range between the 1st quartile and the 3rd quartile, i.e. between the 25th and the 75th percentile. This range is also called the interquartile range (IQR).

The median and the interquartile range are then stored so that it could be used upon future data using the transform method. If outliers are present in the dataset, then the median and the interquartile range provide better results and outperform the sample mean and variance.




In [None]:
# Median/IQR scaling: the statistics are learned from the training part only
# and then applied to both the training and the hold-out features.
rs = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
rs.fit(X_train)
X_train = rs.transform(X_train)
X_test = rs.transform(X_test)

In [None]:
# Mean/variance standardisation on top of the robust scaling, again learned
# from the training part only and applied to both parts.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Push the competition test features through both fitted scalers, in fitting order.
test = sc.transform(rs.transform(test))

## Model Building

In [None]:
# Random forest baseline. random_state is fixed so a re-run reproduces the score.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train,Y_train)
pred_rf = rf.predict(X_test)  # hard 0/1 labels
# ROC AUC measures how well the model *ranks* examples, so it must be given a
# continuous score (probability of class 1); hard labels collapse the curve
# to a single operating point and understate the AUC.
proba_rf = rf.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test,proba_rf)

In [None]:
# Linear model trained with SGD. random_state is fixed so a re-run reproduces the score.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train,Y_train)
pred_sgd = sgd.predict(X_test)  # hard 0/1 labels
# ROC AUC needs a continuous ranking score. With the default hinge loss the
# classifier has no predict_proba, so the signed distance to the decision
# boundary is used as the score.
score_sgd = sgd.decision_function(X_test)
roc_auc_score(Y_test,score_sgd)

In [None]:
# Gradient-boosted trees (XGBoost); scale_pos_weight up-weights the positive class.
xg = XGBClassifier(learning_rate =0.1, n_estimators=494, max_depth=5,subsample = 0.70, verbosity = 0,
                                            scale_pos_weight = 2.5,updater ="grow_histmaker",base_score  = 0.2)
xg.fit(X_train,Y_train)
pred_xg = xg.predict(X_test)  # hard 0/1 labels
# ROC AUC must be computed from a continuous score (probability of class 1),
# not from hard labels, which reduce the curve to a single operating point.
proba_xg = xg.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test,proba_xg)

In [None]:
# Gradient-boosted trees (LightGBM, DART boosting); scale_pos_weight up-weights the positive class.
lgb = LGBMClassifier(subsample_freq = 2, objective ="binary",importance_type = "gain",
                                             verbosity = -1, max_bin = 60,num_leaves = 300,
                                             boosting_type = 'dart',learning_rate=0.15, 
                                             n_estimators=494, max_depth=5, scale_pos_weight=2.5)
lgb.fit(X_train,Y_train)
pred_lgb = lgb.predict(X_test)  # hard 0/1 labels
# ROC AUC must be computed from a continuous score (probability of class 1),
# not from hard labels, which reduce the curve to a single operating point.
proba_lgb = lgb.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test,proba_lgb)

In [None]:
# Gradient-boosted trees (CatBoost); scale_pos_weight up-weights the positive class.
# NOTE(review): subsample=0.085 uses only 8.5% of the rows per tree — confirm it is not a typo for 0.85.
cat = CatBoostClassifier(learning_rate=0.15, n_estimators=494, subsample=0.085, 
                                                 max_depth=5, scale_pos_weight=2.5)
cat.fit(X_train,Y_train)
pred_cat = cat.predict(X_test)  # hard 0/1 labels
# ROC AUC must be computed from a continuous score (probability of class 1),
# not from hard labels, which reduce the curve to a single operating point.
proba_cat = cat.predict_proba(X_test)[:, 1]
roc_auc_score(Y_test,proba_cat)

In [None]:
# predict_proba returns one column per class; keep only the probability of
# class 1 so that test_final holds a single value per test row and can be
# assigned to a single submission column.
test_final = cat.predict_proba(test)[:, 1]

In [None]:
# Read the sample submission file and show its first row to see the expected layout.
sample = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv')
sample.head(1)

In [None]:
# Write the test predictions into the submission's target column.
# NOTE(review): assumes test_final holds exactly one value per row of `sample`,
# in the same row order as the test file — confirm.
sample['target'] = test_final

## Submission

In [None]:
# Save the submission as submit.csv without the DataFrame index column.
sample.to_csv('submit.csv',index= False)

# If You Like This Kernel Then Upvote😊

## Drop your suggestions in the comment box Related to Roc Score or EDA