Features

● enrollee_id : Unique ID for candidate

● city: City code

● city_development_index : Development index of the city (scaled)

● gender: Gender of candidate

● relevent_experience: Relevant experience of candidate

● enrolled_university: Type of University course enrolled if any

● education_level: Education level of candidate

● major_discipline :Education major discipline of candidate

● experience: Candidate total experience in years

● company_size: No of employees in current employer's company

● company_type : Type of current employer

● last_new_job: Difference in years between previous job and current job

● training_hours: training hours completed

● target: 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns


In [None]:
data = pd.read_csv('/content/drive/MyDrive/DS_Job_Change_Data.csv')

In [None]:
data.head()

In [None]:
df=data.copy()

In [None]:
# rename the columns

df=df.rename(columns= {'enrollee_id':'id','relevent_experience':'rel_exp', 'enrolled_university':'enr_univ','education_level':'ed_lev', 'major_discipline':'maj_dis','experience':'exp','company_size':'csize','company_type':'ctype','last_new_job':'lnjob','training_hours':'train_hour','city_development_index':'index'})



In [None]:
df.columns

# Inspecting and cleaning the data

In [None]:
df.info() # most of the data object

In [None]:
def inspection(dataframe):
    """Print a data-quality report for ``dataframe`` and plot its missing-value map.

    Prints the column dtypes, the number of rows that contain at least one
    missing value and the number of missing values per column, then draws a
    seaborn heatmap of the null mask.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect. Every statistic is derived from this argument, not
        from the notebook-level ``df``, so the report is correct for any frame.
    """
    # Compute the null mask once; all missing-value statistics reuse it.
    null_mask = dataframe.isnull()

    print('Tipe of variables we are working with')
    print(dataframe.dtypes)

    print('total samples with missing values:')
    print(null_mask.any(axis=1).sum())

    print('Total missing values per Variables')
    print(null_mask.sum())

    print('Map of missing values')
    sns.heatmap(null_mask)


In [None]:
inspection(df)

In [None]:
# There are a lot of missing values.
# Drop small numbers of missing values in the columns: 'enr_univ', 'ed_lev', 'exp', 'lnjob'
df.dropna(subset=['enr_univ', 'ed_lev', 'exp', 'lnjob'], inplace=True)

In [None]:
# Impute the remaining missing values with each column's mode (most frequent value).
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): the in-place form operates on an
# intermediate Series (chained assignment), which is deprecated in pandas 2.x
# and does not update df when Copy-on-Write is enabled.
for col in ['gender', 'maj_dis', 'ctype', 'csize']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
# df still contains object (string) columns at this point; pandas >= 2.0 raises
# on them instead of silently dropping them, so restrict to numeric columns.
df.corr(numeric_only=True)

In [None]:
df.gender.unique()

# Encoding categorical variables

In [None]:
from sklearn.preprocessing import OrdinalEncoder


In [None]:
def encode_categories(df,variables):
    """Add an ordinal-encoded ``<name>_code`` column to ``df`` for every name in ``variables``.

    ``df`` is modified in place. The same ``OrdinalEncoder`` instance is refit
    on each column in turn, and the distinct codes produced for each column
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; receives the new code columns.
    variables : iterable of str
        Names of the columns to encode.
    """
    encoder = OrdinalEncoder()
    for column in variables:
        code_column = column + '_code'
        # Double brackets select a one-column frame, i.e. 2-D input for the encoder.
        df[code_column] = encoder.fit_transform(df[[column]])
        print('The encoded values for ', column, ' are:', sep='')
        print(df[code_column].unique())


In [None]:
encode_categories(df,['city', 'rel_exp','enr_univ', 'ed_lev',
       'maj_dis', 'exp','ctype', 'lnjob','gender'])

In [None]:
df.head()

In [None]:
df.columns

# Visualization


In [None]:
def plot_scatterplot (df, col_to_exclude, class_col):
    """Draw one scatter plot of ``class_col`` against each numeric feature of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col_to_exclude : list of str
        Numeric columns that should not be plotted.
    class_col : str
        Column drawn on the y axis of every plot; it is not plotted against itself.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = numeric_cols.difference(col_to_exclude).difference([class_col])
    for feature in feature_cols:
        # One independent FacetGrid (figure) per feature.
        grid = sns.FacetGrid(df)
        grid.map(sns.scatterplot, feature, class_col)


In [None]:
plot_scatterplot(df,['id'],'target')

In [None]:
def full_diagnostic (df, class_col, col_to_exclude):
    """Draw a seaborn pair plot of the numeric features, coloured by ``class_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    class_col : str
        Column used as the ``hue`` of the pair plot.
    col_to_exclude : str or list of str
        Numeric column(s) to leave out of the plot. A single name is accepted
        as before; a list of names is accepted as well.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns
    excluded = [col_to_exclude] if isinstance(col_to_exclude, str) else list(col_to_exclude)
    feature_cols = numeric_cols.difference(excluded + [class_col]).tolist()
    # Plot only the selected features (plus the hue column). Passing the whole
    # frame would ignore the exclusions computed above.
    sns.pairplot(df[feature_cols + [class_col]], hue=class_col)

In [None]:
# full_diagnostic(df,class_col='target',col_to_exclude='id')

# Logistic regression

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


In [None]:
def logistic_regression (df, class_col, col_to_exclude):
    """Fit a statsmodels logit of ``class_col`` on the standardised numeric features and print its summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only its numeric columns are considered as predictors.
    class_col : str
        Name of the binary target column.
    col_to_exclude : list of str
        Numeric columns to leave out of the model.

    Notes
    -----
    Predictors are standardised, so each coefficient is the change in the
    log-odds per one standard deviation of that predictor.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = numeric_cols.difference([class_col]).difference(col_to_exclude)
    X = df[feature_cols]

    # Standardise the predictors and keep them in a DataFrame (same index and
    # column names) so the summary table stays labelled and rows align with y.
    scaler = preprocessing.StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    # statsmodels does not add an intercept by itself. Without one, centred
    # predictors would force a predicted probability of 0.5 at the mean.
    X_scaled = sm.add_constant(X_scaled)

    y = df[class_col]
    # Fit on the scaled design matrix (the scaled data used to be computed and then discarded).
    logit_model = sm.Logit(y, X_scaled)
    result = logit_model.fit()
    print(result.summary2())


In [None]:

logistic_regression(df,class_col='target',col_to_exclude=['id','train_hour','city_code'])

In [None]:
# train_hour and city_code do not appear to be significant, so we can remove them.

In [None]:
# Each coefficient is the change in the log-odds of the target (looking for a job change) for a one-unit increase in that variable.

In [None]:
import math

In [None]:
# exp(coefficient) is the odds ratio. exp(-0.1195) is about 0.89, so a one-unit
# increase in that variable multiplies the odds of target=1 by ~0.89 (roughly an
# 11% decrease), it does not increase them.
# NOTE(review): the variable this coefficient belongs to is not shown here, and
# "age" is not a column of this dataset -- confirm against the model summary.
math.exp(-0.1195) # odds ratio for a coefficient of -0.1195

# Run the ML Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn import config_context


In [None]:
def prepare_model(df, class_col, col_to_exclude):
    """Split the numeric features of ``df`` into train and test sets (70/30).

    The result is not returned. It is stored in the module-level globals
    ``X_train``, ``X_test``, ``y_train`` and ``y_test`` so later cells can use it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only numeric columns are used as features.
    class_col : str
        Name of the target column (removed from the features).
    col_to_exclude : list of str
        Further columns to remove from the features.
    """
    global X_train, X_test, y_train, y_test

    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = numeric_cols.difference([class_col]).difference(col_to_exclude)
    features = df[feature_cols]
    labels = df[class_col]

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=0)



In [None]:
def run_model(X_train,X_test,y_train,y_test):
    """Fit a logistic regression, then print the classification report and the ROC AUC.

    The fitted estimator and the test-set predictions are stored in the
    module-level globals ``logreg`` and ``y_pred`` for use by later cells.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the report.
    """
    global logreg, y_pred

    logreg = LogisticRegression(random_state=13)
    logreg.fit(X_train, y_train)

    # Hard class predictions feed the classification report.
    y_pred = logreg.predict(X_test)

    # ROC AUC is a ranking metric, so it needs the predicted probability of the
    # positive class. Hard 0/1 labels collapse the curve to a single point.
    y_score = logreg.predict_proba(X_test)[:, 1]
    logit_roc_auc = roc_auc_score(y_test, y_score)

    print(classification_report(y_test, y_pred))
    print('The area unedr the curve is: %0.2f'%logit_roc_auc)



In [None]:

prepare_model(df,class_col='target',col_to_exclude=['id','train_hour','city_code'])

#prepare_model(df,class_col='target',col_to_exclude=[])


In [None]:
run_model(X_train,X_test,y_train,y_test)

In [None]:
from sklearn.metrics import confusion_matrix as sklearn_confusion_matrix

def print_confusion_matrix(y_test, y_pred):
    """Print the confusion matrix followed by its TN, TP, FN and FP counts.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels.
    """
    matrix = sklearn_confusion_matrix(y_test, y_pred)
    print(matrix)

    # The flattened matrix is unpacked into exactly four cells, so this expects
    # a two-class problem.
    true_neg, false_pos, false_neg, true_pos = matrix.ravel()
    report_lines = (
        ('TN:%0.2f', true_neg),
        ('TP:%0.2f', true_pos),
        ('FN:%0.2f', false_neg),
        ('FP:%0.2f', false_pos),
    )
    for template, count in report_lines:
        print(template % count)

print_confusion_matrix(y_test, y_pred)


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

def plot_roc_curve(logreg, X_test, y_test):
    """Plot the ROC curve of a fitted classifier on the test set.

    Parameters
    ----------
    logreg
        Fitted classifier exposing ``predict_proba``.
    X_test
        Test features.
    y_test
        True test labels.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = logreg.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    auc_value = roc_auc_score(y_test, positive_scores)

    plt.figure()
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # Dashed diagonal as a visual baseline.
    plt.plot([0, 1], [0, 1], 'b--')
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        label='Logistic Regression (area = %0.2f)' % auc_value,
    )
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()




In [None]:

plot_roc_curve(logreg, X_test, y_test)

In [None]:
# This is not a good model

# Dealing with Imbalanced Classes

In [None]:
df['target'].describe()

In [None]:
# Hyperparameter tuning: balance the classes using class_weight='balanced'

In [None]:
# class imbalance method 1
def run_model_bweights(X_train,X_test,y_train,y_test):
    """Fit a logistic regression with ``class_weight='balanced'`` and print its evaluation.

    Prints the classification report and the ROC AUC on the test set. The
    fitted estimator and the test-set predictions are stored in the
    module-level globals ``logreg`` and ``y_pred``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the report.
    """
    global logreg, y_pred

    logreg = LogisticRegression(random_state=13, class_weight='balanced')
    logreg.fit(X_train, y_train)

    # Hard class predictions feed the classification report.
    y_pred = logreg.predict(X_test)

    # ROC AUC needs the positive-class probability, not the hard 0/1 labels.
    y_score = logreg.predict_proba(X_test)[:, 1]
    logit_roc_auc = roc_auc_score(y_test, y_score)

    print(classification_report(y_test, y_pred))
    print('The area unedr the curve is: %0.2f'%logit_roc_auc)

In [None]:
run_model_bweights(X_train,X_test,y_train,y_test)

In [None]:
# class imbalance method 2
def run_model_aweights(X_train,X_test,y_train,y_test,w):
    """Fit a logistic regression with explicit class weights ``w`` and print its evaluation.

    Prints the classification report and the ROC AUC on the test set. The
    fitted estimator and the test-set predictions are stored in the
    module-level globals ``logreg`` and ``y_pred``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the report.
    w
        Value passed to ``LogisticRegression(class_weight=...)``, e.g. a dict
        mapping each class label to its weight.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score,classification_report
    global logreg, y_pred

    logreg = LogisticRegression(random_state = 13,class_weight=w)
    logreg.fit(X_train, y_train)

    # Hard class predictions feed the classification report.
    y_pred = logreg.predict(X_test)

    # ROC AUC needs the positive-class probability, not the hard 0/1 labels.
    y_score = logreg.predict_proba(X_test)[:, 1]
    logit_roc_auc = roc_auc_score(y_test, y_score)

    print(classification_report(y_test, y_pred))
    print("The area under the curve is: %0.2f"%logit_roc_auc)

In [None]:
# Explicit class weights: class 0 gets weight 90, class 1 gets weight 10.
# NOTE(review): if class 1 (looking for a job change) is the minority class, as
# the surrounding imbalance handling suggests, these weights favour the majority
# class -- confirm that they are not swapped.
run_model_aweights(X_train,X_test,y_train,y_test,{0:90, 1:10})

In [None]:
# Also we can resample our dataset

In [None]:
from sklearn.utils import resample

In [None]:
# class imbalance method 3
def adjust_imbalance(X_train,y_train,class_col):
    """Return a class-balanced frame built from the training features and labels.

    Class 1 is resampled to the size of class 0 and concatenated with all
    class-0 rows, so both classes end up with ``len(class0)`` rows.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels; must be named ``class_col``.
    class_col : str
        Name of the binary (0/1) target column.

    Returns
    -------
    pandas.DataFrame
        Features plus ``class_col``, with equal row counts for both classes.
    """
    train = pd.concat([X_train, y_train], axis=1)

    # Separate the two classes.
    class0 = train[train[class_col] == 0]
    class1 = train[train[class_col] == 1]

    if len(class1) < len(class0):
        # Case 1 - class 1 is the minority: bootstrap it (with replacement)
        # up to the size of class 0.
        resampled = resample(class1, replace=True, n_samples=len(class0), random_state=10)
    else:
        # Case 2 - class 1 is at least as large as class 0: draw from it
        # without replacement down to the size of class 0.
        resampled = resample(class1, replace=False, n_samples=len(class0), random_state=10)

    # pd.concat is the module-level function; DataFrame has no .concat method,
    # so the previous `df.concat(...)` raised AttributeError in case 2.
    return pd.concat([resampled, class0])


In [None]:
resampled_df = adjust_imbalance(X_train,y_train,class_col = 'target')

In [None]:
# Train on the balanced rows but keep evaluating on the untouched hold-out split.
# Re-splitting resampled_df would place bootstrapped copies of the same rows on
# both sides of the split (data leakage) and would score the model on an
# artificially balanced test set. X_test / y_test stay as produced by the earlier
# prepare_model(df, ...) call, whose feature columns resampled_df already shares.
X_train = resampled_df.drop(columns=['target'])
y_train = resampled_df['target']

In [None]:
run_model(X_train,X_test,y_train,y_test)

In [None]:
# SMOTE approach

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
def prepare_model_smoote(df, class_col, col_to_exclude):
    """Split the numeric features 70/30, then oversample the training part with SMOTE.

    SMOTE (Synthetic Minority Oversampling) generates new instances from the
    existing minority cases supplied as input. It is fitted on the training
    part only; the test part is left as split. The result is stored in the
    module-level globals ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only numeric columns are used as features.
    class_col : str
        Name of the target column (removed from the features).
    col_to_exclude : list of str
        Further columns to remove from the features.
    """
    global X_train, X_test, y_train, y_test

    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = numeric_cols.difference([class_col]).difference(col_to_exclude)

    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df[class_col], test_size=0.3, random_state=0)

    oversampler = SMOTE(random_state=0)
    X_train, y_train = oversampler.fit_resample(X_train, y_train)




In [None]:
prepare_model_smoote(df,class_col='target',col_to_exclude=['id','train_hour','city_code'])

In [None]:
run_model(X_train,X_test,y_train,y_test)


# Feature selection

In [None]:
class_col = 'target'
cols_to_exclude=['id']

# function for feature selection
def var_threshold_selection(df,cols_to_exclude,class_col,threshold):
    """Select the numeric features of ``df`` whose variance exceeds ``threshold``.

    The chosen column names are printed and stored in the module-level global
    ``selected_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only numeric columns are considered.
    cols_to_exclude : list of str
        Columns to leave out of the selection.
    class_col : str
        Name of the target column (never selected).
    threshold : float
        Variance threshold, expressed in the features' own units.
    """
    from sklearn.feature_selection import VarianceThreshold

    numeric_cols = df.select_dtypes(include=np.number).columns
    X = df[numeric_cols.difference(cols_to_exclude).difference([class_col])]

    # Variances are measured on the raw features. Standardising first would give
    # every column a variance of 1, so the threshold could not tell them apart
    # and the selection would depend on floating-point noise.
    var_thr = VarianceThreshold(threshold=threshold)
    var_thr.fit(X)

    global selected_cols
    selected_cols = X.columns[var_thr.get_support()]

    print("The selected features are: ")
    print(list(selected_cols))

In [None]:
var_threshold_selection(df,cols_to_exclude=['id'],class_col = 'target',threshold=1)

In [None]:
# Build a leak-free split for the reduced feature set: split the original data
# first, then balance ONLY the training part. Splitting the already bootstrapped
# resampled_df would put copies of the same rows in both train and test.
# 'city_code' is listed explicitly because resampled_df, which this cell used
# before, did not contain it.
prepare_model(df,class_col='target',col_to_exclude=['id','train_hour','city_code',
                                                    'maj_dis_code', 'exp_code', 'ctype_code',
                                                    'lnjob_code','rel_exp_code', 'enr_univ_code'])
balanced_train = adjust_imbalance(X_train,y_train,class_col='target')
X_train = balanced_train.drop(columns=['target'])
y_train = balanced_train['target']

In [None]:
run_model(X_train,X_test,y_train,y_test)

In [None]:
# RFE for feature selection
def rfe_selection(df,cols_to_exclude,class_col,model):
    """Select features with recursive feature elimination (RFE) around ``model``.

    The chosen column names are printed and stored in the module-level global
    ``selected_cols``. Warnings are silenced as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only numeric columns are considered.
    cols_to_exclude : list of str
        Columns to leave out of the selection.
    class_col : str
        Name of the target column.
    model
        Estimator handed to ``RFE``.
    """
    import warnings
    warnings.filterwarnings("ignore")
    import numpy as np
    from sklearn.feature_selection import RFE

    global selected_cols

    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = numeric_cols.difference(cols_to_exclude).difference([class_col])
    features = df[feature_cols]
    labels = df[class_col]

    selector = RFE(model).fit(features, labels)
    # support_ is the boolean mask of the features RFE kept.
    selected_cols = features.columns[selector.support_]

    print("The selected features are: ")
    print(list(selected_cols))

In [None]:
rfe_selection(df,class_col = 'target',cols_to_exclude=['id'],model=logreg)

In [None]:
# Build a leak-free split for the RFE feature set: split the original data
# first, then balance ONLY the training part. Splitting the already bootstrapped
# resampled_df would put copies of the same rows in both train and test.
prepare_model(df,class_col = 'target',col_to_exclude=['id','index', 'train_hour', 'city_code',
                                                      'maj_dis_code','exp_code','ctype_code', 'lnjob_code'])
balanced_train = adjust_imbalance(X_train,y_train,class_col='target')
X_train = balanced_train.drop(columns=['target'])
y_train = balanced_train['target']

run_model(X_train,X_test,y_train,y_test)

In [None]:
# Select only numeric columns
numeric_columns = df.select_dtypes(include='number').columns

numeric_columns


#Saving & Running the Model

In [None]:
# save the model using pickle function
import pickle
# A context manager guarantees the file is flushed and closed once the dump finishes.
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(logreg, model_file)

In [None]:
# load the saved model
# A context manager closes the file handle after loading.
# NOTE: only unpickle files from a trusted source -- pickle can execute
# arbitrary code while loading.
with open('model1.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# make predictions on the test data
model.predict(X_test)

#Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
def prepare_model_smoote(df, class_col, col_to_exclude):
    """Split the numeric features 70/30, then oversample the training part with SMOTE.

    SMOTE (Synthetic Minority Oversampling) generates new instances from the
    existing minority cases supplied as input. It is fitted on the training
    part only; the test part is left as split. The result is stored in the
    module-level globals ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only numeric columns are used as features.
    class_col : str
        Name of the target column (removed from the features).
    col_to_exclude : list of str
        Further columns to remove from the features.
    """
    global X_train, X_test, y_train, y_test

    numeric_cols = df.select_dtypes(include=np.number).columns
    feature_cols = numeric_cols.difference([class_col]).difference(col_to_exclude)

    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df[class_col], test_size=0.3, random_state=0)

    oversampler = SMOTE(random_state=0)
    X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
prepare_model_smoote(df,class_col='target',col_to_exclude=['id','train_hour','city_code'])

In [None]:
def run_model(X_train,X_test,y_train,y_test):
    """Fit an entropy-criterion decision tree, then print the classification report and the ROC AUC.

    This definition replaces the ``run_model`` defined earlier in the file for
    logistic regression. The fitted tree and the test-set predictions are
    stored in the module-level globals ``dectree`` and ``y_pred``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the report.
    """
    global dectree, y_pred

    dectree = DecisionTreeClassifier(random_state=13,criterion='entropy')
    dectree.fit(X_train, y_train)

    # Hard class predictions feed the classification report.
    y_pred = dectree.predict(X_test)

    # ROC AUC needs the positive-class probability, not the hard 0/1 labels.
    y_score = dectree.predict_proba(X_test)[:, 1]
    dectree_roc_auc = roc_auc_score(y_test, y_score)

    print(classification_report(y_test, y_pred))
    print('The area unedr the curve is: %0.2f'%dectree_roc_auc)

In [None]:
run_model(X_train,X_test,y_train,y_test)

In [None]:
def plot_roc_curve(dectree, X_test, y_test):
    """Plot the ROC curve of a fitted decision tree on the test set.

    This definition replaces the ``plot_roc_curve`` defined earlier in the
    file; only the legend label differs.

    Parameters
    ----------
    dectree
        Fitted classifier exposing ``predict_proba``.
    X_test
        Test features.
    y_test
        True test labels.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = dectree.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    auc_value = roc_auc_score(y_test, positive_scores)

    plt.figure()
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # Dashed diagonal as a visual baseline.
    plt.plot([0, 1], [0, 1], 'b--')
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        label='Decision tree (area = %0.2f)' % auc_value,
    )
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
plot_roc_curve(dectree,X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix as sklearn_confusion_matrix

def print_confusion_matrix(y_test, y_pred):
    """Print the confusion matrix followed by its TN, TP, FN and FP counts.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels.
    """
    matrix = sklearn_confusion_matrix(y_test, y_pred)
    print(matrix)

    # The flattened matrix is unpacked into exactly four cells, so this expects
    # a two-class problem.
    true_neg, false_pos, false_neg, true_pos = matrix.ravel()
    report_lines = (
        ('TN:%0.2f', true_neg),
        ('TP:%0.2f', true_pos),
        ('FN:%0.2f', false_neg),
        ('FP:%0.2f', false_pos),
    )
    for template, count in report_lines:
        print(template % count)

print_confusion_matrix(y_test, y_pred)

#Visualizing decision tree

In [None]:
from sklearn.tree import plot_tree

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

def plot_model(model, class_names, max_depth=None, figsize=(20,20), fontsize=1):
    """Draw a fitted decision tree with ``sklearn.tree.plot_tree``.

    Parameters
    ----------
    model
        Fitted tree; its ``feature_names_in_`` attribute labels the splits.
    class_names
        Forwarded to ``plot_tree`` as ``class_names``.
    max_depth : int or None
        Forwarded to ``plot_tree`` as ``max_depth``.
    figsize : tuple
        Size of the matplotlib figure.
    fontsize : int
        Font size of the node text.
    """
    plt.figure(figsize=figsize)
    tree_options = dict(
        feature_names=model.feature_names_in_,
        class_names=class_names,
        fontsize=fontsize,
        max_depth=max_depth,
        filled=True,
    )
    plot_tree(model, **tree_options)
    plt.show()

# class_names must hold one label per class, in ascending class order
# (0 = not looking for a job change, 1 = looking). The column name 'target'
# is not a list of class labels.
plot_model(dectree, ['Not looking', 'Looking'])


In [None]:
# class_names must hold one label per class, in ascending class order
# (0 = not looking for a job change, 1 = looking). The column name 'target'
# is not a list of class labels.
plot_model(dectree, ['Not looking', 'Looking'], max_depth=2,figsize=(20,20),fontsize=10)

In [None]:
def plot_feature_importances(model):
    """Draw a bar chart of ``model.feature_importances_``, largest first.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_`` and
        ``feature_names_in_``.
    """
    ranked = pd.Series(
        model.feature_importances_,
        index=model.feature_names_in_,
    ).sort_values(ascending=False)

    figure, axes = plt.subplots()
    # The bar plot is drawn on the current axes, i.e. the ones just created.
    ranked.plot.bar()
    axes.set_title("Feature importances")
    figure.tight_layout()

In [None]:
plot_feature_importances(dectree)