In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Location of the raw employment dataset.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run where this exact file exists.
DATA_PATH = r"D:\Innomatics\8.Machine Learning Module\Projects\Datasets\Employment_Data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73462 entries, 0 to 73461
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      73462 non-null  int64  
 1   Age             73462 non-null  object 
 2   Accessibility   73462 non-null  object 
 3   EdLevel         73462 non-null  object 
 4   Employment      73462 non-null  int64  
 5   Gender          73462 non-null  object 
 6   MentalHealth    73462 non-null  object 
 7   MainBranch      73462 non-null  object 
 8   YearsCode       73462 non-null  int64  
 9   YearsCodePro    73462 non-null  int64  
 10  Country         73462 non-null  object 
 11  PreviousSalary  73462 non-null  float64
 12  HaveWorkedWith  73399 non-null  object 
 13  ComputerSkills  73462 non-null  int64  
 14  Employed        73462 non-null  int64  
dtypes: float64(1), int64(6), object(8)
memory usage: 8.4+ MB


In [4]:
# Preview the first few rows of the raw frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed
0,0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4,0
1,1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12,1
2,2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7,0
3,3,<35,No,Undergraduate,1,Man,No,Dev,9,6,Canada,46135.0,Bash/Shell;HTML/CSS;JavaScript;PHP;Ruby;SQL;Gi...,13,0
4,4,>35,No,PhD,0,Man,No,NotDev,40,30,Singapore,160932.0,C++;Python,2,0


In [5]:
# "Unnamed: 0" is dropped here; in the preview above it simply repeats the
# row number, so it is not kept as a column.
df = df.drop("Unnamed: 0", axis=1)

In [6]:
# Count missing values per column.
df.isna().sum()

Age                0
Accessibility      0
EdLevel            0
Employment         0
Gender             0
MentalHealth       0
MainBranch         0
YearsCode          0
YearsCodePro       0
Country            0
PreviousSalary     0
HaveWorkedWith    63
ComputerSkills     0
Employed           0
dtype: int64

In [7]:
# Rows with no recorded skills get the placeholder "Unknown" so the column
# contains only strings before it is split.
df = df.assign(HaveWorkedWith=df["HaveWorkedWith"].fillna("Unknown"))

In [8]:
# Turn each ';'-separated skill string into a list of skill names.
skill_lists = df["HaveWorkedWith"].str.split(pat=";")
df["HaveWorkedWith"] = skill_lists

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
# One list of skill names per row.
skills_list = df['HaveWorkedWith'].tolist()

# Learn the skill vocabulary; fit() returns the fitted binarizer itself.
mlb_computerskills = MultiLabelBinarizer().fit(skills_list)

# Persist the fitted binarizer so the same vocabulary can be reloaded later.
with open("mlb_computerskills.pkl", "wb") as pkl_file:
    pickle.dump(mlb_computerskills, pkl_file)

In [10]:

# 'HaveWorkedWith' holds one list of skill names per row, e.g. ['Python', 'SQL'].
skills_column = df['HaveWorkedWith'].tolist()

# Fit a second binarizer on the same lists (fit() returns the fitted object).
mlb_skills = MultiLabelBinarizer().fit(skills_column)

# Write the fitted binarizer to its own pickle file.
with open("mlb_skills.pkl", "wb") as pkl_file:
    pickle.dump(mlb_skills, pkl_file)


In [11]:
# Replace the list-valued 'HaveWorkedWith' column with one 0/1 indicator
# column per distinct skill.
mlb = MultiLabelBinarizer()
skill_lists = df.pop('HaveWorkedWith')  # removes the column from df in place
skill_flags = pd.DataFrame(
    mlb.fit_transform(skill_lists),
    columns=mlb.classes_,
    index=df.index,
)
df = df.join(skill_flags)

In [12]:
# Frame dimensions after the skills were expanded into indicator columns.
df.shape

(73462, 130)

In [13]:
# Number of distinct values in the Country column.
df["Country"].nunique()

172

In [14]:
# Country is removed from the feature set (presumably because of its high
# cardinality, counted in the previous cell).
df = df.drop("Country", axis=1)

## **Selecting Target Variable and Predictors:**

In [15]:
# Predictors are every column except the 'Employed' target.
X = df.drop(columns=["Employed"])
y = df["Employed"]
# Show both shapes, target first, one per line.
print(y.shape, X.shape, sep="\n")

(73462,)
(73462, 128)


## **Feature Selection:** 

In [16]:
# Build the column order produced by the ColumnTransformer in the next cell:
# ordinal-encoded columns first, then scaled numeric columns, then every
# remaining (passthrough) column in its original order.
cat_cols = ["Age","Accessibility","EdLevel","Gender","MentalHealth","MainBranch"]
num_cols = ["YearsCode","YearsCodePro","PreviousSalary","ComputerSkills"]

# Start from all predictor names and strip out the explicitly listed ones;
# list.remove raises ValueError if a listed column is missing from X.
cols = X.columns.tolist()
for i in cat_cols + num_cols:
    cols.remove(i)

# NOTE: after this line cat_cols holds ALL column names in transformer output
# order, not only the categorical ones.
cat_cols += num_cols + cols

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OrdinalEncoder
# Encode / scale the predictors for feature scoring only.
# Positions are indices into X's columns; they are expected to line up with
# the cat_cols / num_cols lists built in the previous cell (verify if the
# column set changes). Unlisted columns pass through after the listed ones.
transformer = ColumnTransformer(
    transformers=[
        ("Ordinal_Encoding", OrdinalEncoder(), [0, 1, 2, 4, 5, 6]),
        ("MinMax_Scaler", MinMaxScaler(), [7, 8, 9, 10]),
    ],
    remainder="passthrough",
)
# cat_cols now lists every column in the transformer's output order.
X_trans = pd.DataFrame(transformer.fit_transform(X), columns=cat_cols)

In [18]:
# Mutual Information for Classification

from sklearn.feature_selection import mutual_info_classif
# Score every transformed predictor by its estimated mutual information with
# the target. The estimator is randomised (it perturbs continuous features
# with small noise), so the seed is fixed to make the ranking reproducible
# across kernel restarts.
# NOTE(review): scores are computed on the full dataset before the train/test
# split; consider scoring on the training split only to avoid leakage.
mutual_info = mutual_info_classif(X_trans, y, random_state=42)
mutual_info = pd.Series(mutual_info, index=X_trans.columns)
# Highest-scoring features first.
mutual_info.sort_values(ascending=False)

ComputerSkills    0.246862
Node.js           0.135795
TypeScript        0.123707
JavaScript        0.101844
npm               0.072112
                    ...   
Phoenix           0.000000
Accessibility     0.000000
Pulumi            0.000000
Crystal           0.000000
Play Framework    0.000000
Length: 128, dtype: float64

In [19]:
# Remove three predictors before modelling (presumably chosen from the
# mutual-information ranking above, where Accessibility scored 0.0).
X = X.drop(["YearsCodePro", "Accessibility", "Gender"], axis=1)

## **Splitting Data into Train and Test Data:**

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore every metric computed from it, is identical on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
# Print the shape of each split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(58769, 125)
(14693, 125)
(58769,)
(14693,)


In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

# Preprocessing for the model. Positions index the columns of the reduced X
# (after three predictors were dropped); going by the column order shown by
# df.info() they should be Age, EdLevel, MentalHealth, MainBranch (ordinal)
# and YearsCode, PreviousSalary, ComputerSkills (scaled) - verify if the
# column set ever changes. All other columns pass through unchanged.
ordinal_positions = [0, 1, 3, 4]
scaled_positions = [5, 6, 7]
transformer = ColumnTransformer(
    transformers=[
        ("Ordinal_encoding", OrdinalEncoder(), ordinal_positions),
        ("Standard_Scaling", StandardScaler(), scaled_positions),
    ],
    remainder="passthrough",
)

In [23]:
# Chain the preprocessing step and a Gaussian Naive Bayes classifier so both
# are fitted and applied together.
nb_pipeline = Pipeline([
    ("Transformer", transformer),
    ("GaussianNB", GaussianNB()),
])

In [24]:
# Fit the preprocessing step and the classifier on the training split.
nb_pipeline.fit(X_train,y_train)

In [25]:
# Predict labels for the held-out test split.
y_pred = nb_pipeline.predict(X_test)

In [26]:
# Let us Validate the Model
# Let us use accuracy_score
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

0.7716599741373443


In [27]:
import pickle
# Persist the fitted pipeline (preprocessing + GaussianNB) to disk.
with open("nb_pipeline.pkl","wb") as f:
    pickle.dump(nb_pipeline,f)