In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, VarianceThreshold
import plotly.express as px

In [3]:
career_dataset = pd.read_csv("../Datasets/career_pred.csv")

# 1.) Exploratory Data Analysis

In [4]:
career_dataset.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Portal Administrator
2,71,86,91,87,61,81,72,72,94,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Portal Administrator
3,76,87,60,84,89,73,62,88,69,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Systems Security Administrator
4,92,62,90,67,71,89,73,71,73,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Business Systems Analyst


In [5]:
career_dataset.shape

(20000, 39)

In [6]:
career_dataset.columns

Index(['Acedamic percentage in Operating Systems', 'percentage in Algorithms',
       'Percentage in Programming Concepts',
       'Percentage in Software Engineering', 'Percentage in Computer Networks',
       'Percentage in Electronics Subjects',
       'Percentage in Computer Architecture', 'Percentage in Mathematics',
       'Percentage in Communication skills', 'Hours working per day',
       'Logical quotient rating', 'hackathons', 'coding skills rating',
       'public speaking points', 'can work long time before system?',
       'self-learning capability?', 'Extra-courses did', 'certifications',
       'workshops', 'talenttests taken?', 'olympiads',
       'reading and writing skills', 'memory capability score',
       'Interested subjects', 'interested career area ', 'Job/Higher Studies?',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Rea

In [7]:
career_dataset.describe()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,Logical quotient rating,hackathons,coding skills rating,public speaking points
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,77.0023,76.9482,77.01755,77.0945,76.9582,77.01555,77.06985,76.9131,76.92145,7.98595,4.99505,2.99265,5.00475,5.0128
std,10.085697,10.101733,10.134815,10.087837,10.020088,10.168888,10.069059,10.138555,10.103494,2.593798,2.578383,2.005791,2.576831,2.588875
min,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,4.0,1.0,0.0,1.0,1.0
25%,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,6.0,3.0,1.0,3.0,3.0
50%,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,8.0,5.0,3.0,5.0,5.0
75%,86.0,86.0,86.0,86.0,85.0,86.0,86.0,86.0,86.0,10.0,7.0,5.0,7.0,7.0
max,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,12.0,9.0,6.0,9.0,9.0


In [8]:
career_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Acedamic percentage in Operating Systems  20000 non-null  int64 
 1   percentage in Algorithms                  20000 non-null  int64 
 2   Percentage in Programming Concepts        20000 non-null  int64 
 3   Percentage in Software Engineering        20000 non-null  int64 
 4   Percentage in Computer Networks           20000 non-null  int64 
 5   Percentage in Electronics Subjects        20000 non-null  int64 
 6   Percentage in Computer Architecture       20000 non-null  int64 
 7   Percentage in Mathematics                 20000 non-null  int64 
 8   Percentage in Communication skills        20000 non-null  int64 
 9   Hours working per day                     20000 non-null  int64 
 10  Logical quotient rating                   2000

In [9]:
career_dataset.isnull().sum()

Acedamic percentage in Operating Systems    0
percentage in Algorithms                    0
Percentage in Programming Concepts          0
Percentage in Software Engineering          0
Percentage in Computer Networks             0
Percentage in Electronics Subjects          0
Percentage in Computer Architecture         0
Percentage in Mathematics                   0
Percentage in Communication skills          0
Hours working per day                       0
Logical quotient rating                     0
hackathons                                  0
coding skills rating                        0
public speaking points                      0
can work long time before system?           0
self-learning capability?                   0
Extra-courses did                           0
certifications                              0
workshops                                   0
talenttests taken?                          0
olympiads                                   0
reading and writing skills        

In [10]:
def cats_per_feature(feature):
  print("{}: {}".format(feature,career_dataset[feature].nunique()))

In [11]:
for feature in career_dataset.columns:
  cats_per_feature(feature)

Acedamic percentage in Operating Systems: 35
percentage in Algorithms: 35
Percentage in Programming Concepts: 35
Percentage in Software Engineering: 35
Percentage in Computer Networks: 35
Percentage in Electronics Subjects: 35
Percentage in Computer Architecture: 35
Percentage in Mathematics: 35
Percentage in Communication skills: 35
Hours working per day: 9
Logical quotient rating: 9
hackathons: 7
coding skills rating: 9
public speaking points: 9
can work long time before system?: 2
self-learning capability?: 2
Extra-courses did: 2
certifications: 9
workshops: 8
talenttests taken?: 2
olympiads: 2
reading and writing skills: 3
memory capability score: 3
Interested subjects: 10
interested career area : 6
Job/Higher Studies?: 2
Type of company want to settle in?: 10
Taken inputs from seniors or elders: 2
interested in games: 2
Interested Type of Books: 31
Salary Range Expected: 2
In a Realtionship?: 2
Gentle or Tuff behaviour?: 2
Management or Technical: 2
Salary/work: 2
hard/smart worke

In [12]:
cont_features = list(career_dataset.columns[0:14])

In [13]:
cat_features = career_dataset.iloc[:,:-1].drop(cont_features,axis=1).columns

In [14]:
cont_features

['Acedamic percentage in Operating Systems',
 'percentage in Algorithms',
 'Percentage in Programming Concepts',
 'Percentage in Software Engineering',
 'Percentage in Computer Networks',
 'Percentage in Electronics Subjects',
 'Percentage in Computer Architecture',
 'Percentage in Mathematics',
 'Percentage in Communication skills',
 'Hours working per day',
 'Logical quotient rating',
 'hackathons',
 'coding skills rating',
 'public speaking points']

In [15]:
cat_features

Index(['can work long time before system?', 'self-learning capability?',
       'Extra-courses did', 'certifications', 'workshops',
       'talenttests taken?', 'olympiads', 'reading and writing skills',
       'memory capability score', 'Interested subjects',
       'interested career area ', 'Job/Higher Studies?',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Realtionship?', 'Gentle or Tuff behaviour?',
       'Management or Technical', 'Salary/work', 'hard/smart worker',
       'worked in teams ever?', 'Introvert'],
      dtype='object')

In [16]:
len(cont_features) + len(cat_features)

38

In [17]:
cont_career_dataset = career_dataset[cont_features]
cat_career_dataset = career_dataset[cat_features]

In [18]:
cont_career_dataset.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,Logical quotient rating,hackathons,coding skills rating,public speaking points
0,69,63,78,87,94,94,87,84,61,9,4,0,4,8
1,78,62,73,60,71,70,73,84,91,12,7,1,2,3
2,71,86,91,87,61,81,72,72,94,11,1,4,1,3
3,76,87,60,84,89,73,62,88,69,7,1,1,2,5
4,92,62,90,67,71,89,73,71,73,4,5,4,6,3


In [19]:
cat_career_dataset.head()

Unnamed: 0,can work long time before system?,self-learning capability?,Extra-courses did,certifications,workshops,talenttests taken?,olympiads,reading and writing skills,memory capability score,Interested subjects,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,yes,yes,yes,shell programming,cloud computing,no,yes,excellent,excellent,cloud computing,...,no,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no
1,yes,no,yes,machine learning,database security,no,no,poor,medium,networks,...,yes,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes
2,yes,no,yes,app development,web technologies,no,yes,poor,excellent,hacking,...,yes,Travel,Work,no,stubborn,Management,work,hard worker,no,yes
3,no,yes,no,python,data science,yes,no,medium,excellent,networks,...,no,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes
4,no,no,no,app development,cloud computing,no,no,poor,excellent,Computer Architecture,...,yes,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes


# 2.) Data Cleaning

### 2.a) Scaling Continuous features

In [20]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cont_career_dataset)
cont_career_dataset = pd.DataFrame(scaled_data,columns=cont_career_dataset.columns)

In [21]:
cont_career_dataset.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,Logical quotient rating,hackathons,coding skills rating,public speaking points
0,-0.79345,-1.380807,0.096941,0.98195,1.700806,1.670279,0.986229,0.699022,-1.575875,0.390962,-0.38593,-1.492042,-0.389927,1.153889
1,0.098925,-1.479803,-0.396421,-1.694608,-0.59464,-0.689921,-0.404204,0.699022,1.393469,1.547595,0.777619,-0.993473,-1.166093,-0.7775
2,-0.595145,0.896086,1.37968,0.98195,-1.59266,0.391837,-0.50352,-0.484608,1.690403,1.162051,-1.549479,0.502233,-1.554176,-0.7775
3,-0.099381,0.995082,-1.67916,0.684554,1.201796,-0.394896,-1.496687,1.093566,-0.78405,-0.380128,-1.549479,-0.993473,-1.166093,-0.004944
4,1.487064,-1.479803,1.281008,-1.000686,-0.59464,1.17857,-0.404204,-0.583244,-0.388138,-1.536762,0.00192,0.502233,0.38624,-0.7775


### 2.b) Label encoding categorical features

In [22]:
label_encoder = LabelEncoder()

In [23]:
def encode_feature(feature):
  cat_career_dataset[feature] = label_encoder.fit_transform(cat_career_dataset[feature])

In [24]:
for feature in cat_career_dataset.columns:
  encode_feature(feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [25]:
cat_career_dataset.head()

Unnamed: 0,can work long time before system?,self-learning capability?,Extra-courses did,certifications,workshops,talenttests taken?,olympiads,reading and writing skills,memory capability score,Interested subjects,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,1,1,1,8,0,0,1,0,0,4,...,0,21,1,0,1,0,0,0,1,0
1,1,0,1,5,2,0,0,2,1,7,...,1,5,1,1,0,1,0,0,0,1
2,1,0,1,0,7,0,1,2,0,6,...,1,29,0,0,1,0,1,0,0,1
3,0,1,0,6,1,1,0,1,0,7,...,0,23,0,1,0,0,1,1,1,1
4,0,0,0,0,0,0,0,2,0,0,...,1,7,1,0,1,0,1,0,1,1


### 2.c) Feature Selection

In [26]:
X = pd.concat([cont_career_dataset,cat_career_dataset],axis=1)
y = pd.DataFrame(career_dataset.iloc[:,-1],columns=['Suggested Job Role'])

In [27]:
X.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,-0.79345,-1.380807,0.096941,0.98195,1.700806,1.670279,0.986229,0.699022,-1.575875,0.390962,...,0,21,1,0,1,0,0,0,1,0
1,0.098925,-1.479803,-0.396421,-1.694608,-0.59464,-0.689921,-0.404204,0.699022,1.393469,1.547595,...,1,5,1,1,0,1,0,0,0,1
2,-0.595145,0.896086,1.37968,0.98195,-1.59266,0.391837,-0.50352,-0.484608,1.690403,1.162051,...,1,29,0,0,1,0,1,0,0,1
3,-0.099381,0.995082,-1.67916,0.684554,1.201796,-0.394896,-1.496687,1.093566,-0.78405,-0.380128,...,0,23,0,1,0,0,1,1,1,1
4,1.487064,-1.479803,1.281008,-1.000686,-0.59464,1.17857,-0.404204,-0.583244,-0.388138,-1.536762,...,1,7,1,0,1,0,1,0,1,1


In [28]:
y.head()

Unnamed: 0,Suggested Job Role
0,Database Developer
1,Portal Administrator
2,Portal Administrator
3,Systems Security Administrator
4,Business Systems Analyst


In [29]:
y['Suggested Job Role'] = label_encoder.fit_transform(y['Suggested Job Role'])

In [30]:
y.head()

Unnamed: 0,Suggested Job Role
0,7
1,18
2,18
3,28
4,2


In [31]:
X.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,-0.79345,-1.380807,0.096941,0.98195,1.700806,1.670279,0.986229,0.699022,-1.575875,0.390962,...,0,21,1,0,1,0,0,0,1,0
1,0.098925,-1.479803,-0.396421,-1.694608,-0.59464,-0.689921,-0.404204,0.699022,1.393469,1.547595,...,1,5,1,1,0,1,0,0,0,1
2,-0.595145,0.896086,1.37968,0.98195,-1.59266,0.391837,-0.50352,-0.484608,1.690403,1.162051,...,1,29,0,0,1,0,1,0,0,1
3,-0.099381,0.995082,-1.67916,0.684554,1.201796,-0.394896,-1.496687,1.093566,-0.78405,-0.380128,...,0,23,0,1,0,0,1,1,1,1
4,1.487064,-1.479803,1.281008,-1.000686,-0.59464,1.17857,-0.404204,-0.583244,-0.388138,-1.536762,...,1,7,1,0,1,0,1,0,1,1


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=1)

In [33]:
X_train.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
18778,0.29723,-0.885831,-0.79111,0.387159,-0.59464,-0.689921,-0.80147,1.488109,0.997556,0.390962,...,1,13,1,1,1,1,1,1,1,0
16733,-1.289214,-0.390854,1.37968,-0.70329,-1.393056,1.17857,-0.80147,-0.287336,0.700622,-1.536762,...,0,25,0,0,0,1,1,1,0,1
18797,0.594689,0.797091,-0.593765,-0.70329,1.4014,0.096812,-0.006937,-0.385972,-1.080985,0.390962,...,1,22,1,0,1,0,1,0,1,0
7317,1.288758,0.995082,0.787646,1.57674,0.004172,0.391837,-0.304887,0.501751,1.294491,-1.151217,...,0,15,0,0,1,0,1,1,0,0
12707,-1.586673,-1.677794,-0.100404,-1.595476,-1.293254,-1.574995,0.688279,-1.175059,-1.080985,-0.765672,...,0,17,1,1,0,1,1,1,1,1


In [34]:
y_train.head()

Unnamed: 0,Suggested Job Role
18778,9
16733,26
18797,25
7317,20
12707,5


#### Seeing if there is any constant column

In [35]:
variance_threshold = VarianceThreshold(threshold=0)
vt_features = variance_threshold.fit(X_train)
vt_features.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

##### Getting the mutual information of the features

In [36]:
mutual_info = mutual_info_classif(X_train,y_train['Suggested Job Role'])
mutual_info

array([0.00508372, 0.        , 0.00775001, 0.        , 0.        ,
       0.0075857 , 0.00317222, 0.        , 0.00118916, 0.        ,
       0.01538252, 0.        , 0.        , 0.00160793, 0.        ,
       0.00066376, 0.        , 0.        , 0.        , 0.00287898,
       0.        , 0.        , 0.00148878, 0.        , 0.        ,
       0.        , 0.00095279, 0.        , 0.00326588, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00232434,
       0.00865937, 0.        , 0.        ])

In [37]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending =False,inplace=True)

In [38]:
mutual_info

Logical quotient rating                     0.015383
hard/smart worker                           0.008659
Percentage in Programming Concepts          0.007750
Percentage in Electronics Subjects          0.007586
Acedamic percentage in Operating Systems    0.005084
interested in games                         0.003266
Percentage in Computer Architecture         0.003172
talenttests taken?                          0.002879
Salary/work                                 0.002324
public speaking points                      0.001608
memory capability score                     0.001489
Percentage in Communication skills          0.001189
Type of company want to settle in?          0.000953
self-learning capability?                   0.000664
Job/Higher Studies?                         0.000000
worked in teams ever?                       0.000000
Percentage in Software Engineering          0.000000
Management or Technical                     0.000000
Gentle or Tuff behaviour?                   0.

In [39]:
mutual_info_dataframe = pd.DataFrame(mutual_info, index=mutual_info.index, columns=['Mutual Information'])
mutual_info_dataframe

Unnamed: 0,Mutual Information
Logical quotient rating,0.015383
hard/smart worker,0.008659
Percentage in Programming Concepts,0.00775
Percentage in Electronics Subjects,0.007586
Acedamic percentage in Operating Systems,0.005084
interested in games,0.003266
Percentage in Computer Architecture,0.003172
talenttests taken?,0.002879
Salary/work,0.002324
public speaking points,0.001608


In [40]:
fig =px.bar(mutual_info_dataframe, x=mutual_info_dataframe.index,y='Mutual Information')
fig.show()

##### Here we see the features which have high infomation

In [41]:
important_features = mutual_info[0:11]

In [42]:
important_features

Logical quotient rating                     0.015383
hard/smart worker                           0.008659
Percentage in Programming Concepts          0.007750
Percentage in Electronics Subjects          0.007586
Acedamic percentage in Operating Systems    0.005084
interested in games                         0.003266
Percentage in Computer Architecture         0.003172
talenttests taken?                          0.002879
Salary/work                                 0.002324
public speaking points                      0.001608
memory capability score                     0.001489
dtype: float64

In [43]:
k_best_selector = SelectKBest(mutual_info_classif,k=11)
k_best_selector.fit(X_train,y_train['Suggested Job Role'])

SelectKBest(k=11,
            score_func=<function mutual_info_classif at 0x0000020C21B4B5E0>)

In [44]:
k_best_selector.get_support()

array([False,  True,  True, False, False, False,  True, False,  True,
       False, False, False,  True, False, False, False,  True,  True,
       False,  True, False, False, False,  True, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False])

In [45]:
feature_selected_by_kBest = X.columns[k_best_selector.get_support()]
feature_selected_by_kBest

Index(['percentage in Algorithms', 'Percentage in Programming Concepts',
       'Percentage in Computer Architecture',
       'Percentage in Communication skills', 'coding skills rating',
       'Extra-courses did', 'certifications', 'talenttests taken?',
       'Interested subjects', 'interested in games', 'worked in teams ever?'],
      dtype='object')

Index(['percentage in Algorithms', 'hackathons', 'public speaking points',
       'certifications', 'talenttests taken?', 'Interested subjects',
       'Type of company want to settle in?', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected', 'Salary/work'],
      dtype='object')

In [46]:
X_train = X_train[important_features.index]

In [47]:
X_train.head()

Unnamed: 0,Logical quotient rating,hard/smart worker,Percentage in Programming Concepts,Percentage in Electronics Subjects,Acedamic percentage in Operating Systems,interested in games,Percentage in Computer Architecture,talenttests taken?,Salary/work,public speaking points,memory capability score
18778,-0.38593,1,-0.79111,-0.689921,0.29723,1,-0.80147,0,1,-1.163778,0
16733,0.00192,1,1.37968,1.17857,-1.289214,0,-0.80147,0,1,-0.391222,1
18797,-0.38593,0,-0.593765,0.096812,0.594689,1,-0.006937,0,1,1.153889,2
7317,-1.161629,1,0.787646,0.391837,1.288758,0,-0.304887,0,1,-0.004944,0
12707,-0.77378,1,-0.100404,-1.574995,-1.586673,0,0.688279,0,1,-0.004944,1


In [48]:
X_test = X_test[important_features.index]

# 3.) Model Preparation

In [49]:
dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC()

In [50]:
dt_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [51]:
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [52]:
svm_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [53]:
param_grid = {'bootstrap': [True,False],
 'criterion': ['gini'],
 'n_estimators': [5,10,15,20,25,30,40,50],
}

In [54]:
best_params = {}

In [55]:
grid_search_clf = GridSearchCV(rf_model,param_grid = param_grid, cv=5)

In [63]:
grid_search_clf.fit(X_train,y_train['Suggested Job Role'])

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'criterion': ['gini'],
                         'n_estimators': [5, 10, 15, 20, 25, 30, 40, 50]})

In [64]:
pd.DataFrame(grid_search_clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.115693,0.018594,0.008965,0.001383,True,gini,5,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.028358,0.031716,0.030224,0.038433,0.027612,0.031269,0.003859,9
1,0.212518,0.026808,0.016055,4.5e-05,True,gini,10,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.030597,0.033209,0.026493,0.027239,0.029478,0.029403,0.002411,13
2,0.39106,0.042159,0.026926,0.006891,True,gini,15,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.028731,0.027612,0.026119,0.029478,0.03209,0.028806,0.001994,14
3,0.394675,0.014555,0.022483,0.00609,True,gini,20,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.028731,0.025746,0.033582,0.031343,0.023881,0.028657,0.003543,15
4,0.633956,0.087718,0.03653,0.010564,True,gini,25,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.03806,0.032463,0.035821,0.026493,0.030597,0.032687,0.004037,1
5,1.318698,0.532892,0.088292,0.031789,True,gini,30,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.030224,0.029478,0.034701,0.036567,0.024627,0.031119,0.004199,10
6,2.607351,0.118818,0.14636,0.012611,True,gini,40,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.031716,0.03209,0.03209,0.026493,0.035821,0.031642,0.002979,6
7,3.007181,0.070811,0.176129,0.019599,True,gini,50,"{'bootstrap': True, 'criterion': 'gini', 'n_es...",0.031716,0.035075,0.032463,0.028731,0.032836,0.032164,0.002049,3
8,0.496353,0.046565,0.025978,0.004044,False,gini,5,"{'bootstrap': False, 'criterion': 'gini', 'n_e...",0.027239,0.025,0.032836,0.027612,0.037687,0.030075,0.004592,12
9,0.800937,0.381793,0.031398,0.018038,False,gini,10,"{'bootstrap': False, 'criterion': 'gini', 'n_e...",0.027239,0.025746,0.033209,0.025373,0.025,0.027313,0.003044,16


In [65]:
best_params = grid_search_clf.best_params_

In [66]:
best_params

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 25}

In [84]:
best_params = {'bootstrap': [True], 'criterion': ['gini'], 'n_estimators': [100]}

In [91]:
grid_search_clf_best = GridSearchCV(rf_model,param_grid = best_params, cv=100)
grid_search_clf_best.fit(X_train,y_train['Suggested Job Role'])

GridSearchCV(cv=100, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True], 'criterion': ['gini'],
                         'n_estimators': [100]})

In [92]:
grid_search_clf_best.score(X_train,y_train['Suggested Job Role'])

1.0

In [93]:
grid_search_clf_best.score(X_test,y_test['Suggested Job Role'])

0.03333333333333333

In [69]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train,y_train)

DecisionTreeClassifier()

In [70]:
dt_prediction_train = dt_model.predict(X_train)
dt_prediction_test = dt_model.predict(X_test)

In [71]:
print("On Training Data: ",accuracy_score(y_train,dt_prediction_train))
print("On Test Data: ",accuracy_score(y_test,dt_prediction_test))

On Training Data:  1.0
On Test Data:  0.030757575757575758


In [72]:
# feature_selector = SelectKBest(score_func=chi2,k=11)
# feature_selector.fit(X_train,y_train)
# X_train_fs= feature_selector.transform(X_train)
# X_test_fs= feature_selector.transform(X_test)

In [73]:
# len(feature_selector.scores_)

In [74]:
# career_dataset.shape[1]

In [75]:
import xgboost as xgb

In [76]:
xgb_model = xgb.XGBClassifier()

In [77]:
 
xgb_model.fit(X_train, y_train)
xgb_y_pred  = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test,xgb_y_pred)
print("accuracy=",xgb_accuracy*100)




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



accuracy= 3.196969696969697


In [78]:
from sklearn.ensemble import RandomForestClassifier
rf_model  = RandomForestClassifier()

In [79]:
from sklearn.model_selection import GridSearchCV

In [80]:
n_estimators = [int(i) for i in np.linspace(10,80,10)]
max_features = ['auto','sqrt']
max_depth = [5,10]
min_sample_split = [5,10]
min_samples_leaf = [5,10]
bootstrap = [True,False]

In [81]:
param_grid = {
  'n_estimators' : n_estimators,
  'max_features': max_features,
  'max_depth': max_depth,
  'min_samples_split': min_sample_split,
  'min_samples_leaf': min_samples_leaf,
 'bootstrap': bootstrap
}

print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10], 'min_samples_split': [5, 10], 'min_samples_leaf': [5, 10], 'bootstrap': [True, False]}
