In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as  plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

In [2]:
#reading data
df = pd.read_csv('roo_data.csv')
df.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Portal Administrator
2,71,86,91,87,61,81,72,72,94,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Portal Administrator
3,76,87,60,84,89,73,62,88,69,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Systems Security Administrator
4,92,62,90,67,71,89,73,71,73,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Business Systems Analyst


#### We need further descriptive stats to analyse the data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Acedamic percentage in Operating Systems  20000 non-null  int64 
 1   percentage in Algorithms                  20000 non-null  int64 
 2   Percentage in Programming Concepts        20000 non-null  int64 
 3   Percentage in Software Engineering        20000 non-null  int64 
 4   Percentage in Computer Networks           20000 non-null  int64 
 5   Percentage in Electronics Subjects        20000 non-null  int64 
 6   Percentage in Computer Architecture       20000 non-null  int64 
 7   Percentage in Mathematics                 20000 non-null  int64 
 8   Percentage in Communication skills        20000 non-null  int64 
 9   Hours working per day                     20000 non-null  int64 
 10  Logical quotient rating                   2000

### Some of the inferences from the info() are as follows
> --We have a total of *39* features(0 - 38).<br/> 
> --Total 20,000 instances and none of the features cotaining null values.<br/>
> --The dataset has both type of features :- *Nominal* as well as *Numeric*.

In [4]:
feature_nominal=[]
feature_numeric=[]
for feat in df.columns:
    if df[feat].dtype==object:
        feature_nominal.append(feat)
    else:
        feature_numeric.append(feat)

def print_feat(nominal, numeric):
    
    print("The nominal features are as follows:")
    for val in nominal:
        print(val)
    print("\n")
    print("The numeric features are as follows:")
    for val in numeric:
        print(val)
    return

In [5]:
#print_feat(feature_nominal, feature_numeric)
df['Suggested Job Role'].value_counts()

Network Security Administrator               1112
Network Security Engineer                     630
Network Engineer                              621
Project Manager                               602
Database Administrator                        593
Portal Administrator                          593
Information Technology Manager                591
Software Engineer                             590
UX Designer                                   589
Design & UX                                   588
Software Developer                            587
CRM Business Analyst                          584
Business Systems Analyst                      582
Database Developer                            581
Solutions Architect                           578
Software Systems Engineer                     575
Software Quality Assurance (QA) / Testing     571
Database Manager                              570
Web Developer                                 570
CRM Technical Developer                       567


## `Let's see how the model performs without clubbing the target classes or removing irrelevant features`

In [6]:
df1=df

### Target Feature/Column is `Suggested Job Role`

#### Let's convert the nominal fetaures in `feature_nominal` to numeric 

In [7]:
for feature in feature_nominal:
    encode_feature = LabelEncoder()
    df1[feature] = encode_feature.fit_transform(df[feature])

In [8]:
df1.head().T

Unnamed: 0,0,1,2,3,4
Acedamic percentage in Operating Systems,69,78,71,76,92
percentage in Algorithms,63,62,86,87,62
Percentage in Programming Concepts,78,73,91,60,90
Percentage in Software Engineering,87,60,87,84,67
Percentage in Computer Networks,94,71,61,89,71
Percentage in Electronics Subjects,94,70,81,73,89
Percentage in Computer Architecture,87,73,72,62,73
Percentage in Mathematics,84,84,72,88,71
Percentage in Communication skills,61,91,94,69,73
Hours working per day,9,12,11,7,4


In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   Acedamic percentage in Operating Systems  20000 non-null  int64
 1   percentage in Algorithms                  20000 non-null  int64
 2   Percentage in Programming Concepts        20000 non-null  int64
 3   Percentage in Software Engineering        20000 non-null  int64
 4   Percentage in Computer Networks           20000 non-null  int64
 5   Percentage in Electronics Subjects        20000 non-null  int64
 6   Percentage in Computer Architecture       20000 non-null  int64
 7   Percentage in Mathematics                 20000 non-null  int64
 8   Percentage in Communication skills        20000 non-null  int64
 9   Hours working per day                     20000 non-null  int64
 10  Logical quotient rating                   20000 non-null  

**After the transformation, it can be seen that datatype of each feature is int32, so every column is numeric now**

In [10]:
df1['Suggested Job Role'].max()

33

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [12]:
X1 = df1.drop('Suggested Job Role', axis=1)
Y1 = df1['Suggested Job Role']

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(X1, Y1, test_size=0.2, random_state=21)


In [14]:
# we have 38 features in X 
classifier_model = MLPClassifier(hidden_layer_sizes=(38,76,33), max_iter=320, activation = 'relu',solver='adam',
                                 random_state=21)

In [15]:
classifier_model.fit(xtrain,ytrain)

MLPClassifier(hidden_layer_sizes=(38, 76, 33), max_iter=320, random_state=21)

In [16]:
pred = classifier_model.predict(xtest)

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
accuracy_score(pred, ytest)

0.05025

### `The accuracy came out to be just 5%. Let's try one more time with other parameters`

In [19]:
#predicting with sigmoid activation
classifier_model = MLPClassifier(hidden_layer_sizes=(150,200,100), max_iter=500, activation = 'logistic',solver='adam',
                                 random_state=21)

In [20]:
classifier_model.fit(xtrain, ytrain)



MLPClassifier(activation='logistic', hidden_layer_sizes=(150, 200, 100),
              max_iter=500, random_state=21)

In [21]:
pred = classifier_model.predict(xtest)
accuracy_score(pred, ytest)

0.03925

### `The accuracy still is very low---> 4.85%`

In [22]:
##Let's try with 'tanh' activation 
classifier_model = MLPClassifier(hidden_layer_sizes=(150,200,100), max_iter=500, activation = 'tanh',solver='adam',
                                 random_state=21)
classifier_model.fit(xtrain, ytrain)
pred = classifier_model.predict(xtest)
accuracy_score(pred, ytest)




0.02925

### With 'tanh' activation, the accuracy has also not shown any kind of improvement, rather it has gone down--->2.95%

## `We will merge some of the target classes and will see how it does improve accuracy`
## `The dataset has been loaded into "df'' previously`

In [23]:
df = pd.read_csv('roo_data.csv')
df.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Portal Administrator
2,71,86,91,87,61,81,72,72,94,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Portal Administrator
3,76,87,60,84,89,73,62,88,69,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Systems Security Administrator
4,92,62,90,67,71,89,73,71,73,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Business Systems Analyst


In [24]:
df['Suggested Job Role'].value_counts()

Network Security Administrator               1112
Network Security Engineer                     630
Network Engineer                              621
Project Manager                               602
Database Administrator                        593
Portal Administrator                          593
Information Technology Manager                591
Software Engineer                             590
UX Designer                                   589
Design & UX                                   588
Software Developer                            587
CRM Business Analyst                          584
Business Systems Analyst                      582
Database Developer                            581
Solutions Architect                           578
Software Systems Engineer                     575
Software Quality Assurance (QA) / Testing     571
Database Manager                              570
Web Developer                                 570
CRM Technical Developer                       567


In [25]:
# create an empty list to hold values of target label after clubbing
final_label = []
for x in df['Suggested Job Role']:
    if 'Engineer' in x:
        final_label.append('Engineers')
    elif 'Developer' in x:
        final_label.append('Engineers')
    elif 'Analyst' in x:
        final_label.append('Analyst')
    elif 'Manager' in x:
        final_label.append('Manager')
    elif 'Administrator' in x:
        final_label.append('Administrator')
    else:
        final_label.append(x)

In [26]:
df.drop('Suggested Job Role', axis=1, inplace=True)

In [27]:
df['Suggested Job Role'] = final_label

In [28]:
df['Suggested Job Role'].value_counts()

Engineers                                    6367
Analyst                                      3874
Administrator                                2860
Manager                                      1763
UX Designer                                   589
Design & UX                                   588
Solutions Architect                           578
Software Quality Assurance (QA) / Testing     571
Technical Support                             565
Quality Assurance Associate                   565
Data Architect                                564
Information Technology Auditor                558
Technical Services/Help Desk/Tech Support     558
Name: Suggested Job Role, dtype: int64

In [29]:
## again will  club some of the roles together
final_label=[]
for x in df['Suggested Job Role']:
    if 'Design' in x:
        final_label.append('Design')
    elif 'Architect' in x:
        final_label.append('SW Architect')
    elif 'Auditor' in x:
        final_label.append('SW Architect')
    elif 'Quality' in x:
        final_label.append('QA')
    elif 'Support' in x:
        final_label.append('Support')
    else:
        final_label.append(x)

In [30]:
##dropping old suggested job role column
df.drop('Suggested Job Role', axis=1, inplace= True)

In [31]:
df['Suggested Job Role'] = final_label

In [32]:
df['Suggested Job Role'].value_counts()

Engineers        6367
Analyst          3874
Administrator    2860
Manager          1763
SW Architect     1700
Design           1177
QA               1136
Support          1123
Name: Suggested Job Role, dtype: int64

## Now we do have 8 target labels

In [33]:
## We will encode the categorical features
##  we have already stored about nominal features earlier

for feature in feature_nominal:
    encode_feature = LabelEncoder()
    df[feature] = encode_feature.fit_transform(df[feature])

In [34]:
df.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,21,1,0,1,0,0,0,1,0,3
1,78,62,73,60,71,70,73,84,91,12,...,5,1,1,0,1,0,0,0,1,0
2,71,86,91,87,61,81,72,72,94,11,...,29,0,0,1,0,1,0,0,1,0
3,76,87,60,84,89,73,62,88,69,7,...,23,0,1,0,0,1,1,1,1,0
4,92,62,90,67,71,89,73,71,73,4,...,7,1,0,1,0,1,0,1,1,1


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   Acedamic percentage in Operating Systems  20000 non-null  int64
 1   percentage in Algorithms                  20000 non-null  int64
 2   Percentage in Programming Concepts        20000 non-null  int64
 3   Percentage in Software Engineering        20000 non-null  int64
 4   Percentage in Computer Networks           20000 non-null  int64
 5   Percentage in Electronics Subjects        20000 non-null  int64
 6   Percentage in Computer Architecture       20000 non-null  int64
 7   Percentage in Mathematics                 20000 non-null  int64
 8   Percentage in Communication skills        20000 non-null  int64
 9   Hours working per day                     20000 non-null  int64
 10  Logical quotient rating                   20000 non-null  

In [36]:
X  = df.drop('Suggested Job Role', axis=1)
Y = df['Suggested Job Role']

In [37]:
from sklearn.metrics import confusion_matrix

In [38]:
#split data
xtrain, xtest, ytrain, ytest = train_test_split(X,Y, test_size=0.15, random_state=21)

In [39]:
#let's make some predictions
classifier_model = MLPClassifier(hidden_layer_sizes=(150,200,100), max_iter=300, activation = 'logistic',solver='adam',
                                 random_state=21)
classifier_model.fit(xtrain, ytrain)
pred = classifier_model.predict(xtest)
print(accuracy_score(pred, ytest))
mat = confusion_matrix(ytest,pred)
print(mat)

0.31966666666666665
[[  0   0   0 431   0   0   0   0]
 [  0   0   0 585   0   0   0   0]
 [  0   0   0 163   0   0   0   0]
 [  0   0   0 959   0   0   0   0]
 [  0   0   0 254   0   0   0   0]
 [  0   0   0 176   0   0   0   0]
 [  0   0   0 247   0   0   0   0]
 [  0   0   0 185   0   0   0   0]]


## `The accuracy has improved drastically by reducing the class labels`

In [40]:
##Checking out with ReLu activation function
classifier_model = MLPClassifier(hidden_layer_sizes=(38,76,8), max_iter=300, activation = 'relu',solver='adam',
                                 random_state=21)
classifier_model.fit(xtrain, ytrain)
pred = classifier_model.predict(xtest)
print(accuracy_score(pred, ytest))
mat = confusion_matrix(ytest,pred)
print(mat)



0.31966666666666665
[[  0   0   0 431   0   0   0   0]
 [  0   0   0 585   0   0   0   0]
 [  0   0   0 163   0   0   0   0]
 [  0   0   0 959   0   0   0   0]
 [  0   0   0 254   0   0   0   0]
 [  0   0   0 176   0   0   0   0]
 [  0   0   0 247   0   0   0   0]
 [  0   0   0 185   0   0   0   0]]
