In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [14]:
df = pd.read_csv('dataset/data.csv')
df.columns

Index(['Timestamp', 'How old are you?', 'Industry', 'Job title',
       'Additional context on job title', 'Annual salary',
       'Other monetary comp', 'Currency', 'Currency - other',
       'Additional context on income', 'Country', 'State', 'City',
       'Overall years of professional experience',
       'Years of experience in field', 'Highest level of education completed',
       'Gender', 'Race'],
      dtype='object')

In [15]:
class Clean:
    
    def __init__(self,data):
        self.dataframe = data
        
    def do_work(self):

        self.dataframe = self.dataframe.fillna(-99)
    
        self.dataframe['Timestamp'] = pd.to_datetime(self.dataframe['Timestamp'], errors='coerce')

        self.dataframe = self.dataframe.dropna(subset=['Timestamp'])
        
        self.dataframe.loc[self.dataframe['Annual salary'].str.contains(',') == True, 'Annual salary'] = self.dataframe['Annual salary'].str.split(',').str[0]+''+self.dataframe['Annual salary'].str.split(',').str[1]

       
        remove_digits = ['Additional context on job title', 'City', 'Industry', 'Additional context on job title']

        for i in range(len(remove_digits)):

            self.dataframe[remove_digits[i]] = self.dataframe[remove_digits[i]].str.replace('\d+', '')

        
        convert_to_str = ['Currency - other', 'Additional context on job title', 'Additional context on income',
              'Country', 'State', 'City', 'Overall years of professional experience', 'Years of experience in field',
              'Highest level of education completed', 'Gender', 'Race']

        for i in range(len(convert_to_str)):
    
            self.dataframe[convert_to_str[i]] = self.dataframe[convert_to_str[i]].astype(str)
        
        self.dataframe['Other monetary comp'] = self.dataframe['Other monetary comp'].astype(int)
        
        self.dataframe['Currency - other'] = self.dataframe['Currency - other'].replace({'nan':-99})

        self.dataframe['Currency - other'] = self.dataframe['Currency - other'].replace({'':-99, '0':-99, 'nan': -99, "":-99})

        self.dataframe['Industry'] = self.dataframe['Industry'].replace({'':-99})

        self.dataframe['Additional context on job title'] = self.dataframe['Additional context on job title'].replace({'':-99,"":-99})
        
        self.dataframe['Additional context on job title'] = self.dataframe['Additional context on job title'].replace({'nan':-99})
        
        self.dataframe = self.dataframe.fillna(-99)
        
        return self.dataframe

In [16]:
cleaned_df = Clean(df).do_work()
cleaned_df.head(2)

Unnamed: 0,Timestamp,How old are you?,Industry,Job title,Additional context on job title,Annual salary,Other monetary comp,Currency,Currency - other,Additional context on income,Country,State,City,Overall years of professional experience,Years of experience in field,Highest level of education completed,Gender,Race
0,2021-04-27 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,-99,55000,0,USD,-99,-99,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,2021-04-27 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,-99,54600,4000,GBP,-99,-99,United Kingdom,-99,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White


In [17]:
class Labelling:
    
    def __init__(self,data):
        self.dataframe = data
    
    def model(self):
        
        from sklearn.preprocessing import LabelEncoder 
        import joblib
    
        col_1 = LabelEncoder()
        col_2 = LabelEncoder()
        col_3 = LabelEncoder()
        col_4 = LabelEncoder()
        col_5 = LabelEncoder()
        col_6 = LabelEncoder()
        col_7 = LabelEncoder()
        col_8 = LabelEncoder()
        col_9 = LabelEncoder()
        col_10 = LabelEncoder()
        col_11 = LabelEncoder()
        col_12 = LabelEncoder()
        col_13 = LabelEncoder()
        col_14 = LabelEncoder()

        convert_to_str = ['Other monetary comp', 'Industry', 'Currency - other', 'Additional context on job title', 'Additional context on income',
                          'Country', 'State', 'City', 'Overall years of professional experience', 'Years of experience in field',
                          'Highest level of education completed', 'Gender', 'Race']

        for i in range(len(convert_to_str)):
    
            self.dataframe[convert_to_str[i]] = self.dataframe[convert_to_str[i]].astype(str)

        labels = [col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14]

        le = ['How old are you?', 'Industry', 'Job title', 'Additional context on job title', 'Currency', 
              'Currency - other', 'Additional context on income', 'Country', 'State', 'City',
              'Overall years of professional experience','Years of experience in field',
              'Highest level of education completed', 'Race']

        le_name_mapping = dict(zip(labels, le))

        for i in range(len(le_name_mapping)):

            self.dataframe[list(le_name_mapping.values())[i]] = list(le_name_mapping.keys())[i].fit_transform(self.dataframe[list(le_name_mapping.values())[i]])

        for i in range(len(labels)):

            label_i = 'labels'+'/'+'Label_'+str(i)+'.sav'
            joblib.dump(labels[i], label_i)

        #labelling dependent variable
        main_label = LabelEncoder()
        self.dataframe['Gender'] = main_label.fit_transform(self.dataframe['Gender'])
        
        label_i = 'labels'+'/'+'Label_main.sav'
        joblib.dump(main_label, label_i)
        
        
        self.dataframe = self.dataframe.drop('Timestamp', axis=1)

        self.dataframe = self.dataframe.astype(int)
        
        return self.dataframe

In [18]:
labelled_df = Labelling(cleaned_df).model()
labelled_df.head(2)

Unnamed: 0,How old are you?,Industry,Job title,Additional context on job title,Annual salary,Other monetary comp,Currency,Currency - other,Additional context on income,Country,State,City,Overall years of professional experience,Years of experience in field,Highest level of education completed,Gender,Race
0,1,303,9478,142,55000,0,9,2,58,261,75,397,6,6,3,5,46
1,1,206,1871,142,54600,4000,4,2,58,245,0,552,7,6,1,2,46


In [19]:
#DATA PREPARATION

class Data_preparation:
    
    def __init__(self,data):
        self.dataframe = data
    
    def split(self):

        from imblearn.over_sampling import SMOTE
        from imblearn.under_sampling import RandomUnderSampler
        from sklearn.model_selection import train_test_split
        from sklearn.utils import shuffle
        from sklearn.preprocessing import StandardScaler
        from sklearn.metrics import confusion_matrix
        
        X = np.array(self.dataframe.drop(['Gender'], axis=1))
        y = np.array(self.dataframe['Gender'])

        under = RandomUnderSampler(sampling_strategy = {0: 100, 1: 70, 2: 90, 3: 40})
        X,y = under.fit_resample(X, y)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
        return X_train, X_test, y_train, y_test

In [20]:
X_train, X_test, y_train, y_test = Data_preparation(labelled_df).split()

In [21]:
print("X - Training set has {} samples.".format(X_train.shape[0]))
print("X - Testing set has {} samples.".format(X_test.shape[0]),'\n')
print("y - Training set has {} samples.".format(y_train.shape[0]))
print("y - Testing set has {} samples.".format(y_test.shape[0]))

X - Training set has 14712 samples.
X - Testing set has 6306 samples. 

y - Training set has 14712 samples.
y - Testing set has 6306 samples.


In [22]:
class Initial_models:
    
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def models(self):
        
        from sklearn import model_selection
        from sklearn.linear_model import LogisticRegression
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        from sklearn.naive_bayes import GaussianNB
        from sklearn.svm import SVC
        from sklearn.ensemble import RandomForestClassifier
        
        seed = 7
        models = []
        models.append(('LRM', LogisticRegression()))
        models.append(('LDA', LinearDiscriminantAnalysis()))
        models.append(('KNN', KNeighborsClassifier()))
        models.append(('DTC', DecisionTreeClassifier()))
        models.append(('GNB', GaussianNB()))
        models.append(('SVM', SVC()))
        models.append(('RFC', RandomForestClassifier()))
        models.append(('NNC', MLPClassifier()))
        
        results = []
        names = []

        scoring = 'accuracy'

        import warnings
        warnings.filterwarnings("ignore")

        for name, model in models:
            kfold = model_selection.KFold(n_splits=10)
            cv_results = model_selection.cross_val_score(model, self.X_train, self.y_train, cv=kfold, scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s:     %f      (%f)" % (name, cv_results.mean(), cv_results.std())
            print(msg)

In [23]:
Initial_models(X_train, X_test, y_train, y_test).models()

LRM:     0.985046      (0.003480)
LDA:     0.983007      (0.003372)
KNN:     0.985114      (0.003295)
DTC:     0.967442      (0.005840)
GNB:     0.013799      (0.003700)
SVM:     0.985250      (0.003204)
RFC:     0.984910      (0.003450)
NNC:     0.984638      (0.003163)


In [24]:
class Random_forest:
    
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def model(self):
        from sklearn.ensemble import RandomForestClassifier

        self.clf = RandomForestClassifier(n_estimators=1000, criterion='entropy', random_state=42)

        self.clf.fit(self.X_train, self.y_train)

        y_pred = self.clf.predict(self.X_test)

        from sklearn.metrics import accuracy_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix
        from sklearn.metrics import roc_auc_score

        # Evaluate accuracy
        print(confusion_matrix(self.y_test, y_pred))
        print(classification_report(self.y_test, y_pred))
        
        return self.clf

In [25]:
Random_forest(X_train, X_test, y_train, y_test).model()

[[   0    0    0    0   26]
 [   0    0    0    0   21]
 [   0    0    0    0   25]
 [   0    0    0    0   12]
 [   0    0    0    0 6222]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        21
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        12
           5       0.99      1.00      0.99      6222

    accuracy                           0.99      6306
   macro avg       0.20      0.20      0.20      6306
weighted avg       0.97      0.99      0.98      6306



RandomForestClassifier(criterion='entropy', n_estimators=1000, random_state=42)

In [26]:
class StackingClassifier:
    
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def model(self):
        from sklearn import model_selection
        from sklearn.linear_model import LogisticRegression
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        from sklearn.naive_bayes import GaussianNB
        from sklearn.svm import SVC
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.ensemble import StackingClassifier

        estimators = [
                ('RFC', RandomForestClassifier(random_state=42)),
                ('SVM', LogisticRegression()),
                ('DTC', DecisionTreeClassifier()),
                ('KNN', KNeighborsClassifier()),
                ('NNC', MLPClassifier())
        ]

        self.clf = StackingClassifier(estimators=estimators, final_estimator=SVC())

        self.clf.fit(self.X_train, self.y_train).score(self.X_test, self.y_test)

        y_pred = self.clf.predict(self.X_test)

        from sklearn.metrics import accuracy_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix
        from sklearn.metrics import roc_auc_score

        # Evaluate accuracy
        print(confusion_matrix(self.y_test, y_pred))
        print(classification_report(self.y_test, y_pred))
        
        return self.clf

In [27]:
StackingClassifier(X_train, X_test, y_train, y_test).model()

[[   0    0    0    0   26]
 [   0    0    0    0   21]
 [   0    0    0    0   25]
 [   0    0    0    0   12]
 [   0    0    0    0 6222]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        21
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        12
           5       0.99      1.00      0.99      6222

    accuracy                           0.99      6306
   macro avg       0.20      0.20      0.20      6306
weighted avg       0.97      0.99      0.98      6306



StackingClassifier(estimators=[('RFC', RandomForestClassifier(random_state=42)),
                               ('SVM', LogisticRegression()),
                               ('DTC', DecisionTreeClassifier()),
                               ('KNN', KNeighborsClassifier()),
                               ('NNC', MLPClassifier())],
                   final_estimator=SVC())

In [28]:
class OneVsOneClassifier:
    
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def model(self):
    
        from sklearn.multiclass import OneVsOneClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.metrics import accuracy_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix
        
        self.clf = OneVsOneClassifier(MLPClassifier(hidden_layer_sizes = [100]*5))
        self.clf.fit(self.X_train, self.y_train)
        dnns_predictions_labels = self.clf.predict(self.X_test)

        # Evaluate accuracy
        print(confusion_matrix(self.y_test, dnns_predictions_labels))
        print(classification_report(self.y_test, dnns_predictions_labels))
        
        return self.clf

In [29]:
OneVsOneClassifier(X_train, X_test, y_train, y_test).model()

[[   0    0    0    0   26]
 [   0    0    0    0   21]
 [   0    0    0    0   25]
 [   0    0    0    0   12]
 [  24   10   17    3 6168]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        21
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        12
           5       0.99      0.99      0.99      6222

    accuracy                           0.98      6306
   macro avg       0.20      0.20      0.20      6306
weighted avg       0.97      0.98      0.98      6306



OneVsOneClassifier(estimator=MLPClassifier(hidden_layer_sizes=[100, 100, 100,
                                                               100, 100]))

In [30]:
class OneVsRestClassifier:
    
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
    
    def model(self):
        
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.metrics import accuracy_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix
        
        self.clf = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes = [100]*5))
        self.clf.fit(self.X_train, self.y_train)
        dnns_predictions_labels = self.clf.predict(self.X_test)

        # Evaluate accuracy
        print(confusion_matrix(self.y_test, dnns_predictions_labels))
        print(classification_report(self.y_test, dnns_predictions_labels))
        
        return self.clf

In [31]:
OneVsRestClassifier(X_train, X_test, y_train, y_test).model()

[[   0    0    0    0   26]
 [   0    0    0    0   21]
 [   0    0    0    0   25]
 [   0    0    0    0   12]
 [  12    4   11    1 6194]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        21
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        12
           5       0.99      1.00      0.99      6222

    accuracy                           0.98      6306
   macro avg       0.20      0.20      0.20      6306
weighted avg       0.97      0.98      0.98      6306



OneVsRestClassifier(estimator=MLPClassifier(hidden_layer_sizes=[100, 100, 100,
                                                                100, 100]))

In [32]:
final_model = StackingClassifier(X_train, X_test, y_train, y_test).model()

[[   0    0    0    0   26]
 [   0    0    0    0   21]
 [   0    0    0    0   25]
 [   0    0    0    0   12]
 [   0    0    0    0 6222]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        21
           2       0.00      0.00      0.00        25
           3       0.00      0.00      0.00        12
           5       0.99      1.00      0.99      6222

    accuracy                           0.99      6306
   macro avg       0.20      0.20      0.20      6306
weighted avg       0.97      0.99      0.98      6306



In [33]:
new = labelled_df.drop(['Gender'], axis=1)
newColumns = list(new.columns)

In [34]:
model_columns = newColumns

import joblib

Final_Model = 'models/Final-Model.sav'
joblib.dump(final_model, Final_Model)

Model_columns = 'models/Final-Model-Columns.sav'
joblib.dump(model_columns, Model_columns)

['models/Final-Model-Columns.sav']