# 2017 Stack Overflow Developer Survey MultiClass Classification (SciKit Learn)

In [1]:
import pandas as pd
import numpy as np
import time
import sys
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model, neighbors, svm, ensemble, tree, neural_network, naive_bayes

sys.path.insert(0,os.path.join(os.getcwd(), os.pardir,  'src', 'data'))
import stack_data

sys.path.insert(0,os.path.join(os.getcwd(), os.pardir,  'src', 'features'))
import encoded

SHOW_DISPLAY = True

In [2]:
 # Fetch the data
raw_data = stack_data.get_data()

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(51392, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,Student,"Yes, both",United States,No,Secondary school,,2 to 3 years,
1,Student,"Yes, both",United Kingdom,"Yes, full-time",Some college/university study without earning ...,Computer science or software engineering,9 to 10 years,
2,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
3,Professional non-developer who sometimes write...,"Yes, both",United States,No,Doctoral degree,A non-computer-focused engineering discipline,14 to 15 years,
4,Professional developer,"Yes, I program as a hobby",Switzerland,No,Master's degree,Computer science or software engineering,20 or more years,Mobile developer; Graphics programming; Deskto...


In [3]:
raw_data = encoded.get_encoded(raw_data, stack_data.LABEL_NAME)
print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,0,3,143,1,0,8,12,10
1,0,3,106,1,4,8,1,13
2,0,0,23,1,0,8,4,13
3,0,3,45,1,8,8,6,13
4,0,2,143,1,6,6,12,5


In [4]:
# Split to train and test data.
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
train, test = train_test_split(raw_data, train_size = 0.8, test_size = 0.2)
if SHOW_DISPLAY:
    display(train.head())
    display(test.head())

Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
2499,0,3,144,1,0,3,14,13
3263,0,0,45,1,4,8,12,13
5717,0,2,117,1,4,8,1,13
15414,0,2,144,1,0,8,14,9
15484,0,2,144,1,8,8,15,13


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
12425,0,3,25,1,0,8,14,13
9988,0,3,19,3,0,8,8,13
9289,0,2,144,1,0,13,1,0
354,0,2,106,3,0,8,13,10
1544,0,1,61,1,0,8,12,13


In [5]:
# 1. Assign the DataFrame's labels (the right-most column) to train_label.
# 2. Delete (pop) the labels from the DataFrame.
# 3. Assign the remainder of the DataFrame to train_features
X_train, Y_train = train, train.pop(stack_data.LABEL_NAME)
X_test, Y_test = test, test.pop(stack_data.LABEL_NAME)

In [6]:
# Use to iterate approach
dict_classifiers = {
    "Logistic Regression": linear_model.LogisticRegression(),
    "Nearest Neighbors": neighbors.KNeighborsClassifier(),
    "Linear SVM": svm.SVC(),
    "Gradient Boosting Classifier": ensemble.GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": ensemble.RandomForestClassifier(n_estimators = 18),
    "Neural Net": neural_network.MLPClassifier(alpha = 1),
    "Naive Bayes": naive_bayes.GaussianNB()
}

In [7]:
# Batch process classification function
no_classifiers = len(dict_classifiers.keys())
def batch_classify(X_train, Y_train, X_test, Y_test, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'test_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.clock()
        classifier.fit(X_train, Y_train)
        t_end = time.clock()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'test_score'] = test_score
        df_results.loc[count,'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count+=1
    return df_results

In [8]:
# Run classification sequence
df_results = batch_classify(X_train, Y_train, X_test, Y_test)
display(df_results.sort_values(by='test_score', ascending=False))

trained Logistic Regression in 0.48 s
trained Nearest Neighbors in 0.01 s
trained Linear SVM in 20.69 s
trained Gradient Boosting Classifier in 8.13 s
trained Decision Tree in 0.02 s
trained Random Forest in 0.13 s
trained Neural Net in 1.15 s
trained Naive Bayes in 0.00 s


Unnamed: 0,classifier,train_score,test_score,training_time
3,Gradient Boosting Classifier,0.645816,0.647164,8.134613
0,Logistic Regression,0.635739,0.646567,0.480853
6,Neural Net,0.635739,0.646567,1.154296
7,Naive Bayes,0.632978,0.643284,0.003534
2,Linear SVM,0.653654,0.642687,20.688854
5,Random Forest,0.841382,0.571642,0.126634
1,Nearest Neighbors,0.649847,0.565075,0.012855
4,Decision Tree,0.845712,0.482388,0.019403
