# 2017 Stack Overflow Developer Survey MultiClass Classification (SciKit Learn)

In [1]:
import pandas as pd
import numpy as np
import time
import sys
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model, neighbors, svm, ensemble, tree, neural_network, naive_bayes

sys.path.insert(0,os.path.join(os.getcwd(), os.pardir,  'src', 'data'))
import stack_data

SHOW_DISPLAY = False

In [2]:
 # Fetch the data
raw_data = stack_data.get_data()

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(51392, 8)


In [3]:
# Remove all rows with no label values
raw_data = raw_data.dropna(subset=[stack_data.LABEL_NAME], how='all')

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(36125, 8)


In [4]:
# Iterate all rows and drop ones with MultiLabel, effectively
# turning this into a MultiClass problem.
expanded_data = []
for (idx, row) in raw_data.iterrows():
    # Check for delimiter
    split = [x.strip() for x in row.loc[stack_data.LABEL_NAME].split(';')]
    if len(split) is 1:
        expanded_data.append(row)
        
raw_data = pd.DataFrame(expanded_data).reset_index(drop=True)

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


In [5]:
# Encoding categorical data
def label_encode(df, columns):
    for col in columns:
        le = preprocessing.LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
 
        col_values = list(df[col].values)
        le.classes_
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed
 
to_be_encoded_cols = raw_data.columns.values
label_encode(raw_data, to_be_encoded_cols)

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


In [6]:
# Split to train and test data.
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
train, test = train_test_split(raw_data, train_size = 0.8, test_size = 0.2)
if SHOW_DISPLAY:
    display(train.head())
    display(test.head())

In [7]:
# 1. Assign the DataFrame's labels (the right-most column) to train_label.
# 2. Delete (pop) the labels from the DataFrame.
# 3. Assign the remainder of the DataFrame to train_features
X_train, Y_train = train, train.pop(stack_data.LABEL_NAME)
X_test, Y_test = test, test.pop(stack_data.LABEL_NAME)

In [None]:
# Use to iterate approach
dict_classifiers = {
    "Logistic Regression": linear_model.LogisticRegression(),
    "Nearest Neighbors": neighbors.KNeighborsClassifier(),
    "Linear SVM": svm.SVC(),
    "Gradient Boosting Classifier": ensemble.GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": ensemble.RandomForestClassifier(n_estimators = 18),
    "Neural Net": neural_network.MLPClassifier(alpha = 1),
    "Naive Bayes": naive_bayes.GaussianNB()
}

In [8]:
# Batch process classification function
no_classifiers = len(dict_classifiers.keys())
def batch_classify(X_train, Y_train, X_test, Y_test, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'test_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.clock()
        classifier.fit(X_train, Y_train)
        t_end = time.clock()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'test_score'] = test_score
        df_results.loc[count,'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count+=1
    return df_results

In [9]:
# Run classification sequence
df_results = batch_classify(X_train, Y_train, X_test, Y_test)
display(df_results.sort_values(by='test_score', ascending=False))

trained Logistic Regression in 0.47 s
trained Nearest Neighbors in 0.01 s
trained Linear SVM in 21.71 s
trained Gradient Boosting Classifier in 9.00 s
trained Decision Tree in 0.02 s
trained Random Forest in 0.15 s
trained Neural Net in 0.66 s
trained Naive Bayes in 0.01 s


Unnamed: 0,classifier,train_score,test_score,training_time
3,Gradient Boosting Classifier,0.645667,0.643582,8.998906
0,Logistic Regression,0.636784,0.642388,0.473581
6,Neural Net,0.636784,0.642388,0.664642
7,Naive Bayes,0.636784,0.642388,0.005346
2,Linear SVM,0.655296,0.639701,21.705156
5,Random Forest,0.840935,0.568358,0.152223
1,Nearest Neighbors,0.649026,0.564179,0.013094
4,Decision Tree,0.84616,0.483881,0.018688
