# 2017 Stack Overflow Developer Survey MultiClass Classification (SciKit Learn)

In [1]:
import pandas as pd
import numpy as np
import time
import stack_data
from sklearn.model_selection import train_test_split

SHOW_DISPLAY = False

In [2]:
 # Fetch the data
raw_data = stack_data.get_data()

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(51392, 8)


In [3]:
# Remove all rows with no label values
raw_data = raw_data.dropna(subset=[stack_data.LABEL_NAME], how='all')

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(36125, 8)


In [4]:
# Iterate all rows and drop ones with MultiLabel, effectively
# turning this into a MultiClass problem.
expanded_data = []
for (idx, row) in raw_data.iterrows():
    # Check for delimiter
    split = [x.strip() for x in row.loc[stack_data.LABEL_NAME].split(';')]
    if len(split) is 1:
        expanded_data.append(row)
        
raw_data = pd.DataFrame(expanded_data).reset_index(drop=True)

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


In [5]:
# Encoding categorical data
# TODO: One-Hot Encode vector columns
from sklearn import preprocessing
def label_encode(df, columns):
    for col in columns:
        le = preprocessing.LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
 
        col_values = list(df[col].values)
        le.classes_
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed
 
to_be_encoded_cols = raw_data.columns.values
label_encode(raw_data, to_be_encoded_cols)

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


In [6]:
# Split to train and test data.
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
train, test = train_test_split(raw_data, train_size = 0.8, test_size = 0.2)
# train.to_csv(CLEAN_PATH + '/train_survey_results_public.csv', index=False)
# test.to_csv(CLEAN_PATH + '/test_survey_results_public.csv', index=False)
if SHOW_DISPLAY:
    display(train.head())
    display(test.head())

In [7]:
# 1. Assign the DataFrame's labels (the right-most column) to train_label.
# 2. Delete (pop) the labels from the DataFrame.
# 3. Assign the remainder of the DataFrame to train_features
X_train, Y_train = train, train.pop(stack_data.LABEL_NAME)
X_test, Y_test = test, test.pop(stack_data.LABEL_NAME)

from sklearn import linear_model, neighbors, svm, ensemble, tree, neural_network, naive_bayes

dict_classifiers = {
    "Logistic Regression": linear_model.LogisticRegression()
    "Nearest Neighbors": neighbors.KNeighborsClassifier(),
    "Linear SVM": svm.SVC(),
    "Gradient Boosting Classifier": ensemble.GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": ensemble.RandomForestClassifier(n_estimators = 18),
    "Neural Net": neural_network.MLPClassifier(alpha = 1),
    "Naive Bayes": naive_bayes.GaussianNB()
}

SyntaxError: invalid syntax (<ipython-input-7-ccf72fbe7804>, line 11)

In [None]:
no_classifiers = len(dict_classifiers.keys())

def batch_classify(X_train, Y_train, X_test, Y_test, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'test_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.clock()
        classifier.fit(X_train, Y_train)
        t_end = time.clock()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'test_score'] = test_score
        df_results.loc[count,'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count+=1
    return df_results

In [None]:
df_results = batch_classify(X_train, Y_train, X_test, Y_test)
display(df_results.sort_values(by='test_score', ascending=False))