# Clean 2017 Stack Overflow developer results for multi-class classification


In [1]:
import os
import sys
import zipfile
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Put <cwd>/../src/data at the front of the module search path so that the
# import on the next line can find stack_data; the insert must run first.
sys.path.insert(0,os.path.join(os.getcwd(), os.pardir,  'src', 'data'))
import stack_data

# When True, the cells below also render DataFrame previews via display();
# when False only the shapes are printed.
SHOW_DISPLAY = False

In [2]:
 # Fetch the data
raw_data = stack_data.get_data()

# Report the dimensions of the loaded data; optionally preview the first rows.
print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(51392, 8)


In [3]:
# Keep only the rows whose label column actually holds a value.
has_label = raw_data[stack_data.LABEL_NAME].notna()
raw_data = raw_data[has_label]

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(36125, 8)


In [4]:
# Keep only the rows that carry exactly one label, effectively turning this
# from a MultiLabel into a MultiClass problem. Labels are ';'-delimited, so a
# row is single-label when splitting on ';' yields exactly one piece.
#
# The count is compared with `==`: `is` tests object identity, which is not a
# reliable way to compare integers. The check is done column-wise rather than
# with iterrows(), which also keeps the column dtypes and the column set
# intact even when no row matches.
labels_per_row = raw_data[stack_data.LABEL_NAME].str.split(';').str.len()
raw_data = raw_data[labels_per_row == 1].reset_index(drop=True)

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


In [5]:
# Encoding categorical data
def label_encode(df, columns):
    """Replace each listed column of ``df`` with integer label codes, in place.

    A fresh ``LabelEncoder`` is fitted for every column, so the codes of one
    column are independent of those of another. The fitted encoders are not
    kept, so the mapping back to the original values is discarded.

    Args:
        df: DataFrame to modify; every column in ``columns`` is overwritten.
        columns: Iterable of column names present in ``df``.

    Returns:
        None. ``df`` is mutated.
    """
    for col in columns:
        encoder = preprocessing.LabelEncoder()
        # Fitting on the full column learns the same classes as fitting on its
        # unique values, so one fit_transform call replaces the fit/transform
        # pair. The values still go through a plain list so the encoder sees
        # exactly the input it did before.
        df[col] = encoder.fit_transform(list(df[col].values))
 
# Encode every column of the frame (the label column included); label_encode
# overwrites the columns of raw_data in place.
to_be_encoded_cols = raw_data.columns.values
label_encode(raw_data, to_be_encoded_cols)

# Encoding replaces values only, so the shape is expected to be unchanged.
print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


In [6]:
# Split to train (80%) and test (20%) data.
# The seed is fixed so that re-running the notebook yields the same partition;
# without it every run shuffles differently and results are not comparable.
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
RANDOM_SEED = 42
train, test = train_test_split(
    raw_data,
    train_size=0.8,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
if SHOW_DISPLAY:
    display(train.head())
    display(test.head())

In [7]:
# Separate the labels from the features for both partitions.
# pop() removes the label column from the frame and returns it, so after the
# call the frame itself holds only the feature columns; X_* is therefore the
# very same (now label-free) object as train / test.
Y_train = train.pop(stack_data.LABEL_NAME)
X_train = train

Y_test = test.pop(stack_data.LABEL_NAME)
X_test = test

In [8]:
# Build one numeric feature column per key of the training features.
my_feature_columns = [
    tf.feature_column.numeric_column(key=feature_name)
    for feature_name in X_train.keys()
]