# Clean 2017 Stack Overflow developer results for multi-class classification


In [1]:
import os
import sys
import zipfile
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

sys.path.insert(0,os.path.join(os.getcwd(), os.pardir,  'src', 'data'))
import stack_data

SHOW_DISPLAY = True

In [2]:
 # Fetch the data
# stack_data is the project-local helper module put on sys.path in the first
# cell; get_data() is not shown here. The result is used like a pandas
# DataFrame below (.shape / .head()).
raw_data = stack_data.get_data()

# Report the size of the loaded data and, when enabled, preview the first rows.
print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(51392, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,Student,"Yes, both",United States,No,Secondary school,,2 to 3 years,
1,Student,"Yes, both",United Kingdom,"Yes, full-time",Some college/university study without earning ...,Computer science or software engineering,9 to 10 years,
2,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
3,Professional non-developer who sometimes write...,"Yes, both",United States,No,Doctoral degree,A non-computer-focused engineering discipline,14 to 15 years,
4,Professional developer,"Yes, I program as a hobby",Switzerland,No,Master's degree,Computer science or software engineering,20 or more years,Mobile developer; Graphics programming; Deskto...


In [3]:
# Remove all rows with no label values.
# Only the label column is listed in `subset`, so a row is dropped solely
# because its label is missing; missing values in feature columns are kept.
raw_data = raw_data.dropna(subset=[stack_data.LABEL_NAME], how='all')

# Report the remaining size and, when enabled, preview the first rows.
print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(36125, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
2,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
4,Professional developer,"Yes, I program as a hobby",Switzerland,No,Master's degree,Computer science or software engineering,20 or more years,Mobile developer; Graphics programming; Deskto...
7,Professional developer,"Yes, both",Poland,No,Master's degree,Computer science or software engineering,10 to 11 years,Web developer
8,Professional developer,"Yes, I program as a hobby",Colombia,"Yes, part-time",Bachelor's degree,Computer science or software engineering,13 to 14 years,Web developer; Mobile developer
9,Professional developer,"Yes, I program as a hobby",France,"Yes, full-time",Master's degree,Computer science or software engineering,13 to 14 years,Mobile developer; Desktop applications developer


In [4]:
# Keep only the rows whose label names a single developer type, effectively
# turning this multi-label answer into a multi-class target.
#
# A multi-label answer separates its entries with ';', so a row is
# single-label exactly when its label contains no ';'.  The boolean mask
# replaces the earlier row-by-row loop, whose `len(split) is 1` test compared
# an int by identity (a SyntaxWarning on Python 3.8+ and only correct because
# CPython caches small ints).
# `regex=False` matches ';' literally; `na=True` marks a missing label as
# "not single-label" so such a row is dropped instead of raising.
has_multiple_labels = raw_data[stack_data.LABEL_NAME].str.contains(
    ';', regex=False, na=True)
raw_data = raw_data[~has_multiple_labels].reset_index(drop=True)

print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
1,Professional developer,"Yes, both",Poland,No,Master's degree,Computer science or software engineering,10 to 11 years,Web developer
2,Professional developer,No,Canada,No,Bachelor's degree,Computer science or software engineering,13 to 14 years,Web developer
3,Professional developer,"Yes, both",Germany,No,Some college/university study without earning ...,Computer science or software engineering,15 to 16 years,Web developer
4,Professional developer,"Yes, I program as a hobby",United Kingdom,No,Professional degree,Computer engineering or electrical/electronics...,20 or more years,Embedded applications/devices developer


In [5]:
# Encoding categorical data
def label_encode(df, columns):
    """Replace each listed column of ``df`` with integer codes, in place.

    A fresh ``LabelEncoder`` is fitted per column on that column's own unique
    values and then used to transform the whole column.

    Args:
        df: DataFrame to modify; every listed column is overwritten.
        columns: Iterable of column names to encode.

    Returns:
        None. The fitted encoders are discarded, so the integer codes cannot
        be mapped back to the original values afterwards.
    """
    for col in columns:
        encoder = preprocessing.LabelEncoder()
        # NOTE(review): the values are deliberately still handed over as plain
        # Python lists, exactly as before. Converting a list lets NumPy coerce
        # mixed str/NaN content to strings, which may be what allows columns
        # with missing values to encode -- confirm before switching to
        # fit_transform on the Series itself.
        encoder.fit(list(df[col].unique()))
        df[col] = encoder.transform(list(df[col].values))
 
# Every column of raw_data is selected, so the label column is integer-encoded
# together with the features. label_encode modifies raw_data in place.
to_be_encoded_cols = raw_data.columns.values
label_encode(raw_data, to_be_encoded_cols)

# The shape is unchanged by encoding; preview the encoded rows when enabled.
print(raw_data.shape)
if SHOW_DISPLAY:
    display(raw_data.head())

(16747, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,0,3,143,1,0,8,12,10
1,0,3,106,1,4,8,1,13
2,0,0,23,1,0,8,4,13
3,0,3,45,1,8,8,6,13
4,0,2,143,1,6,6,12,5


In [6]:
# Split to train and test data (80% / 20%).
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
#
# A fixed seed makes the row assignment identical on every
# "Restart Kernel -> Run All"; without it each run trains and evaluates on a
# different split, so reported accuracies are not comparable between runs.
SPLIT_SEED = 42
train, test = train_test_split(
    raw_data, train_size=0.8, test_size=0.2, random_state=SPLIT_SEED)
if SHOW_DISPLAY:
    display(train.head())
    display(test.head())

Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
5514,0,0,41,1,4,6,7,0
6994,0,0,143,3,0,8,11,2
13824,0,3,144,1,4,3,12,13
105,0,2,7,1,0,8,4,13
11376,0,2,107,1,0,8,3,13


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
10280,0,0,58,1,0,8,15,13
12489,0,2,129,1,4,8,12,13
15856,0,2,143,1,8,10,12,13
3963,0,3,45,1,8,9,12,13
14114,0,2,144,1,0,8,12,13


In [7]:
# Separate the label column from the feature columns for both splits:
#   X_* holds every column except the label, Y_* holds the label column only.
#
# drop() returns a new frame and leaves `train` / `test` untouched, so this
# cell can be re-run safely. The earlier `train.pop(...)` removed the label in
# place (a second run raised KeyError) and relied on X_train aliasing the
# mutated `train` object.
X_train = train.drop(stack_data.LABEL_NAME, axis=1)
Y_train = train[stack_data.LABEL_NAME]
X_test = test.drop(stack_data.LABEL_NAME, axis=1)
Y_test = test[stack_data.LABEL_NAME]

In [11]:
# Build one numeric feature column per column of X_train.
# NOTE(review): the inputs are label-encoded category ids, so a numeric
# column makes the network read them as ordered magnitudes -- consider
# categorical feature columns instead; behaviour is left unchanged here.
my_feature_columns = [
    tf.feature_column.numeric_column(key=name) for name in X_train.keys()
]

if SHOW_DISPLAY:
    display(my_feature_columns)

[_NumericColumn(key='Professional', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ProgramHobby', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Country', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='University', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='FormalEducation', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='MajorUndergrad', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='YearsProgram', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [12]:
# Instantiate DNNClassifier
# No model_dir is given, so checkpoints go to a temporary directory (see the
# '_model_dir' entry in the config log printed by this cell).
classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        # Two hidden layers with 10 units each.
        hidden_units=[10, 10],
        # NOTE(review): hard-coded class count; the encoded labels displayed
        # above go up to 13, which is consistent with 14 classes -- confirm
        # against the actual number of distinct labels if the data changes.
        n_classes=14)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\vagrant\\AppData\\Local\\Temp\\tmp2kpis237', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001F456A92208>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [13]:
# Train model
# Number of examples per batch; passed to both the training and the
# evaluation input functions.
BATCH_SIZE = 100
# Number of training steps requested from classifier.train.
TRAIN_STEPS = 1000

def train_input_fn(features, labels, batch_size):
    """Build the shuffled, endlessly repeating, batched training dataset.

    Args:
        features: Mapping-like object convertible with ``dict()`` whose keys
            are feature names (a DataFrame is passed in this notebook).
        labels: Label values aligned with ``features``.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches.
    """
    # One dataset element per example: ({feature name: value}, label).
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle with a 1000-element buffer, then repeat indefinitely so that
    # training length is governed by the `steps` argument of train().
    shuffled = examples.shuffle(1000)
    endless = shuffled.repeat()

    return endless.batch(batch_size)

def _train_input():
    """Zero-argument input function handing the training split to the estimator."""
    return train_input_fn(X_train, Y_train, BATCH_SIZE)


classifier.train(input_fn=_train_input, steps=TRAIN_STEPS)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\vagrant\AppData\Local\Temp\tmp2kpis237\model.ckpt.
INFO:tensorflow:loss = 1466.5095, step = 1
INFO:tensorflow:global_step/sec: 1024.13
INFO:tensorflow:loss = 125.505356, step = 101 (0.113 sec)
INFO:tensorflow:global_step/sec: 874.889
INFO:tensorflow:loss = 133.00778, step = 201 (0.101 sec)
INFO:tensorflow:global_step/sec: 988.847
INFO:tensorflow:loss = 139.04549, step = 301 (0.107 sec)
INFO:tensorflow:global_step/sec: 968.268
INFO:tensorflow:loss = 130.1328, step = 401 (0.095 sec)
INFO:tensorflow:global_step/sec: 996.324
INFO:tensorflow:loss = 146.99896, step = 501 (0.100 sec)
INFO:tensorflow:global_step/sec: 924.711
INFO:tensorflow:loss = 131.14314, step = 601 (0.111 sec)
INFO:tensorflow:global_step/sec: 1004.62
INFO:tensorflow:loss = 120.48557, step = 701 (0.100 sec)
INFO:tensorflow:global_step/sec: 892.739
INFO:tensorflow:loss = 139.02136, step = 801 (0.112 sec)
INFO:tensorflow:global_

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1f456a92080>

In [16]:
 # Evaluate the model.
def eval_input_fn(features, labels, batch_size):
    """Build a batched dataset for evaluation or prediction.

    Args:
        features: Mapping-like object convertible with ``dict()`` whose keys
            are feature names.
        labels: Label values aligned with ``features``, or ``None`` to build
            a features-only dataset.
        batch_size: Number of examples per batch; must not be ``None``.

    Returns:
        A ``tf.data.Dataset`` of batches: features only when ``labels`` is
        ``None``, otherwise ``(features, labels)`` pairs.
    """
    feature_dict = dict(features)

    # Without labels the dataset carries features only; with labels it
    # carries (features, label) pairs.
    tensors = feature_dict if labels is None else (feature_dict, labels)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    # Unlike the training input there is no shuffle or repeat here.
    assert batch_size is not None, "batch_size must not be None"
    return dataset.batch(batch_size)

    
def _eval_input():
    """Zero-argument input function handing the test split to the estimator."""
    return eval_input_fn(X_test, Y_test, BATCH_SIZE)


# evaluate() returns a dict of metrics; its 'accuracy' entry is reported below.
eval_result = classifier.evaluate(input_fn=_eval_input)

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

INFO:tensorflow:Starting evaluation at 2018-02-19-19:41:38
INFO:tensorflow:Restoring parameters from C:\Users\vagrant\AppData\Local\Temp\tmp2kpis237\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-02-19-19:41:39
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.6391045, average_loss = 1.367791, global_step = 1000, loss = 134.76765

Test set accuracy: 0.639

