# Clean 2017 Stack Overflow devloper results for multi-class classification


In [1]:
import os
import sys
import zipfile
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

ARCHIVE_PATH = 'data/developer_survey_2017.zip'
RESULTS_PATH = 'data/unpacked/survey_results_public.csv'
SCHEMA_PATH = 'data/unpacked/survey_results_schema.csv'
CLEAN_PATH = 'data/cleaned'
CACHE_PATH = 'data/cache'
FEATURE_COLUMNS = [
    'Professional',
    'ProgramHobby',
    'Country',
    'University',
    'FormalEducation',
    'MajorUndergrad',
    'YearsProgram'
]
LABEL_NAME = 'DeveloperType'

In [2]:
# Unpack Archive if not already unpacked
if os.path.isfile(RESULTS_PATH) is not True:
    zip_ref = zipfile.ZipFile(ARCHIVE_PATH, 'r')
    zip_ref.extractall('data/unpacked')
    zip_ref.close()

In [3]:
# Create folder structure
if not os.path.exists(CLEAN_PATH):
    os.makedirs(CLEAN_PATH)

if not os.path.exists(CACHE_PATH):
    os.makedirs(CACHE_PATH)

In [4]:
# Parse the local CSV file.
names = FEATURE_COLUMNS
names.append(LABEL_NAME)
if not os.path.isfile('data/cache/cached_survey_results_public.csv'):
    raw_data = pd.read_csv(
        filepath_or_buffer==RESULTS_PATH,
        header=0,
        low_memory=False
    )
    raw_data = raw_data.loc[:, names]
    raw_data = raw_data.to_csv('data/cache/cached_survey_results_public.csv', index=False)
else:
    raw_data = pd.read_csv('data/cache/cached_survey_results_public.csv', 
                           low_memory=False)

In [5]:
print(raw_data.shape)
display(raw_data)

(51392, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,Student,"Yes, both",United States,No,Secondary school,,2 to 3 years,
1,Student,"Yes, both",United Kingdom,"Yes, full-time",Some college/university study without earning ...,Computer science or software engineering,9 to 10 years,
2,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
3,Professional non-developer who sometimes write...,"Yes, both",United States,No,Doctoral degree,A non-computer-focused engineering discipline,14 to 15 years,
4,Professional developer,"Yes, I program as a hobby",Switzerland,No,Master's degree,Computer science or software engineering,20 or more years,Mobile developer; Graphics programming; Deskto...
5,Student,"Yes, both",New Zealand,"Yes, full-time",Secondary school,,6 to 7 years,
6,Professional non-developer who sometimes write...,"Yes, both",United States,No,Master's degree,A non-computer-focused engineering discipline,9 to 10 years,
7,Professional developer,"Yes, both",Poland,No,Master's degree,Computer science or software engineering,10 to 11 years,Web developer
8,Professional developer,"Yes, I program as a hobby",Colombia,"Yes, part-time",Bachelor's degree,Computer science or software engineering,13 to 14 years,Web developer; Mobile developer
9,Professional developer,"Yes, I program as a hobby",France,"Yes, full-time",Master's degree,Computer science or software engineering,13 to 14 years,Mobile developer; Desktop applications developer


In [6]:
# Remove all rows with no label values
raw_data = raw_data.dropna(subset=[LABEL_NAME], how='all')
print(raw_data.shape)
# display(raw_data)

(36125, 8)


In [7]:
# Exapnd rows that have multi labels into new records
# There is a total possiblity of 14 labels per row
# We are going to strip any multi-label data point and treat this as a
# multi-class classification problem as opposed to a multi-lable classification problem
# TODO: Revisit this and create a multi-lable classification solution
expanded_data = []
for (idx, row) in raw_data.iterrows():
    # Check for delimiter
    split = [x.strip() for x in row.loc[LABEL_NAME].split(';')]
    # Where did we come up with the magic number 5? See the comments above.
    #if len(split) > 1:
        # TODO: Use when we move to a multi-label classification
        # expand deliminated values into uinque rows
        # for label in split:
            # new_row = row.copy()
            # new_row[LABEL_NAME]= label
            # expanded_data.append(new_row)
    if len(split) is 1:
        expanded_data.append(row)
        
raw_data = pd.DataFrame(expanded_data).reset_index(drop=True)
print(raw_data.shape)
# display(raw_data)

(16747, 8)


In [8]:
# One-Hot Encode vector columns
for column in raw_data:
    raw_data[column] = pd.get_dummies(raw_data[column]).values.tolist()

print(raw_data.shape)

(16747, 8)


In [9]:
# Output clean data
clean_path = '/cleaned_survey_results_public.csv'
raw_data.to_csv(CLEAN_PATH + clean_path, index=False)

In [10]:
# Split to train and test data.
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
train, test = train_test_split(raw_data, train_size = 0.8, test_size = 0.2)
train.to_csv(CLEAN_PATH + '/train_survey_results_public.csv', index=False)
test.to_csv(CLEAN_PATH + '/test_survey_results_public.csv', index=False)
print(train.shape)
print(test.shape)

(13397, 8)
(3350, 8)


In [11]:
# So, in regards to ^. We might want to consider using Tensorflow features to shape our data in 
# categorical features instead of One-Hot encoding ahead of time. Currently, I have no clue
# if there is an advantage to this but best to remind myself.
# https://www.tensorflow.org/get_started/feature_columns#feature_columns_1

In [12]:
# 1. Assign the DataFrame's labels (the right-most column) to train_label.
# 2. Delete (pop) the labels from the DataFrame.
# 3. Assign the remainder of the DataFrame to train_features
train_features, train_label = train, train.pop(LABEL_NAME)
test_features, test_label = test, test.pop(LABEL_NAME)

In [13]:
# Create feature columns for all features.
# Now if you look at In [11], you will see some detials on why this step still needs to 
# be explored
my_feature_columns = []
for key in train_features.keys():
    # Will are going to supply vectors and need to figure out length first
    length = len(train_features[key].iloc[0])
    # print(type(train_features[key].iloc[0]))
    # Again, we should be exploring categorical_* features. Espically if we turn this
    # into a true multi-label classification as opposed to a multi-class classification
    my_feature_columns.append(tf.feature_column.numeric_column(key=key, shape=length))
    
display(my_feature_columns)

[_NumericColumn(key='Professional', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ProgramHobby', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Country', shape=(153,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='University', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='FormalEducation', shape=(9,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='MajorUndergrad', shape=(16,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='YearsProgram', shape=(21,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [14]:
# Create estimator
number_of_classes = len(train_features.iloc[0])
classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        hidden_units=[10, 10],
        n_classes=number_of_classes)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\vagrant\\AppData\\Local\\Temp\\tmppdu5g44z', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B4ADEB2048>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [20]:
dataset= tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
print(dataset.output_types)
print(dataset.output_shapes)
print(tf.random_uniform([4, 10])
# def train_input_fn(features, labels, batch_size):
#     """An input function for training"""
#     # Convert the inputs to a Dataset.
#     dataset = tf.data.Dataset.from_tensor_slices(dict(features))

#     # Shuffle, repeat, and batch the examples.
#     dataset = dataset.shuffle(1000).repeat().batch(batch_size)

#     # Return the dataset.
#     return dataset

# # Train Model
# BATCH_SIZE = 100 # Typically comes from command line
# TRAIN_STEPS = 1000 # ^
# classifier.train(input_fn=lambda:train_input_fn(train_features, train_label, BATCH_SIZE), steps=TRAIN_STEPS)

SyntaxError: unexpected EOF while parsing (<ipython-input-20-53815f6e440b>, line 19)