# One Hot Encode TensorFlow (Categorical Vocabulary Column)

In [1]:
import os
import sys
import zipfile
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Put the project's data helpers (<cwd>/../src/data) first on the import path
# so that `stack_data` resolves to the local module.
data_module_dir = os.path.join(os.getcwd(), os.pardir, 'src', 'data')
sys.path.insert(0, data_module_dir)
import stack_data

# Toggle for rich DataFrame previews in the cells below.
SHOW_DISPLAY = True

In [2]:
 # Fetch the data
df = stack_data.get_data()

# Report the (rows, columns) shape; preview the head only when display is on.
frame_shape = df.shape
print(frame_shape)
if SHOW_DISPLAY:
    head_preview = df.head()
    display(head_preview)

(51392, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,Student,"Yes, both",United States,No,Secondary school,,2 to 3 years,
1,Student,"Yes, both",United Kingdom,"Yes, full-time",Some college/university study without earning ...,Computer science or software engineering,9 to 10 years,
2,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
3,Professional non-developer who sometimes write...,"Yes, both",United States,No,Doctoral degree,A non-computer-focused engineering discipline,14 to 15 years,
4,Professional developer,"Yes, I program as a hobby",Switzerland,No,Master's degree,Computer science or software engineering,20 or more years,Mobile developer; Graphics programming; Deskto...


In [3]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51392 entries, 0 to 51391
Data columns (total 8 columns):
Professional       51392 non-null object
ProgramHobby       51392 non-null object
Country            51392 non-null object
University         51392 non-null object
FormalEducation    51392 non-null object
MajorUndergrad     42841 non-null object
YearsProgram       51145 non-null object
DeveloperType      36125 non-null object
dtypes: object(8)
memory usage: 3.1+ MB


In [4]:
# Keep only the rows that actually carry a label value.
label_columns = [stack_data.LABEL_NAME]
df = df.dropna(how='all', subset=label_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36125 entries, 2 to 51390
Data columns (total 8 columns):
Professional       36125 non-null object
ProgramHobby       36125 non-null object
Country            36125 non-null object
University         36125 non-null object
FormalEducation    36125 non-null object
MajorUndergrad     32954 non-null object
YearsProgram       35977 non-null object
DeveloperType      36125 non-null object
dtypes: object(8)
memory usage: 2.5+ MB


In [5]:
# Keep only rows whose label holds a single value (no ';' delimiter),
# effectively turning this into a MultiClass problem.
# TODO: Build MultiLabel solution
#
# Vectorized instead of iterating rows: a label with N delimiters splits into
# N + 1 parts, so zero delimiters means exactly one label. Rows with a missing
# label yield NaN here, compare as False, and are therefore dropped too.
label_delimiter_count = df[stack_data.LABEL_NAME].str.count(';')
df = df[label_delimiter_count == 0].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16747 entries, 0 to 16746
Data columns (total 8 columns):
Professional       16747 non-null object
ProgramHobby       16747 non-null object
Country            16747 non-null object
University         16747 non-null object
FormalEducation    16747 non-null object
MajorUndergrad     15503 non-null object
YearsProgram       16648 non-null object
DeveloperType      16747 non-null object
dtypes: object(8)
memory usage: 1.0+ MB


In [6]:
# Discard rows that are missing a value in either of these features.
required_features = ['MajorUndergrad', 'YearsProgram']
df = df.dropna(subset=required_features, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15425 entries, 0 to 16746
Data columns (total 8 columns):
Professional       15425 non-null object
ProgramHobby       15425 non-null object
Country            15425 non-null object
University         15425 non-null object
FormalEducation    15425 non-null object
MajorUndergrad     15425 non-null object
YearsProgram       15425 non-null object
DeveloperType      15425 non-null object
dtypes: object(8)
memory usage: 1.1+ MB


In [7]:
# TODO: Dropping rows with a NaN DeveloperType left only one unique
# Professional value. Consider back-filling NaN against student instead.
df['Professional'].unique()

array(['Professional developer'], dtype=object)

In [8]:
# Summary statistics for every column (count / unique / top / freq for object data).
df.describe(include='all')

Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
count,15425,15425,15425,15425,15425,15425,15425,15425
unique,1,4,148,4,7,16,21,14
top,Professional developer,"Yes, I program as a hobby",United States,No,Bachelor's degree,Computer science or software engineering,20 or more years,Web developer
freq,15425,7115,3772,13170,8537,8206,2760,9787


In [9]:
# Abstract categories from feature/label
# def write_feature_category_scripts(feature):
#    file = open("../data/feature_categories/" + feature + ".txt", "w")
#    for unique in df[feature].unique():
#        file.write(unique + '\n')
#
#    file.close()
#
# for feature in stack_data.get_features_and_columns():
#    write_feature_category_scripts(feature)

In [10]:
# Create one-hot encoded feature columns for all features.
#
# DNNClassifier only accepts dense columns, so each categorical vocabulary
# column is wrapped in an indicator column (its one-hot representation).
feature_columns = []
for feature in stack_data.FEATURE_COLUMNS:
    categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature,
            vocabulary_list=df[feature].unique())
    feature_columns.append(
            tf.feature_column.indicator_column(categorical_column))

In [11]:
# Instantiate classifier
#
# The number of classes comes from the label column itself rather than from
# whatever loop variable an earlier cell happened to leave behind.
# The labels are strings, so the estimator needs `label_vocabulary` to map
# them to class ids; sorting keeps that mapping stable between runs.
label_vocabulary = sorted(df[stack_data.LABEL_NAME].unique())
class_length = len(label_vocabulary)
classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=[10, 10],
        n_classes=class_length,
        label_vocabulary=label_vocabulary)

14
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\vagrant\\AppData\\Local\\Temp\\tmpmumdl1pv', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001DFFD76BEB8>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [12]:
# Split to train and test data.
# TODO: Consider cross validation
# https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6
#
# A fixed seed makes the split, and everything trained on it, reproducible
# across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(
    df, train_size=0.8, test_size=0.2, random_state=SPLIT_SEED)
if SHOW_DISPLAY:
    display(train.head())
    display(test.head())

Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
11700,Professional developer,No,United States,No,Master's degree,Computer science or software engineering,20 or more years,Web developer
10526,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Web developer
5936,Professional developer,"Yes, both",Canada,No,Master's degree,Computer science or software engineering,10 to 11 years,Web developer
14382,Professional developer,"Yes, I program as a hobby",United States,No,Bachelor's degree,Computer science or software engineering,6 to 7 years,Desktop applications developer
12960,Professional developer,"Yes, I program as a hobby",United States,No,Bachelor's degree,Fine arts or performing arts,3 to 4 years,Web developer


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
10332,Professional developer,No,France,No,Master's degree,Computer science or software engineering,8 to 9 years,Web developer
10357,Professional developer,"Yes, I program as a hobby",Germany,No,Master's degree,Computer science or software engineering,10 to 11 years,Web developer
6594,Professional developer,No,United States,No,Bachelor's degree,A humanities discipline,4 to 5 years,Web developer
3930,Professional developer,"Yes, I program as a hobby",Slovak Republic,No,Master's degree,Computer science or software engineering,11 to 12 years,Web developer
12603,Professional developer,"Yes, I program as a hobby",Canada,No,Master's degree,Computer science or software engineering,18 to 19 years,Web developer


In [13]:
# Separate features from labels without mutating `train` / `test`, so this
# cell can be re-run safely (popping the label in place fails on a second run).
# 1. Y_train / Y_test hold the label column.
# 2. X_train / X_test hold every remaining column.
Y_train = train[stack_data.LABEL_NAME]
X_train = train.drop(stack_data.LABEL_NAME, axis=1)
Y_test = test[stack_data.LABEL_NAME]
X_test = test.drop(stack_data.LABEL_NAME, axis=1)

In [None]:
# Train model
BATCH_SIZE = 100  # batch size handed to train_input_fn
TRAIN_STEPS = 1000  # passed as `steps` to classifier.train

def train_input_fn(features, labels, batch_size, shuffle_buffer_size=1000):
    """Build the shuffled, repeating, batched dataset used for training.

    Args:
        features: Mapping-like object (e.g. a DataFrame) that ``dict(features)``
            turns into ``{column name: values}``.
        labels: Label values aligned with ``features``.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer. Defaults to 1000, the
            previously hard-coded value. A buffer smaller than the dataset
            only shuffles locally; pass the number of examples for a full
            uniform shuffle.

    Returns:
        A ``tf.data.Dataset`` of ``(features_dict, labels)`` batches that
        repeats indefinitely.
    """
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    return dataset.shuffle(shuffle_buffer_size).repeat().batch(batch_size)

def _training_input():
    """Zero-argument input function handed to the estimator."""
    return train_input_fn(X_train, Y_train, BATCH_SIZE)


# Run the training loop for TRAIN_STEPS steps.
classifier.train(input_fn=_training_input, steps=TRAIN_STEPS)