# One Hot Encode TensorFlow (Categorical Vocabulary Column)

In [1]:
import os
import sys
import zipfile
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Put <cwd>/../src/data at the front of the module search path so the
# project-local `stack_data` helper can be imported on the next line.
# NOTE(review): the path is built from os.getcwd(), so this presumably expects
# the kernel to be started from the notebook's own directory -- confirm.
sys.path.insert(0,os.path.join(os.getcwd(), os.pardir,  'src', 'data'))
import stack_data

# When True, the data-loading cell also renders a preview via display(df.head()).
SHOW_DISPLAY = True

In [2]:
 # Fetch the data
# Load the dataset through the project-local stack_data helper.
df = stack_data.get_data()

# Always report the (rows, columns) shape; the preview of the first rows
# is optional and controlled by the SHOW_DISPLAY flag.
print(df.shape)
if SHOW_DISPLAY:
    display(df.head())

(51392, 8)


Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
0,Student,"Yes, both",United States,No,Secondary school,,2 to 3 years,
1,Student,"Yes, both",United Kingdom,"Yes, full-time",Some college/university study without earning ...,Computer science or software engineering,9 to 10 years,
2,Professional developer,"Yes, both",United Kingdom,No,Bachelor's degree,Computer science or software engineering,20 or more years,Other
3,Professional non-developer who sometimes write...,"Yes, both",United States,No,Doctoral degree,A non-computer-focused engineering discipline,14 to 15 years,
4,Professional developer,"Yes, I program as a hobby",Switzerland,No,Master's degree,Computer science or software engineering,20 or more years,Mobile developer; Graphics programming; Deskto...


In [3]:
# Print per-column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51392 entries, 0 to 51391
Data columns (total 8 columns):
Professional       51392 non-null object
ProgramHobby       51392 non-null object
Country            51392 non-null object
University         51392 non-null object
FormalEducation    51392 non-null object
MajorUndergrad     42841 non-null object
YearsProgram       51145 non-null object
DeveloperType      36125 non-null object
dtypes: object(8)
memory usage: 3.1+ MB


In [4]:
# Keep only the rows whose label column (stack_data.LABEL_NAME) is populated;
# rows without a label cannot be used as training examples.
df = df.loc[df[stack_data.LABEL_NAME].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36125 entries, 2 to 51390
Data columns (total 8 columns):
Professional       36125 non-null object
ProgramHobby       36125 non-null object
Country            36125 non-null object
University         36125 non-null object
FormalEducation    36125 non-null object
MajorUndergrad     32954 non-null object
YearsProgram       35977 non-null object
DeveloperType      36125 non-null object
dtypes: object(8)
memory usage: 2.5+ MB


In [5]:
# Keep only the rows that carry exactly one label, effectively turning this
# into a MultiClass problem. Multiple labels are stored in a single string
# separated by ';', so a row is single-label exactly when its label contains
# no ';' delimiter.
# TODO: Build MultiLabel solution
#
# The filter is vectorized instead of looping over df.iterrows(): it avoids
# the fragile `len(...) is 1` identity comparison, keeps the column names and
# dtypes even when no row qualifies, and is much faster on large frames.
# A missing label yields NaN from str.count, which .eq(0) treats as False,
# so such rows are dropped rather than raising.
is_single_label = df[stack_data.LABEL_NAME].str.count(';').eq(0)

# Renumber the surviving rows 0..n-1, discarding the old index.
df = df.loc[is_single_label].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16747 entries, 0 to 16746
Data columns (total 8 columns):
Professional       16747 non-null object
ProgramHobby       16747 non-null object
Country            16747 non-null object
University         16747 non-null object
FormalEducation    16747 non-null object
MajorUndergrad     15503 non-null object
YearsProgram       16648 non-null object
DeveloperType      16747 non-null object
dtypes: object(8)
memory usage: 1.0+ MB


In [6]:
# Discard every row that is missing a value in either of the two feature
# columns that still contain gaps.
df = df.dropna(subset=['MajorUndergrad', 'YearsProgram'], how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15425 entries, 0 to 16746
Data columns (total 8 columns):
Professional       15425 non-null object
ProgramHobby       15425 non-null object
Country            15425 non-null object
University         15425 non-null object
FormalEducation    15425 non-null object
MajorUndergrad     15425 non-null object
YearsProgram       15425 non-null object
DeveloperType      15425 non-null object
dtypes: object(8)
memory usage: 1.1+ MB


In [7]:
# TODO: After the rows with a NaN DeveloperType were stripped out, the
# Professional column is left with a single unique value, so it no longer
# carries any information.
# Consider back-filling the NaN values against students instead of dropping them.
df['Professional'].unique()

array(['Professional developer'], dtype=object)

In [13]:
# Summarize every column of the cleaned frame; for these object (string)
# columns the report consists of count, unique, top and freq.
df.describe(include='all')

Unnamed: 0,Professional,ProgramHobby,Country,University,FormalEducation,MajorUndergrad,YearsProgram,DeveloperType
count,15425,15425,15425,15425,15425,15425,15425,15425
unique,1,4,148,4,7,16,21,14
top,Professional developer,"Yes, I program as a hobby",United States,No,Bachelor's degree,Computer science or software engineering,20 or more years,Web developer
freq,15425,7115,3772,13170,8537,8206,2760,9787
