In [None]:
import pandas as pd

# Load the public survey results from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="survey_results_public.csv")

In [None]:
# Keep only the rows whose 'Employment' answer mentions 'Student'.
# na=False treats a missing answer as "not a student", so those rows
# are dropped as well.
students = df.loc[df['Employment'].str.contains('Student', na=False)]

In [None]:
# Survey columns that are turned into features. The "countable" columns
# are reduced to a number of responses; the "categorizable" columns are
# encoded as integer category codes.
countable_data = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
]
categorizable_data = [
    'MainBranch',
    'EdLevel',
    'Country',
    'Age',
    'Ethnicity',
]

# Name of the IDEs column; the output vector is derived from it.
output = 'NEWCollabToolsHaveWorkedWith'

# Generate features by counting the number of responses to a question.
# The count is the number of ';' separators in the answer plus one.
# Series.map is applied column by column rather than DataFrame.applymap,
# which is deprecated in recent pandas releases; na_action='ignore' leaves
# unanswered questions as NaN instead of calling the lambda on them.
df1 = students[countable_data].apply(
    lambda col: col.map(lambda answer: answer.count(';') + 1,
                        na_action='ignore'))

# Generate features by encoding distinct survey responses as digits.
# Each column is factorized on its own, so a code is only meaningful
# within its column. NOTE: factorize marks missing values with -1 by
# default, so unanswered questions end up as -1 here, not NaN.
df2 = students[categorizable_data].apply(lambda col: col.factorize()[0])

# Boolean features:
#   s1 - whether the 'Employment' answer contains the text 'Employed'
#        (the match is case-sensitive);
#   s2 - whether the respondent uses a version control system (i.e., Git),
#        meaning any answer other than "I don't use one".
def _uses_version_control(answer):
    """Return True unless the answer is exactly "I don't use one"."""
    return answer != "I don't use one"


s1 = students['Employment'].str.contains(pat='Employed')
# na_action='ignore' keeps missing answers as NaN rather than mapping them.
s2 = students['VersionControlSystem'].map(_uses_version_control,
                                          na_action='ignore')

# Generate additional numeric features based on the number of operating
# systems used by the respondent and how many years they have coded.
personal_os = students['OpSysPersonal use']
professional_os = students['OpSysProfessional use']

# Concatenating the raw columns would produce NaN whenever either answer
# is missing, discarding the answer that *was* given. Fill the missing
# side with an empty string so a single answered question still counts.
s3 = personal_os.fillna('') + ';' + professional_os.fillna('')

# Keep NaN only for respondents who answered neither question.
s3 = s3.where(personal_os.notna() | professional_os.notna())

# Count distinct operating systems across both answers; the empty token
# introduced by a filled-in missing answer is discarded.
s3 = s3.map(lambda x: len(set(x.split(';')) - {''}), na_action='ignore')

# Map the two non-numeric 'YearsCode' answers onto numeric stand-ins;
# every other value is left untouched.
s4 = students['YearsCode'].replace({'Less than 1 year': 0.5,
                                    'More than 50 years': 50.5})

In [None]:
# Assemble the design matrix: place every generated feature side by side,
# replace the remaining missing values with 0 and cast everything to float.
X = pd.concat([df1, df2, s1, s2, s3, s4], axis='columns')
X = X.fillna(0).astype(float)

# Build the output vector: 1 if the respondent's answer in the output
# column mentions 'Visual Studio Code', 0 otherwise. na=False makes a
# missing answer count as 0.
y = students[output].str.contains('Visual Studio Code', na=False).astype(int)