In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, log_loss, make_scorer,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw survey responses from the CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='survey_results_public.csv')

In [None]:
# Keep only respondents whose employment answer mentions 'Student'.
# na=False makes a missing employment answer count as "not a student".
students = df.loc[df['Employment'].str.contains('Student', na=False)]

In [None]:
# Drop features with mostly missing values: a column is kept only when at
# least 55% of the student rows hold a non-missing answer for it.
nalimit = len(students) * 0.55
students = (
    students
    .dropna(axis='columns', thresh=nalimit)
    # The response ID is not needed. Using drop(..., errors='ignore')
    # instead of `del` keeps this cell re-runnable: `del` raises KeyError
    # once the column has already been removed by a previous run.
    .drop(columns='ResponseId', errors='ignore')
)

In [None]:
# Sample a portion of the data (at most 2000 rows). The size is capped at
# the number of available rows because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling is without replacement).
# The fixed random_state keeps the sample reproducible, and
# ignore_index=True renumbers the rows from 0.
students = students.sample(n=min(2000, len(students)), random_state=42,
                           ignore_index=True)

In [None]:
# Print, for every feature, how many distinct answers it has and what
# they are.
for feature, column in students.items():
    # A single cell may hold several semicolon-separated answers, so each
    # non-missing cell is split and its parts are pooled into one set.
    responses = {
        answer
        for cell in column.dropna()
        for answer in str(cell).split(';')
    }
    listing = '; '.join(sorted(responses))
    print(f'{feature} ({len(responses)}): {listing}\n')

In [None]:
# Select data that can either be counted or categorized into features.
countable_data = ['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
                  'PlatformHaveWorkedWith', 'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith', 'ToolsTechHaveWorkedWith']
categorical_data = ['MainBranch', 'EdLevel', 'Country', 'Age', 'Ethnicity']

# Isolate the IDEs column; this will be used in the output vector.
output = 'NEWCollabToolsHaveWorkedWith'

# Generate features by counting the number of responses to a question.
# Answers are semicolon-separated, so the count is (number of ';') + 1;
# unanswered questions stay NaN. The vectorized .str accessor is used
# instead of DataFrame.applymap, which is deprecated in recent pandas.
df1 = students[countable_data].apply(lambda col: col.str.count(';') + 1)

# Generate features by encoding distinct survey responses as digits
# (factorize gives missing answers the code -1).
df2 = students[categorical_data].apply(lambda x: x.factorize()[0])

# Generate boolean features based on employment status
# and usage of version control systems (i.e., Git).
# The match is case-sensitive, so only answers containing a capitalised
# 'Employed' are flagged. Missing version-control answers stay NaN.
s1 = students['Employment'].str.contains('Employed')
s2 = students['VersionControlSystem'].map(lambda x: x != "I don't use one",
                                          na_action='ignore')

# Generate an additional numeric feature: the number of distinct operating
# systems the respondent uses across personal and professional use.
# Each column is filled with '' before joining; otherwise a missing answer
# in EITHER column turns the whole concatenation into NaN and the
# respondent is counted as using a single OS regardless of what they
# listed in the other column. Empty fragments are discarded, and
# respondents who answered neither question keep the previous default of 1.
s3 = (students['OpSysPersonal use'].fillna('') + ';'
      + students['OpSysProfessional use'].fillna(''))
s3 = (s3.map(lambda x: len(set(x.split(';')) - {''}))
        .clip(lower=1)
        .rename('OperatingSystems'))

# Years of coding experience: map the two textual answers onto numbers so
# the column can later be cast to float.
s4 = students['YearsCode'].replace({'Less than 1 year': 0.5,
                                    'More than 50 years': 50.5})

In [None]:
# Assemble the design matrix: put every generated feature side by side,
# treat anything still missing as 0, and cast everything to float.
X = (
    pd.concat([df1, df2, s1, s2, s3, s4], axis='columns')
    .fillna(0)
    .astype(float)
)

# Output vector: 1 when the respondent lists Visual Studio Code among the
# tools in the output column, 0 otherwise (a missing answer counts as 0).
y = (
    students[output]
    .str.contains('Visual Studio Code')
    .fillna(0)
    .astype(int)
)

# Split data into training (60%), test (20%), and validation (20%) sets:
# first carve off the 60% training portion, then halve the remaining 40%.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.6, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42)

In [None]:
# Chain feature scaling and logistic regression into a single pipeline.
# This is to ensure no bias occurs from scaling all data prematurely: the
# scaler is re-fit inside every cross-validation fold.
# C=np.inf switches regularization off. It replaces penalty='none', which
# current scikit-learn releases no longer accept, and behaves the same on
# older releases too.
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=np.inf))

# Generate a learning curve by cross-validating on the training set across
# several different training sizes (10% to 100% of each fold's training
# portion). The built-in 'neg_log_loss' scorer is used instead of
# make_scorer(log_loss, needs_proba=True), whose `needs_proba` argument is
# deprecated; it returns the negated loss, so the sign is flipped back to
# keep train_scores / test_scores as plain (positive) log-loss values.
train_sizes, neg_train_scores, neg_test_scores = learning_curve(
    estimator=pipeline, X=X_train, y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='neg_log_loss')
train_scores, test_scores = -neg_train_scores, -neg_test_scores

# Average the per-fold losses for each training size.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

# Plot the learning curve.
plt.plot(train_sizes, train_mean, color='#b45f06', label='Training Error')
plt.plot(train_sizes, test_mean, color='#6aa84f', label='Validation Error')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Cost')
plt.grid()
plt.legend()
plt.show()

In [None]:
# Apply feature scaling to the training data. The scaler's statistics are
# learned from the training set only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Normalize the test/validation data using the same scaling, i.e. with the
# statistics fitted on the training set above.
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Fit an unregularized logistic regression model to the training set.
# C=np.inf disables the penalty; it replaces penalty='none', which current
# scikit-learn releases no longer accept.
logmodel = LogisticRegression(C=np.inf).fit(X_train, y_train)

In [None]:
def percent_positive(data: pd.Series) -> str:
    """Return the share of positive (== 1) labels in ``data`` as a percentage.

    Missing values are left out of the denominator. A series without any
    positive label yields ``'0.00%'``; looking the label up in
    ``value_counts()`` would raise ``KeyError`` in that case.

    Args:
        data: Series of class labels, where 1 marks the positive class.

    Returns:
        The proportion formatted with two decimals, e.g. ``'75.00%'``.
    """
    return f'{data.dropna().eq(1).mean():.2%}'


# Report what fraction of each split's labels are VS Code users.
for split_name, split_labels in (('Training', y_train),
                                 ('Testing', y_test),
                                 ('Validation', y_val)):
    print(f'VS Code Users ({split_name}):\t{percent_positive(split_labels)}')

In [None]:
# Initial accuracy test: predict the test-set labels (y_pred is reused for
# the confusion matrix later) and report the model's score on the test set.
y_pred = logmodel.predict(X_test)
accuracy = logmodel.score(X_test, y_test)
print(f'Accuracy = {accuracy}')

In [None]:
# Score every test example with its predicted probability of the positive
# class (column 1 of predict_proba), then compute the ROC curve from it.
y_proba = logmodel.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

# Draw the ROC curve on the current axes; the AUC appears in the legend and
# the dashed diagonal marks the chance level.
auc_label = f'ROC Curve (AUC = {roc_auc_score(y_test, y_proba):.4})'
ax = plt.gca()
ax.plot(fpr, tpr, color='darkorange', label=auc_label)
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_title('Receiver Operating Characteristic')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0.0, 1.01])
ax.set_ylim([0.0, 1.01])
ax.legend(loc='lower right')
plt.show()

In [None]:
# Calculate precision and recall score from the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
# y_test / y_pred; without it the matrix can shrink to 1x1 and indexing
# the four cells raises IndexError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Row = true class, column = predicted class, so the flattened order is
# TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# Guard the 0/0 cases (no predicted positives / no actual positives):
# report NaN explicitly instead of triggering a divide-by-zero warning.
precision = TP / (TP + FP) if TP + FP else float('nan')
recall = TP / (TP + FN) if TP + FN else float('nan')
print(f'Precision = {precision}')
print(f'Recall = {recall}')