In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, log_loss, make_scorer,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the raw survey results from a CSV file; the relative path is
# resolved against the notebook's working directory.
df = pd.read_csv('survey_results_public.csv')

In [None]:
# Keep only respondents whose employment answer mentions 'Student'.
# Missing employment answers are treated as non-students.
students = df.loc[df['Employment'].str.contains('Student', na=False)]

In [None]:
# Keep a column only if at least 55% of the student rows answered it.
nalimit = len(students) * 0.55
students = students.dropna(axis='columns', thresh=nalimit)

# The respondent identifier carries no predictive information.
students = students.drop(columns='ResponseId')

# Remove survey-related columns (names containing "SO" or "Survey") and
# the "...WantToWorkWith" columns, which are highly correlated with the
# features that are kept.
survey_related = students.columns.str.contains("SO|Survey")
want_to_work_with = students.columns.str.endswith("WantToWorkWith")
students = students.loc[:, ~(survey_related | want_to_work_with)]

# Renumber the rows 0..n-1 now that non-students have been removed.
students = students.reset_index(drop=True)

In [None]:
def explore_domain(data: pd.DataFrame):
    """Print the set of distinct answers found in every column of `data`.

    For each feature, one line is printed with the number of distinct
    sub-responses, the number of missing values, and the sorted list of
    sub-responses joined by "; ".
    """
    for feature in data:
        column = data[feature]

        # Many survey questions allow several answers in one cell, so each
        # cell is split and the pieces are pooled into a single set.
        responses = {answer
                     for response in column
                     for answer in response_to_set(response)}

        null_count = column.isna().sum()
        listing = "; ".join(sorted(responses))
        print(f'{feature} (Count: {len(responses)}, Null: '
              f'{null_count}): '
              f'{listing}\n')


def response_to_set(response: str):
    """Convert a multi-answer survey response into a set of sub-responses.

    Args:
        response: A single cell value. Multi-answer responses separate
            their sub-responses with ';'.

    Returns:
        The set of sub-responses, or an empty set when the value is
        missing (None/NaN). Non-string values (e.g. cells from a numeric
        column) are converted with ``str`` first so that they do not raise
        ``AttributeError`` on ``.split``.
    """
    if pd.isna(response):
        return set()
    return set(str(response).split(';'))


def count_responses(response: str):
    """Return how many ';'-separated sub-responses `response` contains.

    A string without any ';' counts as a single response.
    """
    return len(response.split(';'))


def percent_positive(y: pd.Series):
    """Print the share of class-1 items in `y` as a percentage.

    Missing values are excluded from the denominator (the default of
    ``value_counts``). When `y` contains no class-1 item at all, 0.00% is
    printed instead of raising ``KeyError``.
    """
    share = y.value_counts(normalize=True).get(1, 0.0)
    print(f'{share:.2%}')

# Print the answer domain of every column kept in the student dataset.
explore_domain(students)

In [None]:
# Select data that can either be counted or categorized into features.
countable_data = ['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
                  'PlatformHaveWorkedWith', 'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith', 'ToolsTechHaveWorkedWith']
categorical_data = ['MainBranch', 'EdLevel', 'Age']

# Ordinal encodings, listed in the same order as `categorical_data`.
# Adjacent string literals are concatenated verbatim, so every wrapped key
# must carry its own separating space; otherwise the key never matches the
# survey text and the answer is silently encoded as 0 by the final fillna.
encoding = [
    {
        'I am learning to code': 1,
        'I code primarily as a hobby': 2,
        'I am not primarily a developer, but I write '
        'code sometimes as part of my work': 3,
        'I used to be a developer by profession, but no longer am': 4,
        'I am a developer by profession': 5
    },
    {
        'Something else': 0,
        'Primary/elementary school': 1,
        'Secondary school (e.g. American high school, '
        'German Realschule or Gymnasium, etc.)': 2,
        'Some college/university study without earning a degree': 3,
        'Associate degree (A.A., A.S., etc.)': 4,
        'Professional degree (JD, MD, etc.)': 5,
        'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 6,
        'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 7,
        'Other doctoral degree (Ph.D., Ed.D., etc.)': 8
    },
    {
        'Prefer not to say': 0,
        'Under 18 years old': 1,
        '18-24 years old': 2,
        '25-34 years old': 3,
        '35-44 years old': 4,
        '45-54 years old': 5,
        '55-64 years old': 6,
        '65 years or older': 7
    }
]
os_data = ['OpSysPersonal use', 'OpSysProfessional use']

# Isolate the IDEs column; this will be used in the output vector.
output = 'NEWCollabToolsHaveWorkedWith'

# Generate features by counting the number of responses to a question.
# Missing answers are skipped here and become 0 in the final fillna.
data = students[countable_data].applymap(count_responses, na_action='ignore')
data.columns = data.columns.map(lambda x: x.removesuffix('HaveWorkedWith'))

# Generate features by encoding distinct survey responses as digits.
# Answers absent from a mapping become NaN and are later filled with 0.
for feature, mapping in zip(categorical_data, encoding):
    data[feature] = students[feature].map(mapping)

# Generate boolean features based on employment status, whether the
# respondent uses WSL, and use of version control systems (i.e., Git).
data['Employed'] = students['Employment'].str.contains('Employed')

# Merge personal and professional operating systems into one set per row.
os = students[os_data].applymap(response_to_set)
os = os.apply(lambda x: set.union(*x), axis=1)
data['WSL'] = os.map({'Windows Subsystem for Linux (WSL)'}.issubset)

# VCS is 0 (no version control), 1 (uses one) or 2 (uses one from within
# a code editor/IDE). The two flags are summed as integers explicitly:
# adding two boolean Series is a logical OR and could never produce 2.
uses_vcs = (students['VersionControlSystem'].fillna("I don't use one")
            != "I don't use one")
vcs_in_editor = students['VCInteraction'].str.contains('Code editor',
                                                       na=False)
data['VCS'] = uses_vcs.astype(int) + vcs_in_editor.astype(int)

# Generate additional numeric features based on the number of operating
# systems used by the respondent and how many years they have coded.
data['OS'] = os.map(len)
data['YearsCode'] = students['YearsCode'].replace('Less than 1 year', 1)
data['YearsCode'] = data['YearsCode'].replace('More than 50 years', 50)

# Classify respondents by whether they use VS Code.
data['VSCode'] = students[output].str.contains('Visual Studio Code',
                                               na=False)

# Clean missing values and convert every feature to integers.
data = data.fillna(0).astype(int)

# Generate the design matrix and output vector.
X = data.drop('VSCode', axis=1)
y = data['VSCode']

In [None]:
def plot_vscode_by_category(df: pd.DataFrame, x: str):
    """Plot a 100%-stacked bar chart of VS Code usage per category of `x`.

    Args:
        df: Frame holding the feature column `x` and a binary 'VSCode'
            column (0 = does not use VS Code, 1 = uses VS Code).
        x: Name of the feature whose categories are placed on the x-axis.
    """
    # Cross-tabulate so that rows are exactly the categories present in
    # `x`, in sorted order. The tick labels below are taken from the same
    # index, which keeps every bar aligned with its label.
    counts = pd.crosstab(df[x], df['VSCode'].astype(int))
    counts = counts.reindex(columns=[0, 1], fill_value=0)

    # Compute proportions of VS Code vs. non-VS Code within each category.
    percentages = counts.div(counts.sum(axis=1), axis=0) * 100
    prop_non_vscode = percentages[0].to_numpy()
    prop_vscode = percentages[1].to_numpy()

    # Get positions of bars on the x-axis; 0/1 features are shown as
    # False/True.
    x_axis = counts.index.to_numpy()
    if set(x_axis) <= {0, 1}:
        x_axis = x_axis.astype(bool)
    r = range(len(x_axis))

    # Plot top and bottom bars.
    barWidth = 0.8
    plt.bar(r, prop_vscode, bottom=prop_non_vscode, color='#00bfff',
            edgecolor='white', width=barWidth, label="Uses VS Code")
    plt.bar(r, prop_non_vscode, color='#ff4000', edgecolor='white',
            width=barWidth, label="Doesn't Use VS Code")

    # Plot category ticks, axis labels, and legend.
    plt.xticks(r, x_axis)
    plt.xlabel(x, fontweight='bold')
    plt.ylabel("Survey Respondents (%)", fontweight='bold')
    plt.legend()
    plt.show()


# Compare VS Code usage between respondents with and without WSL.
plot_vscode_by_category(data, 'WSL')


In [None]:
def show_feature_importance(X: pd.DataFrame, y: pd.Series):
    """Display every feature with its importance, most important first.

    Importances come from a decision tree (log-loss criterion, fixed
    random state) fitted on `X` and `y`.
    """
    tree = DecisionTreeClassifier(random_state=42, criterion='log_loss')
    tree.fit(X, y)

    # Pair each column name with its importance, then rank the pairs.
    ranking = pd.DataFrame(zip(X.columns, tree.feature_importances_),
                           columns=['Feature', 'Importance'])
    ranking = ranking.sort_values('Importance', ascending=False)
    display(ranking.reset_index(drop=True))


# Rank the engineered features using the full design matrix and labels.
show_feature_importance(X, y)

In [None]:
def plot_vscode_users(y: pd.Series):
    """Plot a pie chart of VS Code users versus non-users.

    Args:
        y: Binary output vector (1 = uses VS Code, 0 = does not).
    """
    colors = ['#56a5d8', '#7b6ca7']

    # value_counts() orders by frequency, which would let the wedges swap
    # places (and mismatch the fixed legend labels) depending on the class
    # balance. Pin the order to [users, non-users] instead.
    counts = y.value_counts().reindex([1, 0], fill_value=0)

    plt.pie(counts, colors=colors, autopct='%.2f%%', startangle=90)
    plt.title('Students Who Use Visual Studio Code')
    plt.legend(ncol=2, labels=['Uses VS Code', "Doesn't Use VS Code"],
               bbox_to_anchor=(1, 0), loc='best')
    plt.subplots_adjust(top=0.85)
    plt.show()


# Visualize the class balance before any resampling. By default, the
# dataset is very imbalanced: over 80% of survey respondents use VS Code.
plot_vscode_users(y)

In [None]:
# To remedy the unevenness of the dataset, the majority class (VS Code
# users) can be randomly undersampled. Set the flag to False to keep the
# original class distribution.
# NOTE(review): resampling happens before the data is split, so the test
# and validation sets are drawn from the balanced data as well.
undersample = True

if undersample:
    sampler = RandomUnderSampler(random_state=42)
    X, y = sampler.fit_resample(X, y)
    plot_vscode_users(y)  # The new data is now perfectly balanced.

In [None]:
def split_data(X, y, undersample=False):
    """Split data into training (60%), test (20%) and validation (20%) sets.

    Both splits use a fixed random state. They are stratified on the
    labels only when `undersample` is true.

    Returns:
        X_train, X_test, X_val, y_train, y_test, y_val
    """
    # First cut: 60% for training, the remaining 40% is held out.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=0.6, random_state=42,
        stratify=y if undersample else None)

    # Second cut: halve the held-out part into test and validation sets.
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=42,
        stratify=y_rest if undersample else None)

    return X_train, X_test, X_val, y_train, y_test, y_val


# Unpack the six splits; stratification follows the `undersample` flag.
X_train, X_test, X_val, y_train, y_test, y_val = split_data(X, y, undersample)

In [None]:
# Chain feature scaling and logistic regression into a single pipeline so
# that the scaler is re-fitted inside every cross-validation fold rather
# than on all of the data up front.
pipeline = make_pipeline(StandardScaler(), LogisticRegression(penalty='none'))

# Generate a learning curve by cross-validating the training split at ten
# evenly spaced training sizes, scoring each fold by its log loss.
log_loss_scorer = make_scorer(log_loss, needs_proba=True)
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring=log_loss_scorer)

# Average the per-fold scores for every training size.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

# Plot the learning curve.
plt.plot(train_sizes, train_mean, color='#b45f06', label='Training Error')
plt.plot(train_sizes, test_mean, color='#6aa84f', label='Validation Error')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Cost')
plt.grid()
plt.legend()
plt.show()

In [None]:
# Learn the scaling parameters from the training data only, then apply
# that same scaling to all three splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Fit an unregularized logistic regression model to the training set.
logmodel = LogisticRegression(penalty='none')
logmodel.fit(X_train, y_train)

In [None]:
# Score the test set with the predicted probability of the positive
# class, then derive the ROC curve and its area from those scores.
y_proba = logmodel.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve against the chance diagonal. The AUC is listed in
# the legend.
plt.plot(fpr, tpr, color='darkorange',
         label=f'ROC Curve (AUC = {roc_auc:.4})')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.legend(loc='lower right')
plt.show()

In [None]:
# Predict the test-set labels and report the model's mean accuracy on
# the test set.
y_pred = logmodel.predict(X_test)
accuracy = logmodel.score(X_test, y_test)
print(f'Accuracy = {accuracy}')

In [None]:
# Calculate precision and recall from the binary confusion matrix, whose
# flattened layout is [TN, FP, FN, TP].
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()
print(f'Precision = {TP / (TP + FP)}')
print(f'Recall = {TP / (TP + FN)}')