In [None]:
import pandas
# Read the enrolment records from the CSV next to the notebook.
# NOTE(review): path is relative to the kernel's working directory — confirm.
raw_data = pandas.read_csv(filepath_or_buffer='all_enrolments.csv')
# Preview the first five rows as the cell output.
raw_data.head(n=5)

In [None]:
# Restrict the enrolments to one module presentation. Both filters are applied
# in a single boolean mask; `raw_data` is rebound to the filtered subset, which
# is what the rest of this cell reads.
CODE_MODULE = 'BBB'
CODE_PRESENTATION = '2014J'
raw_data = raw_data[
    (raw_data['code_module'] == CODE_MODULE)
    & (raw_data['code_presentation'] == CODE_PRESENTATION)
]

# Binary target: 1 when final_result is 'Pass' or 'Distinction', 0 otherwise.
label = raw_data['final_result'].isin(['Pass', 'Distinction']).astype('int64').rename('label')

# One-hot encode the categorical features in one call instead of growing the
# frame with concat inside a loop. With `columns=features`, get_dummies
# prefixes every indicator with its source column name ("gender_M", ...),
# matching what per-column calls with prefix=feature produce.
features = ['gender', 'highest_education', 'imd_band', 'age_band', 'disability']
dummies = pandas.get_dummies(raw_data[features], columns=features)

# Final modelling table: the label column first, then the indicator columns.
data = pandas.concat([label, dummies], axis=1)
data.info()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import SGDClassifier

# Fixed seed so the split, the SGD shuffling and therefore the printed score
# are identical on every Restart & Run All.
SEED = 42

# Separate the indicator columns (inputs) from the binary target.
X = data.drop('label', axis=1)
y = data['label']

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Linear classifier trained with SGD: hinge loss, L2 penalty, at most 100 epochs.
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=100, random_state=SEED)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
print(clf.score(X_test, y_test))

In [None]:
import pickle

# Persist the fitted classifier using pickle protocol 4.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=4)

# Record the position of every feature column so that a feature vector can
# later be assembled by column name.
dummy_map = {column: position for position, column in enumerate(X.columns)}
with open('map.pkl', 'wb') as map_file:
    pickle.dump(dummy_map, map_file, protocol=4)
print(dummy_map)

In [None]:
# Reload the persisted artefacts.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
with open('model.pkl', 'rb') as f:
    clf = pickle.load(f)
with open('map.pkl', 'rb') as f:
    dummy_map = pickle.load(f)


def predict_profile(gender, highest_education, imd_band, age_band, disability):
    """Predict the label for one student described by raw category values.

    Each argument is the category value for the feature of the same name
    (e.g. gender='M'). The matching "<feature>_<value>" indicator is set to 1
    and every other column is left at 0.

    Returns whatever ``clf.predict`` returns for the single encoded row.
    Raises KeyError if a "<feature>_<value>" column is not in ``dummy_map``.
    """
    chosen = {
        'gender': gender,
        'highest_education': highest_education,
        'imd_band': imd_band,
        'age_band': age_band,
        'disability': disability,
    }
    row = [0] * len(dummy_map)
    for feature, value in chosen.items():
        row[dummy_map[f'{feature}_{value}']] = 1
    # Pass a DataFrame carrying the column names in their recorded positions,
    # so the input matches the named columns the model was fitted on rather
    # than being an anonymous list.
    columns = sorted(dummy_map, key=dummy_map.get)
    return clf.predict(pandas.DataFrame([row], columns=columns))


# Baseline profile. A separate name is used so that `X` (the training feature
# frame) is not overwritten and earlier cells stay re-runnable.
profile = {
    'gender': 'M',
    'highest_education': 'A Level or Equivalent',
    'imd_band': '0-10%',
    'age_band': '0-35',
    'disability': 'N',
}
print(predict_profile(**profile))

# The changes below are cumulative: each one is applied on top of the previous.
profile['highest_education'] = 'Lower Than A Level'
print(predict_profile(**profile))

profile['imd_band'] = '90-100%'
print(predict_profile(**profile))

profile['disability'] = 'Y'
print(predict_profile(**profile))