## Model to Predict New Users

In [43]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import MVP_Davila, MVP_Ortiz, MVP_Shi

from sklearn.preprocessing import MinMaxScaler

### Acuqire the train, validate and test used for MVP

In [44]:
# Acquire the train, validate and test

df_train = pd.read_csv('train.csv')
df_validate = pd.read_csv('validate.csv')
df_test = pd.read_csv('test.csv')

df_train.shape, df_validate.shape, df_test.shape

((411517, 18), (50842, 18), (52868, 18))

### Prepare to concat the train, validate and test

In [45]:
# Drop the columns merged from questions.csv and lectures.csv

cols = ['lecture_id', 'tag', 'lecture_part', 'type_of', 'question_id',
        'bundle_id', 'correct_answer', 'question_part', 'tags']

df_train = df_train.drop(columns = cols)
df_validate = df_validate.drop(columns = cols)
df_test = df_test.drop(columns = cols)

# Print the shapes
df_train.shape, df_validate.shape, df_test.shape

((411517, 9), (50842, 9), (52868, 9))

In [46]:
# Take a look at the first user in train
df_train.head(1)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,1864702,5720,0,0,1,1,,


In [47]:
# Count the records of the first user in train
(df_train.user_id == 1864702).sum()

3441

In [48]:
# Count the records of the first user in validate
(df_validate.user_id == 1864702).sum()

430

In [50]:
# Count the records of the first user in train
(df_test.user_id == 1864702).sum()

431

### Conat the train, validate and test

In [51]:
# Concat train, validate and test
df = pd.concat([df_train, df_validate, df_test], ignore_index=True)

# Print the shapes
df.shape

(515227, 9)

In [52]:
# Sort the dataframe by "user_id" and "timestamp"

df = df.sort_values(["user_id", "timestamp"], ignore_index=True)
df.head(4305)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,1864702,5720,0,0,1,1,,
1,45951,1864702,5204,0,1,1,0,inf,False
2,74342,1864702,4094,0,2,1,1,43000.0,False
3,96778,1864702,9699,0,3,0,1,25000.0,False
4,132969,1864702,5889,0,4,2,0,19000.0,False
...,...,...,...,...,...,...,...,...,...
4300,23751096431,1864702,8326,0,2819,0,1,36000.0,True
4301,23928102914,1864702,8884,0,2820,0,1,4000.0,True
4302,0,1960671,4229,0,0,3,1,,
4303,29753,1960671,744,0,1,0,0,20000.0,False


### Split the data into old and new users

In [53]:
# Split to older and new users

train, validate, test = MVP_Ortiz.train_validate_test(df, sampled=False)
train.shape, validate.shape, test.shape

((411274, 9), (56956, 9), (46997, 9))

In [54]:
# Count the records of the first user in train
(train.user_id == 1864702).sum()

4302

In [55]:
# Count the records of the first user in train
(validate.user_id == 1864702).sum()

0

In [56]:
# Count the records of the first user in train
(test.user_id == 1864702).sum()

0

### Merge with 1st Part of the new features

In [57]:
# Add new features part one

train = MVP_Davila.sam_train_features(df_train)
validate = MVP_Davila.sam_valtest_features(train, df_validate)
test = MVP_Davila.sam_valtest_features(train, df_test)

train.shape, validate.shape, test.shape

((411517, 13), (50842, 12), (52868, 12))

### Prepare to merge with 2nd part of the new features

In [58]:
# Handle the null values

train = MVP_Shi.handle_null(train)
validate = MVP_Shi.handle_null(validate)
test = MVP_Shi.handle_null(test)

In [59]:
# Handle the inf values

train = MVP_Shi.handle_inf(train)
validate = MVP_Shi.handle_inf(validate)
test = MVP_Shi.handle_inf(test)

In [60]:
# Drop the lecture rows

train = MVP_Shi.drop_lecture_rows(train)
validate = MVP_Shi.drop_lecture_rows(validate)
test = MVP_Shi.drop_lecture_rows(test)

### Merge the 2nd part of the new features

In [61]:
# Merge the new features genereated from Shi

train = MVP_Shi.merge_with_stats_train(train)
validate = MVP_Shi.merge_with_stats_valortest(train, test)
test = MVP_Shi.merge_with_stats_valortest(train, test)

train.shape, validate.shape, test.shape

((403377, 17), (51971, 16), (51971, 16))

### Drop the redundant columns

In [62]:
train = MVP_Shi.drop_columns_train(train)
validate = MVP_Shi.drop_columns_valortest(validate)
test = MVP_Shi.drop_columns_valortest(test)

### Fill the missing values

In [63]:
train.isnull().sum(axis=0)

answered_correctly                0
prior_question_had_explanation    0
user_acc_mean                     0
user_lectures_running_total       0
avg_user_q_time                   0
mean_content_accuracy             0
mean_task_accuracy                0
mean_timestamp_accuracy           0
mean_priortime_accuracy           0
dtype: int64

In [64]:
validate.isnull().sum(axis=0)

answered_correctly                   0
prior_question_had_explanation       0
user_acc_mean                        0
user_lectures_running_total          0
avg_user_q_time                      0
mean_content_accuracy              308
mean_task_accuracy                1413
mean_timestamp_accuracy              0
mean_priortime_accuracy              0
dtype: int64

In [65]:
test.isnull().sum(axis=0)

answered_correctly                   0
prior_question_had_explanation       0
user_acc_mean                        0
user_lectures_running_total          0
avg_user_q_time                      0
mean_content_accuracy              308
mean_task_accuracy                1413
mean_timestamp_accuracy              0
mean_priortime_accuracy              0
dtype: int64

In [66]:
validate = MVP_Shi.fill_nulls(validate)
test = MVP_Shi.fill_nulls(test)

### Scale the columns

In [67]:
columns_to_scale = ['mean_timestamp_accuracy', 'mean_priortime_accuracy',
                    'user_lectures_running_total', 'avg_user_q_time']

scaler, train_scaled, validate_scaled, test_scaled = MVP_Shi.scale(train, 
                                                                   validate, 
                                                                   test, 
                                                                   columns_to_scale)

train_scaled.shape, validate_scaled.shape, test_scaled.shape

((403377, 9), (51971, 9), (51971, 9))

### Convert the boolean values to 0 and 1

In [68]:
train_scaled = MVP_Shi.boolean_to_num(train_scaled)
validate_scaled = MVP_Shi.boolean_to_num(validate_scaled)
test_scaled = MVP_Shi.boolean_to_num(test_scaled)

In [69]:
# Take a peek at the train_scaled
train_scaled.head()

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,0,0.630049,0.818182,0.682248,0.119858,0.515734,0.0,0.002404
1,0,0,0.630049,0.55,0.534988,0.119858,0.515734,0.0,0.002404
2,1,0,0.630049,0.444444,0.445216,0.119858,0.515734,0.0,0.002404
3,1,0,0.630049,0.40625,0.544008,0.119858,0.515734,0.0,0.002404
4,0,0,0.630049,0.6875,0.485282,0.119858,0.515734,0.0,0.002404


In [70]:
# Print the descriptive statistics in train_scaled
train_scaled.describe()

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
count,403377.0,403377.0,403377.0,403377.0,403377.0,403377.0,403377.0,403377.0,403377.0
mean,0.650932,0.892944,0.618786,0.650932,0.650932,0.107803,0.517965,0.096597,0.004366
std,0.476676,0.309185,0.114756,0.190522,0.092857,0.137157,0.094209,0.167786,0.016398
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.565401,0.528302,0.632065,0.01612,0.457568,0.0,0.000562
50%,1.0,1.0,0.63198,0.666667,0.65711,0.040298,0.528699,0.024,0.001482
75%,1.0,1.0,0.697571,0.793651,0.684564,0.180794,0.582036,0.112,0.003066
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Seperate the features and target

In [71]:
X_train = train_scaled.drop(columns='answered_correctly')
y_train = train_scaled['answered_correctly']

X_validate = validate_scaled.drop(columns='answered_correctly')
y_validate = validate_scaled['answered_correctly']

X_test = test_scaled.drop(columns='answered_correctly')
y_test = test_scaled['answered_correctly']

In [72]:
# Take a peek at the X_train
X_train.head(3)

Unnamed: 0,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,0,0.630049,0.818182,0.682248,0.119858,0.515734,0.0,0.002404
1,0,0.630049,0.55,0.534988,0.119858,0.515734,0.0,0.002404
2,0,0.630049,0.444444,0.445216,0.119858,0.515734,0.0,0.002404


In [73]:
# Take a peak at the y_train
y_train.head(3)

0    1
1    0
2    1
Name: answered_correctly, dtype: int64

### SelectKBest

In [74]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 4 features
f_selector = SelectKBest(f_regression, k=5)

# find the top 4 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()

f_feature

['prior_question_had_explanation',
 'user_acc_mean',
 'mean_content_accuracy',
 'mean_task_accuracy',
 'mean_timestamp_accuracy_scaled']

### Baseline AUC
- Randomly generated classes
- Most frequent class

### Use the randomly generated classes as the baseline

In [75]:
# Create the y_predicted on y_train

size = y_train.size
print(size)
y_train

403377


0         1
1         0
2         1
3         1
4         0
         ..
403372    1
403373    0
403374    0
403375    0
403376    1
Name: answered_correctly, Length: 403377, dtype: int64

In [76]:
y_baseline = np.random.randint(2, size=size)
y_baseline

array([1, 1, 1, ..., 1, 1, 1])

In [77]:
roc_auc_score(y_train, y_baseline)

0.5004770760056451

### Use the most frequent calss as the baseline

In [78]:
y_train.value_counts(normalize=True)

1    0.650932
0    0.349068
Name: answered_correctly, dtype: float64

In [79]:
y_train

0         1
1         0
2         1
3         1
4         0
         ..
403372    1
403373    0
403374    0
403375    0
403376    1
Name: answered_correctly, Length: 403377, dtype: int64

In [80]:
y_baseline2 = np.array([1]*size)
y_baseline2

array([1, 1, 1, ..., 1, 1, 1])

In [81]:
roc_auc_score(y_train, y_baseline2)

0.5

### Modeling on various classificaiton algorithms

In [82]:
# Create a list of names of the classifiers

names = ['LogisticRegression', # predict_proba: yes
         'Decision Tree', # predict_proba: yes
         'Random Forest', # predict_proba: yes
         'Ada Boost', # predict_proba: yes
         'Gradient Boost', # predict_proba: yes
         'Nearest Neighbors', # predict_proba: yes
         'Naive Bayes', # predict_proba: yes
         'Neural Net' # predict_proba: yes
        ]

In [83]:
# Create a list of classifiers

classifiers = [LogisticRegression(), 
               DecisionTreeClassifier(max_depth=3),
               RandomForestClassifier(max_depth=3),
               AdaBoostClassifier(),
               GradientBoostingClassifier(),
               KNeighborsClassifier(3),
               GaussianNB(), 
               MLPClassifier()
              ]

In [84]:
# The size between the two should be the same
len(names) == len(classifiers)

True

In [85]:
metrics = MVP_Shi.model_multiple_algos(names, classifiers, X_train, y_train, X_validate, y_validate, X_test, y_test)
metrics

Currently runnig on model LogisticRegression
Currently runnig on model Decision Tree
Currently runnig on model Random Forest
Currently runnig on model Ada Boost
Currently runnig on model Gradient Boost
Currently runnig on model Nearest Neighbors
Currently runnig on model Naive Bayes
Currently runnig on model Neural Net


Unnamed: 0,AUC score,Algo,dataset
0,0.774688,LogisticRegression,train
1,0.690636,LogisticRegression,validate
2,0.690636,LogisticRegression,test
3,0.741229,Decision Tree,train
4,0.675218,Decision Tree,validate
5,0.675218,Decision Tree,test
6,0.753854,Random Forest,train
7,0.690931,Random Forest,validate
8,0.690931,Random Forest,test
9,0.776008,Ada Boost,train


In [88]:
# Rank the accuracy on the test dataset
mask = (metrics.dataset == "test")
metrics[mask].sort_values(by="AUC score", ascending=False)

Unnamed: 0,AUC score,Algo,dataset
8,0.690931,Random Forest,test
2,0.690636,LogisticRegression,test
14,0.687884,Gradient Boost,test
23,0.687168,Neural Net,test
11,0.676711,Ada Boost,test
20,0.675848,Naive Bayes,test
5,0.675218,Decision Tree,test
17,0.626189,Nearest Neighbors,test


**Takeaways**
- The highest score on test is Random Forest.