In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
# style.use('fivethirtyeight')
import seaborn as sns

In [None]:
# Load only the first 10**5 (100,000) rows of the training data (see `nrows` below).
# An explicit dtype is supplied for every column named in the mapping.
folder = '../input/riiid-test-answer-prediction/'
train = pd.read_csv(folder + 'train.csv', low_memory=False, nrows=10**5,
                       dtype={'row_id': 'int64', 'timestamp': 'int64', 'user_id': 'int32', 'content_id': 'int16', 'content_type_id': 'int8',
                              'task_container_id': 'int16', 'user_answer': 'int8', 'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32',
                             'prior_question_had_explanation': 'boolean'})



In [None]:
train.info()

In [None]:
# Drop every row that contains a missing value. The intent is to remove lecture rows and each
# user's first question bundle (typically the first several questions a user sees were part of
# an onboarding diagnostic test where they did not get any feedback).
# NOTE(review): assumes those are exactly the rows holding NaN/NA values in this file -- confirm.
# NOTE: `train` is rebound here, so the unfiltered frame is no longer available afterwards.
train = train.dropna()

# (disabled) convert prior_question_had_explanation data type False,True to 0,1
# train['prior_question_had_explanation'] = train.prior_question_had_explanation.astype('int8')

# Feature engineering

## Select features

1. collect all possible features in a dataframe.
2. normalize feature values.
3. try and decide feature importance.

First, make a dataframe to hold all the values of possible features.

### By user
1. mean (answered_correctly)
2. number of questions answered
3. total time spent on APP

In [None]:
# Per-user aggregates of the target column:
#   user_sum  -> sum of answered_correctly for the user. NOTE: this is NOT the number of
#                questions answered (that would be 'count'); presumably it equals the number
#                of correct answers if the column only holds 0/1 here -- TODO confirm.
#   user_mean -> mean of answered_correctly for the user (accuracy).
user_correct = train.groupby('user_id')['answered_correctly'].agg(user_sum = 'sum', 
                user_mean = 'mean').reset_index()
# Largest timestamp seen per user, used as "total time spent on riiid APP".
# No reset_index() here, so user_id stays as the index of user_time_total.
user_time_total = train.groupby('user_id')['timestamp'].agg(user_time_total = 'max')

# merge (the 'user_id' key is a column on the left and the index name on the right)
user_df = user_correct.merge(user_time_total, on = 'user_id')

### Other features from train data
1. answered_correctly
2. prior_question_elapsed_time
3. prior_question_had_explanation

In [None]:
# only select the needed columns:
# the two join keys (user_id, content_id), the target (answered_correctly) and the two
# prior_question_* features; every other column of `train` is discarded from here on.
train = train[['user_id', 'content_id', 'answered_correctly','prior_question_elapsed_time', 'prior_question_had_explanation']]

### By question
1. mean accuracy of each question
2. number of questions answered
3. mean accuracy of question part(section of the TOEIC test)


In [None]:
# load question data (questions.csv from the same input folder as train.csv)
question = pd.read_csv(folder + 'questions.csv')

In [None]:
# Attach each question's `part` to the training rows: left join of train.content_id against
# question.question_id, after which the redundant right-hand key column is removed.
train = (
    train
    .merge(
        question[['question_id', 'part']],
        how='left',
        left_on='content_id',
        right_on='question_id',
    )
    .drop(columns='question_id')
)

In [None]:
# get question mean and part mean
#   question_mean -> mean of answered_correctly per content_id
#   question_sum  -> sum of answered_correctly per content_id (NOTE: a sum of the target,
#                    not the number of times the question was answered; that would be 'count')
#   part_mean     -> mean of answered_correctly per part
question_mean = train.groupby('content_id')['answered_correctly']\
                .agg( question_mean = 'mean', question_sum = 'sum').reset_index()
part_mean = train.groupby('part')['answered_correctly'].agg( part_mean = 'mean').reset_index()

In [None]:
# Build the feature frame: left-join the per-question aggregates and then the per-part
# aggregates onto `train`, dropping each join key once it has served its purpose.
train_df = (
    train
    .merge(question_mean, how='left', on='content_id')
    .drop(columns='content_id')
    .merge(part_mean, how='left', on='part')
    .drop(columns='part')
)

In [None]:
# Left-join the per-user aggregates onto the feature frame, then remove the user_id key
# so that only feature columns (and the target) remain.
train_df = (
    train_df
    .merge(user_df, how='left', on='user_id')
    .drop(columns='user_id')
)

In [None]:
train_df.shape # same rows as original train data

In [None]:
train_df.head()

# Normalize inputs (features)?

In [None]:
# feature_names = list(train_df.columns)[1:]
# for i in feature_names:
#     print(i)
#     train_df[i] = train_df[i] / train_df[i].std()

### Feature distributions

In [None]:
# plot histogram to show distribution of features by answered_correctly categories
def plot_histogram_answered_correctly(x,y):
    """Overlay two histograms of ``x``: one for rows where ``y == 0``, one where ``y == 1``.

    ``x`` is the feature to plot (its ``name`` attribute is used in the title) and ``y``
    is the 0/1 outcome used to split it. The figure is shown immediately; nothing is returned.
    """
    groups = (
        (0, 'answered_correctly = 0'),
        (1, 'answered_correctly = 1'),
    )
    for outcome, legend_text in groups:
        # semi-transparent bars so the two distributions stay visible where they overlap
        plt.hist(list(x[y == outcome]), alpha=0.5, label=legend_text)

    plt.title("Histogram of '{var_name}'".format(var_name=x.name))
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

In [None]:
plot_histogram_answered_correctly(train_df['question_mean'], train_df['answered_correctly'])

# Feature outliers detection
## Detect outliers
First, let's define a function to find all the possible outliers.

### IQR (inter quartile range)
Equation: InterQuartile Range = Q3 - Q1

The first quartile, denoted Q1, is the value in the data set that holds 25% of the values below it.

The third quartile, denoted Q3, is the value in the data set that holds 25% of the values above it.

Outliers and Tukey fences: Tukey's fences are one of the methods for determining outliers in a sample. This very popular method is based on the following rule:

outliers = values below $Q_1 - 1.5\,(Q_3 - Q_1)$ or above $Q_3 + 1.5\,(Q_3 - Q_1)$

In [None]:
# define a func to find outliers
def find_outliers(x):
    """Return the outliers of a pandas Series according to Tukey's fences.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25% / 75% quantiles of ``x``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    x : pandas.Series
        The values to inspect.

    Returns
    -------
    (outlier_indices, outlier_values) : tuple of lists
        Index labels of the outlying entries and the corresponding values,
        both in the order in which they appear in ``x``.
    """
    # The quartiles must come from the argument itself; the previous version read a
    # global named `data`, which only worked when the caller happened to define it.
    q1 = x.quantile(q=.25)
    q3 = x.quantile(q=.75)
    iqr = q3 - q1
    floor = q1 - 1.5 * iqr
    ceiling = q3 + 1.5 * iqr

    # One boolean mask drives both outputs, so labels and values always line up
    # (also when the index is not made of unique integers).
    is_outlier = (x < floor) | (x > ceiling)
    outlier_indices = list(x.index[is_outlier])
    outlier_values = list(x[is_outlier])

    return outlier_indices, outlier_values

In [None]:
# find outliers in each column
# try different features
# NOTE: every column of train_df is scanned, including the target `answered_correctly`.
# NOTE: a row flagged in several columns is appended once per column, so `outlier_index`
#       may contain the same index label more than once.
outlier_index = []

for c in train_df.columns:
    # `data` is a notebook-level name and is rebound on every iteration
    data = train_df[c]
    outlier_indices, outlier_values = find_outliers(data) 
    outlier_index.extend(outlier_indices)
    print('Ther are ', len(outlier_values), ' outliers in', c)

In [None]:
# interpret the outliers
# `outlier_index` holds one entry per (row, column) hit, so a row flagged in several columns
# appears several times. The first number reports all hits; the percentage is computed on
# the distinct rows affected, otherwise the share of the data is overstated (and could
# even exceed 100%).
print('Ther are a total of {} outliers in train_df, accounting for {}% of the total train_df data ({}).'\
      .format(len(outlier_index),round(100*(len(set(outlier_index))/len(train_df)),2), len(train_df)))

### Remove outliers
Now, let's remove all the outliers and get a clean data frame ready for next step.

In [None]:
# remove outliers from feature dataframe
# `outlier_index` contains index *labels* (they were collected from `x.index`), so the rows
# are dropped by label instead of being looked up positionally through `train_df.index[...]`,
# which is only correct while the index happens to be 0..n-1. Labels are de-duplicated
# first because a row can be flagged in more than one column.
cleaned_train_df = train_df.drop(index=list(set(outlier_index)))

In [None]:
# check the shape 
cleaned_train_df.shape

In [None]:
train_df.shape

# Machine Learning
## Feature selection and model building
### Split data to train and test data

In [None]:
# split data into train and test sets
from sklearn.model_selection import train_test_split

# get X, y data
# `data` is rebound here to the outlier-free frame (the same name was used as a
# per-column Series in the outlier-detection loop).
data = cleaned_train_df
X = data.drop('answered_correctly', axis=1)
y = data.answered_correctly

# Split the dataset: 30% of the rows are held out for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state = 0)

In [None]:
X.shape

### Feature selection

In [None]:
# Using every candidate feature (train_df has 9 columns including the target, so X holds 8
# features) can cause overfitting and also slows computation down,
# so use feature selection to keep only the most important features.
import sklearn.feature_selection

In [None]:
def select_features(X, X_train, y_train, k):
    """Return the names of the ``k`` best-scoring feature columns.

    Fits ``sklearn.feature_selection.SelectKBest`` (with its default score function)
    on the training split and maps the selected column positions back to names
    through ``X.columns``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose ``columns`` name the features; assumed to have the same column
        order as ``X_train``.
    X_train, y_train
        Training features and target handed to ``SelectKBest.fit``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Names of the selected columns, in column order.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of feature columns in ``X``.
    """
    # k must be checked against the number of *columns*; the previous guard compared it
    # with len(X) (the number of rows) and merely printed a message before carrying on.
    n_features = len(X.columns)
    if k > n_features:
        raise ValueError(
            'k ({}) must not exceed the number of feature columns in X ({})'.format(k, n_features)
        )

    selector = sklearn.feature_selection.SelectKBest(k=k)
    selector.fit(X_train, y_train)
    indices_selected = selector.get_support(indices=True)
    colnames_selected = [X.columns[i] for i in indices_selected]
    return colnames_selected

In [None]:
colnames_selected = select_features(X, X_train, y_train, 3)

In [None]:
colnames_selected

In [None]:
# use the selected feature to fit model
X_train_selected = X_train[colnames_selected]
X_test_selected = X_test[colnames_selected]

### Build model using cleaned data (without outliers)

In [None]:
# Perform Logistic Regression
from sklearn.linear_model import LogisticRegression

# Default hyper-parameters; fitted on the selected feature columns of the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train_selected, y_train)

# Make prediction using the model
# predict() yields class labels (not probabilities); y_pred is reused by the later
# score and confusion-matrix cells.
y_pred = log_reg.predict(X_test_selected) 

### Evaluate model
#### Evaluate Model --- ROC Curve
Now, we can plot the ROC (Receiver Operating Characteristic) Curve which displays the percentage of true positives predicted by the model as the prediction probability cutoff is lowered from 1 to 0.

The higher the AUC (area under the curve), the more accurately our model is able to predict outcomes:

In [None]:
# quickly visualize results
from sklearn import metrics
def ROC_Curve(log_model, X_test, y_test):
    """Plot the ROC curve of a fitted classifier on a test set, with the AUC in the legend.

    ``log_model`` must expose ``predict_proba``; the second probability column is used
    as the score. The figure is shown immediately and nothing is returned.
    """
    # score of each test row = second column of predict_proba
    positive_scores = log_model.predict_proba(X_test)[:, 1]
    auc = metrics.roc_auc_score(y_test, positive_scores)
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, positive_scores)

    # draw the curve; loc=4 puts the legend in the lower-right corner
    plt.plot(false_positive_rate, true_positive_rate, label="AUC="+str(auc))
    plt.legend(loc=4)
    plt.show()

In [None]:
ROC_Curve(log_reg, X_test_selected, y_test)

#### Evaluate Model --- ROC & AUC Score

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score

def find_model_perf(X_train, X_test, y_train, y_test):
    """Fit a default LogisticRegression on the training split and return its test ROC AUC.

    The score handed to ``roc_auc_score`` is the second ``predict_proba`` column
    of each test row.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    positive_scores = [row[1] for row in model.predict_proba(X_test)]
    return roc_auc_score(y_test, positive_scores)

In [None]:
auc_processed = find_model_perf(X_train_selected, X_test_selected, y_train, y_test)
print(auc_processed)

#### Evaluate Model --- Model Scores

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

def estimate_logreg_scores(y_test, y_pred):
    """Print accuracy, precision and recall of ``y_pred`` against ``y_test``.

    Each metric is printed with five decimals, preceded by a two-line report header.
    Nothing is returned.
    """
    print('\nLogistic Regression Report')
    print('\nUsing 0.3 as test size:')

    # (output template, scoring function) pairs, evaluated and printed one at a time
    report = (
        ('Accuracy = {:.5f}', accuracy_score),
        ('Precision = {:.5f}', precision_score),
        ('Recall = {:.5f}', recall_score),
    )
    for template, scorer in report:
        print(template.format(scorer(y_test, y_pred)))

In [None]:
estimate_logreg_scores(y_test, y_pred)

#### Evaluate Model --- Confusion Matrix

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2, so an
# unconditional import breaks this cell on current versions. Keep the name when it is
# available and rely on ConfusionMatrixDisplay otherwise.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None
from sklearn.metrics import ConfusionMatrixDisplay

def confusion_matrix_plot(log_model,  X_test, y_test):
    """Draw the confusion matrix of a fitted classifier on a test set (blue colour map).

    Parameters
    ----------
    log_model : fitted scikit-learn classifier.
    X_test, y_test : test features and true labels.

    Returns nothing; the plot is rendered as a side effect.
    """
    if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
        # scikit-learn >= 1.0
        ConfusionMatrixDisplay.from_estimator(log_model, X_test, y_test,
                                              cmap=plt.cm.Blues)
    else:
        # older scikit-learn releases that only ship the function-style helper
        plot_confusion_matrix(log_model, X_test, y_test,
                              cmap=plt.cm.Blues)

In [None]:
# Display the Confusion Matrix
confusion_matrix(y_test, y_pred)

In [None]:
confusion_matrix_plot(log_reg, X_test_selected, y_test)

#### Feature coefficients

In [None]:
# interpret feature coefficients 
# Put the intercept in front of the per-feature coefficients so that both fit in one table,
# labelled 'intercept' followed by the selected column names, then sort the rows from the
# largest to the smallest coefficient.
coefficients = np.hstack((log_reg.intercept_, log_reg.coef_[0]))
interpret_result = pd.DataFrame(data={'variable': ['intercept'] + list(X_train_selected.columns), 
                   'coefficient': coefficients}).sort_values('coefficient', ascending = False)
interpret_result

Note: How should the coefficients be interpreted? Do the numerical features need to be scaled or standardized first?

## Make logistic regression model with outliers

In [None]:
# get X, y data
# Same procedure as for the cleaned data, but starting from train_df (outliers kept).
# `data` is rebound once more, this time to the full feature frame.
data = train_df
X_outlier = data.drop('answered_correctly', axis=1)
y_outlier = data.answered_correctly

# Split the dataset (same test_size and random_state as the outlier-free split)
X_train_outlier, X_test_outlier, y_train_outlier, y_test_outlier = train_test_split(
       X_outlier, y_outlier, test_size=.3, random_state = 0)

In [None]:
X_outlier.shape

In [None]:
colnames_selected_outlier = select_features(X_outlier, X_train_outlier, y_train_outlier, 2)

In [None]:
X.head()

In [None]:
# colnames_selected_outlier

In [None]:
# NOTE: this hand-picked list of five columns replaces whatever select_features(...) assigned
# to colnames_selected_outlier in the earlier cell; the model below is trained on these.
colnames_selected_outlier = ['user_mean','question_mean','question_sum','part_mean',
       'prior_question_had_explanation']

In [None]:
# use the selected feature to fit model
X_train_selected_outlier = X_train_outlier[colnames_selected_outlier]
X_test_selected_outlier = X_test_outlier[colnames_selected_outlier]

In [None]:
# Perform Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg_outlier = LogisticRegression()
log_reg_outlier.fit(X_train_selected_outlier, y_train_outlier)

# Make prediction using the model
y_pred_outlier = log_reg_outlier.predict(X_test_selected_outlier) 

In [None]:

ROC_Curve(log_reg_outlier, X_test_selected_outlier, y_test_outlier)

In [None]:
find_model_perf(X_train_selected_outlier, X_test_selected_outlier, y_train_outlier, y_test_outlier)

In [None]:
estimate_logreg_scores(y_test_outlier, y_pred_outlier)

In [None]:
# Display the Confusion Matrix
confusion_matrix(y_test_outlier, y_pred_outlier)

In [None]:
# The caption is derived from the feature list the model was actually trained on: the
# hard-coded "Three" did not match the five hand-picked columns in colnames_selected_outlier.
print('{} features with outliers'.format(len(colnames_selected_outlier)))
confusion_matrix_plot(log_reg_outlier, X_test_selected_outlier, y_test_outlier)
