In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
"""
Created on Mon Oct  5 22:50:40 2020

@author: André
"""
# =============================================================================
# IMPORT NECESSARY LIBRARIES
# =============================================================================

#Core libraries
import pandas as pd
import numpy as np
import math

#Data loading
from sklearn.model_selection import train_test_split

#Preprocessing
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Modeling
from sklearn.ensemble import RandomForestRegressor

#Pipeline
from sklearn.pipeline import Pipeline

#Validation
import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# =============================================================================
# DATA LOADING
# =============================================================================

#Filepath of relevant files
train_path = '../input/riiid-test-answer-prediction/train.csv'
questions_path = '../input/riiid-test-answer-prediction/questions.csv'
lectures_path = '../input/riiid-test-answer-prediction/lectures.csv'

#Load relevant files
#train.csv is too big to load whole, so only the first 100k rows are read for now
train_df = pd.read_csv(train_path, nrows = 100000).set_index('row_id')
#Keep only rows with content_type_id == 0 (presumably the question rows - the
#submission cell filters on the same condition). This is a bit of a hack.
train_df = train_df.loc[train_df['content_type_id'] == 0]
#Separate the target from the predictors
y = train_df['answered_correctly']
train_df = train_df.drop('answered_correctly', axis = 1)

questions_df = pd.read_csv(questions_path).set_index('question_id')
lectures_df = pd.read_csv(lectures_path).set_index('lecture_id')


#Create calculated dataframes and calculated columns
#users_df starts empty; rows are added when users are registered
user_features = ['user_id', 'user_rating']
users_df = pd.DataFrame(columns = user_features).set_index('user_id')
#np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
questions_df['question_rating'] = np.nan

#Train/test split (seeded so that Restart & Run All reproduces the same split)
X_train, X_valid, y_train, y_valid = train_test_split(train_df, y, random_state = 0)

In [None]:
# =============================================================================
# PREPROCESSING
# =============================================================================

# Column groups: declared once and reused by the preprocessors below
train_features = ['user_id', 'content_id', 'content_type_id']
test_features = ['row_id', 'user_id', 'content_id', 'content_type_id']
question_features = ['question_rating']
# Columns fed to the model
features = ['user_rating', 'question_rating']

# Imputers: missing identifiers become 0, missing ratings become 500
identity_transformer = SimpleImputer(strategy = 'constant', fill_value = 0)
rating_transformer = SimpleImputer(strategy = 'constant', fill_value = 500)

# One ColumnTransformer per table; each keeps only the listed columns
train_preprocessor = ColumnTransformer(
        transformers = [('id', identity_transformer, train_features)])

test_preprocessor = ColumnTransformer(
        transformers = [('id', identity_transformer, test_features)])

questions_preprocessor = ColumnTransformer(
        transformers = [('rating', rating_transformer, question_features)])

In [None]:
# =============================================================================
# FEATURE CREATING
# =============================================================================

def train_rating_model(X, y, users, questions, k_factor = 32, scale = 400):
    """Run one sequential Elo-style rating pass over the interactions in X.

    For every row of X (in order) the expected outcome is computed from the
    current user and question ratings, then both ratings are moved by
    ``k_factor`` times the difference between the real and expected outcome
    (the user gains exactly what the question loses).

    Parameters
    ----------
    X : DataFrame with 'user_id' and 'content_id' columns.
    y : Series of real outcomes, looked up by the index labels of X.
    users : DataFrame indexed by user id with a 'user_rating' column.
    questions : DataFrame indexed by content id with a 'question_rating' column.
    k_factor : size of the update step (default 32).
    scale : rating difference divisor used in the expected-outcome formula
        (default 400).

    Returns
    -------
    (users, questions) : the same two frames, which are also updated in place.

    Every user and question in X must already be present in the rating
    tables, otherwise the ``.loc`` lookup raises KeyError.
    """

    # Iterate only over the needed columns; this avoids building a Series
    # per row as iterrows does
    for index, user_id, content_id in zip(X.index, X['user_id'], X['content_id']):

        #Calculate expected outcome
        ur = users.loc[user_id, 'user_rating']
        cr = questions.loc[content_id, 'question_rating']
        ea = 1/(1 + math.pow(10, (cr - ur)/scale))

        #Update rating based on real values
        delta = k_factor*(y.loc[index] - ea)
        users.loc[user_id, 'user_rating'] = ur + delta
        questions.loc[content_id, 'question_rating'] = cr - delta

    return users, questions


def reg_new_users_and_questions(X, users, questions,
                                initial_user_rating = 500,
                                initial_question_rating = 392):
    """Add every user / question of X that has no rating yet.

    Parameters
    ----------
    X : DataFrame with 'user_id' and 'content_id' columns.
    users : DataFrame indexed by user id with a 'user_rating' column.
    questions : DataFrame indexed by content id with a 'question_rating' column.
    initial_user_rating : rating given to a newly registered user (default 500).
    initial_question_rating : rating given to a newly registered question
        (default 392).

    Returns
    -------
    (users, questions) : the rating tables with the new ids appended, in
    order of first appearance in X. A table is returned unchanged (same
    object) when nothing has to be added; the inputs are never modified.
    """

    def _append_missing(table, ids, rating_column, initial_rating):
        # dict.fromkeys de-duplicates while keeping first-appearance order
        fresh = [key for key in dict.fromkeys(ids) if key not in table.index]
        if not fresh:
            return table

        # Ratings are stored as floats because later updates are fractional.
        # Any other column of `table` is left as NaN for the new rows, so
        # nothing here needs recoding when features are added.
        new_rows = pd.DataFrame({rating_column: float(initial_rating)},
                                index = pd.Index(fresh, name = table.index.name))

        if table.empty:
            # Do not concatenate with an empty frame: its object-dtype
            # columns would leak into the result
            all_columns = table.columns.union(new_rows.columns, sort = False)
            return new_rows.reindex(columns = all_columns)

        # One concat per table instead of one per row (which was quadratic)
        return pd.concat([table, new_rows])

    users = _append_missing(users, X['user_id'], 'user_rating', initial_user_rating)
    questions = _append_missing(questions, X['content_id'], 'question_rating',
                                initial_question_rating)

    return users, questions


def update_ratings(X, y, preds, users, questions, k_factor = 32):
    """Update user and question ratings from a batch of already-made predictions.

    Works like a rating pass, but the expected outcome of each row is taken
    from ``preds`` instead of being derived from the current ratings.

    Parameters
    ----------
    X : DataFrame with 'user_id' and 'content_id' columns.
    y : Series of real outcomes, looked up by the index labels of X.
    preds : expected outcome per row of X. Either a Series (looked up by the
        index labels of X) or a positional sequence / array with one value
        per row of X, in the same order.
    users : DataFrame indexed by user id with a 'user_rating' column.
    questions : DataFrame indexed by content id with a 'question_rating' column.
    k_factor : size of the update step (default 32).

    Returns
    -------
    (users, questions) : the same two frames, which are also updated in place.

    Raises
    ------
    ValueError : if a positional ``preds`` does not have one value per row of X.
    """

    # Model outputs are positional arrays; attach the index of X so that the
    # label lookups below are correct for any index, not only 0..n-1
    if not isinstance(preds, pd.Series):
        preds = pd.Series(preds, index = X.index)

    for index, user_id, content_id in zip(X.index, X['user_id'], X['content_id']):

        ea = preds.loc[index]
        delta = k_factor*(y.loc[index] - ea)
        users.loc[user_id, 'user_rating'] += delta
        questions.loc[content_id, 'question_rating'] -= delta

    return users, questions

In [None]:
# =============================================================================
# MODELING FUNCTIONS
# =============================================================================


def predict_based_on_rating(X, scale = 400):
    """Expected outcome of each row from its user and question ratings.

    Computes ``1 / (1 + 10 ** ((question_rating - user_rating) / scale))``
    for all rows at once.

    Parameters
    ----------
    X : DataFrame with 'user_rating' and 'question_rating' columns.
    scale : rating difference divisor (default 400).

    Returns
    -------
    Series of floats carrying the index of X (empty when X is empty).
    """

    exponent = (X['question_rating'].astype(float) - X['user_rating'].astype(float))/scale

    # A huge rating gap overflows 10 ** exponent to inf, which correctly
    # yields an expected outcome of 0.0 instead of raising an error
    with np.errstate(over = 'ignore'):
        preds = 1/(1 + 10.0 ** exponent)

    return preds
    

# Seeded so that repeated runs of the notebook grow the same forest
model_1 = RandomForestRegressor(random_state = 0)

In [None]:
# =============================================================================
# TRAINING
# =============================================================================

#Preprocess data: run each table through its preprocessor and rebuild a
#labelled DataFrame from the returned array
train_values = train_preprocessor.fit_transform(X_train)
X_train = pd.DataFrame(train_values, index = X_train.index, columns = train_features)

question_values = questions_preprocessor.fit_transform(questions_df)
questions_df = pd.DataFrame(question_values, index = questions_df.index, columns = question_features)


#Construct features: register every user/question, then run the rating
#update over the training rows twice
users_df, questions_df = reg_new_users_and_questions(X_train, users_df, questions_df)
for rating_pass in range(2):
    users_df, questions_df = train_rating_model(X_train, y_train, users_df, questions_df)

#Train model on the ratings attached to each training row
X_train = X_train.join(users_df, on = 'user_id').join(questions_df, on = 'content_id')
model_1 = model_1.fit(X_train[features], y_train)

In [None]:
# =============================================================================
# SUBMISSION
# =============================================================================

# TODO: the model should be trained on the whole data before proceeding

import riiideducation
env = riiideducation.make_env()

iter_test = env.iter_test()

# The rating tables start as the ones built during training; unseen users and
# questions are appended batch after batch
users_val, questions_val = users_df, questions_df

# Previous batch; None while the first batch is being processed
last_test_df = None

# Every batch, the first one included, goes through the same steps
for (test_df, sample_prediction_df) in iter_test:

    #Get previous batch answers_correct (there is no previous batch on the first pass)
    if last_test_df is not None:
        # The first row holds a string that looks like '[1, 0, 1]'; empty
        # tokens are skipped so that '[]' does not crash int()
        raw_answers = test_df['prior_group_answers_correct'].iloc[0]
        prior_answers = pd.Series(
                [int(token) for token in raw_answers[1:-1].split(',') if token.strip()],
                index = last_test_df.index)

        #Update ratings (currently disabled)
        #users_val, questions_val = update_ratings(last_test_df, prior_answers, preds, users_val, questions_val)

    #Preprocess data
    test_df = pd.DataFrame(test_preprocessor.fit_transform(test_df), index = test_df.index, columns = test_features)

    #Register new users and questions
    users_val, questions_val = reg_new_users_and_questions(test_df, users_val, questions_val)

    #Make predictions
    test_df = test_df.join(users_val, on = 'user_id')
    test_df = test_df.join(questions_val, on = 'content_id')
    preds = model_1.predict(test_df[features])

    #submit_predictions (only rows with content_type_id == 0 are submitted)
    test_df['answered_correctly'] = preds
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

    last_test_df = test_df