In [1]:
import pandas as pd
import numpy as np

### Läser in datasetet som tillhandahålls (endast de första 2000 raderna)

In [2]:
# Read only the first 2000 rows of the training file; low_memory=False makes
# pandas parse the file in one pass instead of in chunks.
train = pd.read_csv('./raw_data/train.csv', nrows=2000, low_memory=False)

### Tar bort rader med saknade värden och minskar minnesanvändningen med hjälp av mindre datatyper

In [3]:
# Drop every row that has a missing value, then cast all listed columns in a
# single astype call instead of one assignment per column.
train = train.dropna().astype({
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int32',
    'prior_question_had_explanation': 'boolean',
})

In [4]:
# Print column dtypes, non-null counts and memory usage after the casts above.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1969 entries, 1 to 1999
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   row_id                          1969 non-null   int64  
 1   timestamp                       1969 non-null   int64  
 2   user_id                         1969 non-null   int32  
 3   content_id                      1969 non-null   int16  
 4   content_type_id                 1969 non-null   int64  
 5   task_container_id               1969 non-null   int16  
 6   user_answer                     1969 non-null   int8   
 7   answered_correctly              1969 non-null   int32  
 8   prior_question_elapsed_time     1969 non-null   float64
 9   prior_question_had_explanation  1969 non-null   boolean
dtypes: boolean(1), float64(1), int16(2), int32(2), int64(3), int8(1)
memory usage: 105.8 KB


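### Delar upp datasetet: de första 90 % av raderna används för att beräkna features, de sista 10 % används för träning och test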

In [5]:
# First 90% of the rows feed the aggregate statistics; the remaining 10% become
# the rows the models are trained and evaluated on.
split_at = int(9/10 * len(train))
features, train = train.iloc[:split_at], train.iloc[split_at:]

### Vi börjar med att ta bort 'lectures' (answered_correctly == 0 eller 1 betyder question, answered_correctly == -1 betyder lecture).
### Sedan grupperar vi på användar-id och beräknar (agg) medelvärde, antal, standardavvikelse, median och skevhet.
### Sedan skapar vi motsvarande kolumner för dessa värden.

In [6]:
# Keep question rows only (answered_correctly == -1 marks a lecture).
train_questions_only_df = features.loc[features['answered_correctly'] != -1]
grouped_by_user_df = train_questions_only_df.groupby('user_id')

# One row per user_id; named aggregation yields the final column names directly.
user_answers_df = grouped_by_user_df['answered_correctly'].agg(
    mean_user_accuracy='mean',
    questions_answered='count',
    std_user_accuracy='std',
    median_user_accuracy='median',
    skew_user_accuracy='skew',
)

user_answers_df

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.688889,45,0.468179,1,-0.844439
124,0.206897,29,0.412251,0,1.527297
2746,0.611111,18,0.501631,1,-0.498374
5382,0.669355,124,0.472354,1,-0.728823
8623,0.638889,108,0.482562,1,-0.586492
8701,0.5625,16,0.512348,1,-0.278829
12741,0.575758,264,0.495166,1,-0.308324
13134,0.704623,1168,0.456407,1,-0.898208


### Vi grupperar på content-id
### och beräknar (agg) medelvärde, antal, standardavvikelse, median och skevhet för varje content-id.
### Sedan skapar vi motsvarande kolumner för dessa värden.

In [7]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

# One row per content_id; std and skew are NaN for contents with too few
# observations, as the displayed table shows.
content_answers_df = grouped_by_content_df['answered_correctly'].agg(
    mean_accuracy='mean',
    question_asked='count',
    std_accuracy='std',
    median_accuracy='median',
    skew_accuracy='skew',
)

content_answers_df

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.0,1,,0.0,
4,0.0,1,,0.0,
5,0.0,1,,0.0,
6,1.0,1,,1.0,
9,0.0,2,0.0,0.0,
...,...,...,...,...,...
13503,0.0,1,,0.0,
13505,1.0,1,,1.0,
13510,1.0,1,,1.0,
13519,1.0,1,,1.0,


### Vi frigör minne genom att ta bort variabler som inte längre behövs

In [8]:
# Drop the names that are no longer needed; `features` is rebound to the
# feature-name list in the next cell.
del features, grouped_by_user_df, grouped_by_content_df

### Vi definierar vilka kolumner som är features och vilken kolumn som är target för våra algoritmer.

In [9]:
# Column names used as model inputs. The order is kept exactly as before
# because it determines the column order of X.
features = [
    # per-user statistics
    'mean_user_accuracy', 'questions_answered', 'std_user_accuracy',
    'median_user_accuracy', 'skew_user_accuracy',
    # per-content statistics
    'mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy',
    # taken directly from the raw rows
    'prior_question_had_explanation',
    # remaining per-content statistic
    'skew_accuracy',
]

# Column the classifiers learn to predict.
target = 'answered_correctly'

### Vi filtrerar bort rader där target är -1 (sådana rader är lectures).

In [10]:
# Keep only rows whose target is not -1 (the lecture marker).
train = train.loc[train[target].ne(-1)]

### Vi mergar user_answers_df och content_answers_df med train.
### Det förekommer nullvärden efter vår merge, så vi fyller dem med respektive kolumns medelvärde.

In [11]:
# Attach the per-user and per-content statistics; left joins keep every row of
# train even when its user_id or content_id has no statistics.
train = (
    train
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
)

# Fill the gaps left by the joins with each column's mean. The flag column's
# mean is rounded so that its fill value is a whole number.
fmean = train.mean()
fmean['prior_question_had_explanation'] = round(fmean['prior_question_had_explanation'])
train = train.fillna(fmean)

# Remaining missing values per column.
train.isna().sum()

row_id                            0
timestamp                         0
user_id                           0
content_id                        0
content_type_id                   0
task_container_id                 0
user_answer                       0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
skew_accuracy                     0
dtype: int64

### Vi delar upp dataframen i features (X) som modellen ska bearbeta och target (y), det vill säga 'answered_correctly', som modellen ska förutsäga.

In [12]:
# Model inputs (feature columns) and labels (target column).
X, y = train[features], train[target]

### Vi delar upp vår data i tränings- och testmängd (70/30) för våra algoritmer

In [13]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split. The seed is fixed so that the split, and therefore
# every score computed from it, is the same after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [14]:
from sklearn.linear_model import LogisticRegression

### Vi använder oss av 'LogisticRegression' och tränar den på träningsdatan.
### Sedan skriver vi ut modellens träffsäkerhet (accuracy) för 'answered_correctly' på testdatan.

In [15]:

# Fit a logistic regression with default settings, then report its mean
# accuracy on the held-out rows.
classifier = LogisticRegression()
classifier.fit(X_train, y_train.to_numpy().ravel())
print('LogisticRegression:', classifier.score(X_test, y_test))

LogisticRegression: 0.6666666666666666


In [16]:
from sklearn.tree import DecisionTreeClassifier

### Vi använder oss av 'DecisionTreeClassifier' och tränar den på träningsdatan.
### Sedan skriver vi ut modellens träffsäkerhet (accuracy) för 'answered_correctly' på testdatan.

In [17]:
# scikit-learn permutes the features at every split, so an unseeded tree can
# differ between runs; a fixed random_state makes the reported score repeatable.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train.values.ravel())
print('Decision Tree Result:', classifier.score(X_test, y_test))

Decision Tree Result: 0.6666666666666666


In [18]:
from sklearn.svm import SVC

### Vi använder oss av 'SVC' och tränar den på träningsdatan.
### Sedan skriver vi ut modellens träffsäkerhet (accuracy) för 'answered_correctly' på testdatan.

In [19]:
# Fit a support vector classifier with default settings (fit returns the
# estimator itself) and report its mean accuracy on the held-out rows.
classifier = SVC().fit(X_train, y_train.to_numpy().ravel())
print('SVC Result:', classifier.score(X_test, y_test))

SVC Result: 0.6666666666666666


In [24]:
# Preview the first rows of train after the merges and the mean fill.
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
0,1800,17508330877,13134,3340,0,980,3,1,33000.0,True,0.704623,1168.0,0.456407,1.0,-0.898208,0.392361,1.625,0.327602,0.385417,0.216506
1,1801,17515647346,13134,10666,0,981,2,1,28666.0,True,0.704623,1168.0,0.456407,1.0,-0.898208,0.392361,1.625,0.327602,0.385417,0.216506
2,1802,17515688927,13134,10512,0,982,1,0,24000.0,True,0.704623,1168.0,0.456407,1.0,-0.898208,0.392361,1.625,0.327602,0.385417,0.216506
3,1803,17515722537,13134,12655,0,983,2,1,30000.0,True,0.704623,1168.0,0.456407,1.0,-0.898208,0.392361,1.625,0.327602,0.385417,0.216506
4,1804,17515753849,13134,10534,0,984,2,1,24000.0,True,0.704623,1168.0,0.456407,1.0,-0.898208,0.392361,1.625,0.327602,0.385417,0.216506


## QUESTIONS

In [25]:
# Load the question metadata file into its own frame.
questions = pd.read_csv('./raw_data/questions.csv')

In [28]:
# Preview the first rows of the question metadata.
questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
