## Vår analys fokuserar på att hitta korrelationer i våra dataset.
#### Vi undersöker hur väl man kan förutsäga om en användare kommer att svara rätt på en fråga eller inte.

In [1]:
import pandas as pd
import numpy as np

### Läs in datasetet som tillhandahålls

In [2]:
# Only the first TRAIN_ROWS rows of the training log are loaded.
TRAIN_ROWS = 10_000_000
train = pd.read_csv('./raw_data/train.csv', nrows=TRAIN_ROWS, low_memory=False)


In [3]:
# Question metadata, read in full.
questions_path = './raw_data/questions.csv'
questions = pd.read_csv(questions_path, low_memory=False)

### Optimerar minnet med hjälp av mindre datatyper

In [4]:
# Target dtype for each column that gets cast.
column_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int32',
    'prior_question_had_explanation': 'boolean',
}

# Drop every row that has a missing value, then cast all listed columns in one pass.
train = train.dropna().astype(column_dtypes)


In [5]:
# Pairwise correlation matrix over the columns of train, shown as the cell output.
train.corr()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
row_id,1.0,-0.003198,0.999924,-0.00231,,0.028774,-1.8e-05,0.005872,-0.002504,0.002279
timestamp,-0.003198,1.0,-0.003364,0.053272,,0.413843,-0.00046,0.031593,0.013781,0.167653
user_id,0.999924,-0.003364,1.0,-0.002333,,0.02845,-2.7e-05,0.005766,-0.002584,0.0023
content_id,-0.00231,0.053272,-0.002333,1.0,,0.048702,0.012908,-0.023083,0.152266,0.037467
content_type_id,,,,,,,,,,
task_container_id,0.028774,0.413843,0.02845,0.048702,,1.0,0.002253,0.063482,-0.016098,0.167548
user_answer,-1.8e-05,-0.00046,-2.7e-05,0.012908,,0.002253,1.0,0.005886,0.014257,-0.011714
answered_correctly,0.005872,0.031593,0.005766,-0.023083,,0.063482,0.005886,1.0,-0.008253,0.106901
prior_question_elapsed_time,-0.002504,0.013781,-0.002584,0.152266,,-0.016098,0.014257,-0.008253,1.0,-0.000534
prior_question_had_explanation,0.002279,0.167653,0.0023,0.037467,,0.167548,-0.011714,0.106901,-0.000534,1.0


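### Delar upp datasetet: de första 90 % av raderna (features) används för att beräkna statistik, de sista 10 % (train) behålls som träningsdata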

In [6]:
# Cut position after the first nine tenths of the rows.
split_at = int(9/10 * len(train))

# Leading rows feed the aggregate statistics; trailing rows stay as the modelling frame.
features, train = train.iloc[:split_at], train.iloc[split_at:]

### Vi börjar med att ta bort 'lectures' (answered_correctly == 0 eller 1 betyder question, answered_correctly == -1 betyder lecture).
### Längre ned grupperar vi på användar-id och beräknar (agg) medelvärde, antal, standardavvikelse (std), median och skevhet (skew).
### Sedan skapar vi motsvarande kolumner för dessa värden.

In [7]:
# Keep only rows whose answered_correctly is not the -1 marker.
is_question_row = features['answered_correctly'] != -1
train_questions_only_df = features[is_question_row]

### Vi tar bort lectures för att kunna fokusera på questions, vilket leder till att korrelationen ökar mellan timestamp och part (från 0.007 till 0.023)

### Lägger till part från questions-datasetet

In [8]:
# Look up each row's 'part' by matching content_id against question_id.
part_lookup = questions[['question_id', 'part']]
train = (
    train
    .merge(part_lookup, how='left', left_on='content_id', right_on='question_id')
    .astype({'part': 'int16'})
    # The join key from the lookup table is redundant once 'part' is attached.
    .drop(columns='question_id')
)
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part
0,9000814,101829438,196145707,8998,0,102,0,1,18000.0,True,5
1,9000815,102282621,196145707,5264,0,103,1,1,17000.0,True,5
2,9000816,102558854,196145707,5335,0,104,0,0,30000.0,True,5
3,9000817,106434960,196145707,6451,0,105,0,1,61000.0,True,5
4,9000818,106516904,196145707,9640,0,106,0,0,30000.0,True,5


In [9]:
# Correlation matrix again, now that train also carries the merged 'part' column.
train.corr()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part
row_id,1.0,-0.072921,0.998625,-0.018531,,-0.037652,0.000474,0.003389,-0.000729,0.008009,-0.006534
timestamp,-0.072921,1.0,-0.073136,0.07479,,0.423184,-9.1e-05,0.039285,-0.01048,0.147883,0.025467
user_id,0.998625,-0.073136,1.0,-0.017751,,-0.038735,0.000507,0.003053,-0.000161,0.007533,-0.005982
content_id,-0.018531,0.07479,-0.017751,1.0,,0.040718,0.01473,-0.030344,0.146809,0.032563,0.547582
content_type_id,,,,,,,,,,,
task_container_id,-0.037652,0.423184,-0.038735,0.040718,,1.0,0.001041,0.07459,-0.03069,0.158055,-0.017308
user_answer,0.000474,-9.1e-05,0.000507,0.01473,,0.001041,1.0,0.006648,0.013884,-0.01257,0.031107
answered_correctly,0.003389,0.039285,0.003053,-0.030344,,0.07459,0.006648,1.0,-0.013716,0.110825,-0.085701
prior_question_elapsed_time,-0.000729,-0.01048,-0.000161,0.146809,,-0.03069,0.013884,-0.013716,1.0,-0.006678,0.238841
prior_question_had_explanation,0.008009,0.147883,0.007533,0.032563,,0.158055,-0.01257,0.110825,-0.006678,1.0,-0.039081


In [10]:
# Keep only rows whose answered_correctly is not the -1 marker.
train_questions_only_df = features.loc[features['answered_correctly'] != -1]
grouped_by_user_df = train_questions_only_df.groupby('user_id')

# Per-user summary statistics of answer correctness; each output column is
# named directly at aggregation time.
user_answers_df = grouped_by_user_df['answered_correctly'].agg(
    mean_user_accuracy='mean',
    questions_answered='count',
    std_user_accuracy='std',
    median_user_accuracy='median',
    skew_user_accuracy='skew',
)

user_answers_df.head()

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,0.688889,45,0.468179,1.0,-0.844439
124,0.206897,29,0.412251,0.0,1.527297
2746,0.611111,18,0.501631,1.0,-0.498374
5382,0.669355,124,0.472354,1.0,-0.728823
8623,0.638889,108,0.482562,1.0,-0.586492


### Vi grupperar på content-id
### Sedan beräknar vi (agg) medelvärde, antal, standardavvikelse (std), median och skevhet (skew) per content-id.
### Sedan skapar vi motsvarande kolumner för dessa värden.

In [11]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

# Per-content summary statistics of answer correctness; each output column is
# named directly at aggregation time.
content_answers_df = grouped_by_content_df['answered_correctly'].agg(
    mean_accuracy='mean',
    question_asked='count',
    std_accuracy='std',
    median_accuracy='median',
    skew_accuracy='skew',
)

content_answers_df.head()

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.901431,629,0.29832,1.0,-2.699862
1,0.889571,652,0.313665,1.0,-2.491635
2,0.558563,4064,0.49662,1.0,-0.235963
3,0.779678,2047,0.414565,1.0,-1.350577
4,0.627244,2841,0.483623,1.0,-0.526582


### Vi tar bort data som inte längre behövs för att frigöra allokerat minne

In [12]:
# Release the feature-source frame and both groupby objects.
del features, grouped_by_user_df, grouped_by_content_df

### Vi skapar nya features och ett nytt target som vår algoritm ska jobba med.

In [13]:
# Label column the models predict.
target = 'answered_correctly'

# Per-user statistics columns.
user_stat_features = [
    'mean_user_accuracy',
    'questions_answered',
    'std_user_accuracy',
    'median_user_accuracy',
    'skew_user_accuracy',
]

# Per-content statistics columns that precede the explanation flag.
content_stat_features = [
    'mean_accuracy',
    'question_asked',
    'std_accuracy',
    'median_accuracy',
]

# Model input columns, in their original order.
features = [
    *user_stat_features,
    *content_stat_features,
    'prior_question_had_explanation',
    'skew_accuracy',
]

### Vi filtrerar bort rader där target är -1. (De är lectures)

In [14]:
# Keep only rows whose target value is not the -1 marker.
has_real_target = train[target] != -1
train = train[has_real_target]

### Vi mergar dataframes user_answers_df och content_answers_df med train.
### Det förekommer nullvärden efter vår merge, så vi fyller dem med respektive kolumns medelvärde.

In [15]:
# Attach the per-user and per-content statistics with left joins.
train = (
    train
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
)

# Fill missing values with each column's mean; the mean of the explanation
# flag is rounded first so that its fill value is a whole number.
fmean = train.mean()
explanation_column = 'prior_question_had_explanation'
fmean[explanation_column] = round(fmean[explanation_column])
train = train.fillna(fmean)

# Count of remaining missing values per column.
train.isna().sum()

row_id                            0
timestamp                         0
user_id                           0
content_id                        0
content_type_id                   0
task_container_id                 0
user_answer                       0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
part                              0
mean_user_accuracy                0
questions_answered                0
std_user_accuracy                 0
median_user_accuracy              0
skew_user_accuracy                0
mean_accuracy                     0
question_asked                    0
std_accuracy                      0
median_accuracy                   0
skew_accuracy                     0
dtype: int64

### Vi delar upp dataframen i features (X) som modellen tränas på och target (y), det vill säga 'answered_correctly' som ska förutsägas

In [16]:
# Model inputs (selected feature columns) and the label column.
X, y = train[features], train[target]

### Vi delar upp vår data i tränings- och testdata (70 % / 30 %) för våra algoritmer

In [17]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split, and every score computed from it, is the same
# on each Restart & Run All instead of changing with every execution.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

In [18]:
from sklearn.linear_model import LogisticRegression

### Vi använder oss av 'LogisticRegression' och tränar den på träningsdatan.
### Sedan skriver vi ut modellens träffsäkerhet (accuracy) för 'answered_correctly' på testdatan.

In [19]:

# Fit with default hyperparameters (fit() returns the estimator itself),
# then report the score on the held-out split.
classifier = LogisticRegression().fit(X_train, y_train.values.ravel())
logistic_score = classifier.score(X_test, y_test)
print('LogisticRegression:', logistic_score)

LogisticRegression: 0.7118416695965619


In [20]:
from sklearn.tree import DecisionTreeClassifier

### Vi använder oss av 'DecisionTreeClassifier' och tränar den på träningsdatan.
### Sedan skriver vi ut modellens träffsäkerhet (accuracy) för 'answered_correctly' på testdatan.

In [21]:
# Seed the tree: scikit-learn randomly permutes the candidate features at each
# split, so an unseeded tree (and its score) can differ between runs.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train.values.ravel())
print('Decision Tree Result:', classifier.score(X_test, y_test))

Decision Tree Result: 0.7061548281829757


In [22]:
from sklearn.svm import SVC

### Vi använder oss av 'SVC' och tränar den på träningsdatan.
### Sedan skriver vi ut modellens träffsäkerhet (accuracy) för 'answered_correctly' på testdatan.

In [None]:
# Kernel SVC fit time grows at least quadratically with the number of rows, so
# fitting on the whole training split is impractical. Train on a capped,
# seeded random subsample instead and still score on the full test split.
SVC_MAX_TRAIN_ROWS = 20_000
svc_rng = np.random.default_rng(42)
# Positional row numbers, so the selection does not depend on index labels.
svc_rows = svc_rng.choice(
    len(X_train),
    size=min(SVC_MAX_TRAIN_ROWS, len(X_train)),
    replace=False,
)

classifier = SVC()
classifier.fit(X_train.iloc[svc_rows], y_train.iloc[svc_rows].values.ravel())
print('SVC Result:', classifier.score(X_test, y_test))

In [None]:
# Preview the first rows of train as the cell output.
train.head()