## Import important libraries

In [None]:
import riiideducation

In [2]:

import dask.dataframe as dd
import  pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import warnings
import gc
warnings.filterwarnings('ignore')
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



## Read required Files

In [3]:

train= pd.read_csv('train.csv',
                usecols=[1, 2, 3,4,7,8,9], dtype={'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8',
                                                  'prior_question_elapsed_time': 'float32','prior_question_had_explanation': 'object'}
  
                  )

In [5]:

train = train[train.content_type_id == False]
train = train.sort_values(['timestamp'], ascending=True)

train.drop(['timestamp','content_type_id'], axis=1,   inplace=True)

train.head(3)

Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,115,5692,1,,
84775752,1805962620,5547,0,,
94914466,2015251289,4024,1,,


In [7]:
results_c = train[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean','std','sum','skew'])
results_c.columns = ["content_mean","content_std","content_sum","content_skew"]

results_u = train[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean', 'sum','std','skew'])
results_u.columns = ["user_mean", 'user_sum','user_std','user_skew']

In [30]:
#reading in question df
questions_df = pd.read_csv('questions.csv',
                            usecols=[0,1, 3,4],
                            dtype={'question_id': 'int16',
                              'part': 'int8','bundle_id': 'int8','tags': 'str'}
                          )

In [31]:
#如何把一串字符或数字的列切分开
tag = questions_df["tags"].str.split(" ", n = 5, expand = True) #转化为字符串，并用空格来切分，切成6份，并且转化为DF格式
tag.columns = ['tags1','tags2','tags3','tags4','tags5','tags6']
questions_df =  pd.concat([questions_df,tag],axis=1).drop(['tags'],axis=1)
questions_df['tags1'] = pd.to_numeric(questions_df['tags1'], errors='coerce',downcast='integer').fillna(-1)
questions_df['tags2'] = pd.to_numeric(questions_df['tags2'], errors='coerce',downcast='integer').fillna(-1)
questions_df['tags3'] = pd.to_numeric(questions_df['tags3'], errors='coerce',downcast='integer').fillna(-1)
#如果有需要把后面4-6tags也drop掉
#questions_df['tags4'] = pd.to_numeric(questions_df['tags4'], errors='coerce',downcast='integer').fillna(-1)
#questions_df['tags5'] = pd.to_numeric(questions_df['tags5'], errors='coerce',downcast='integer')
#questions_df['tags6'] = pd.to_numeric(questions_df['tags6'], errors='coerce',downcast='integer')

In [32]:
questions_df.head(3)

Unnamed: 0,question_id,bundle_id,part,tags1,tags2,tags3,tags4,tags5,tags6
0,0,0,1,51.0,131.0,162.0,38.0,,
1,1,1,1,131.0,36.0,81.0,,,
2,2,2,1,131.0,101.0,162.0,92.0,,


## Sample data

In [33]:
cat_columns = ['prior_question_had_explanation','bundle_id','part','tags1','tags2','tags3']

cont_columns = ['prior_question_elapsed_time', "content_mean","content_std","content_sum","content_skew",
                "user_mean", 'user_sum','user_std','user_skew']


In [34]:

X=train.iloc[88000000:,:]
X = pd.merge(X, results_u, on=['user_id'], how="left")
X = pd.merge(X, results_c, on=['content_id'], how="left")
X = pd.merge(X, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')

X=X[X.answered_correctly!= -1 ]
X=X.sort_values(['user_id'])
X['prior_question_had_explanation']=X['prior_question_had_explanation'].fillna('False').map({"True":True,"False":False})
X['prior_question_elapsed_time'].fillna(0,inplace=True)

for col in cont_columns:
    X[col].fillna(X[col].mode(),inplace=True)

Y = X[["answered_correctly"]]
X = X.drop(["answered_correctly"], axis=1)

## Preprocessing
- label encoding
- Robust scaler

In [35]:

features=cat_columns+cont_columns

def encode(df,cols):
    enc =  {}
    for col in cols:
        print(col)
        lbencoder = LabelEncoder()
        lb = lbencoder.fit(df[col].values)
        df[col]=lb.transform(df[col].values)
        enc[col]=lb
        
    return df,enc

X,enc_dict = encode(X,cat_columns)

prior_question_had_explanation
bundle_id
part
tags1
tags2
tags3


In [39]:
scale_dict={}
fix_missing={}
for col in cont_columns:
    scaler = RobustScaler()
    scale_dict[col]=scaler.fit(X[col].values.reshape(-1,1))
    X[col] = scale_dict[col].transform(X[col].values.reshape(-1,1))
    fix_missing[col] = X[col].mode().values[0]

## Determining embedding dimension

In [40]:
cat_dims = [X[col].nunique() for col in cat_columns]
cat_embs = [(dim, min(50,(dim+1)//2)) for dim in cat_dims]

In [43]:
cat_embs

[(2, 1), (256, 50), (7, 4), (117, 50), (87, 44), (54, 27)]

## Dataset

In [44]:
class RidDataset(Dataset):
    def __init__(self, df,targets,cat_features,cont_features,mode='train'):
        self.mode = mode
        self.data_cont = df[cont_features].values
        self.data_cat = df[cat_features].values
        if mode=='train':
            self.targets = targets.values 
    
    def __len__(self):
        return len(self.data_cont)
    
    def __getitem__(self, idx):
        if self.mode == 'train':
            return torch.FloatTensor(self.data_cont[idx]),torch.LongTensor(self.data_cat[idx]),torch.FloatTensor(self.targets[idx])
        elif self.mode == 'test':
            return torch.FloatTensor(self.data_cont[idx]), torch.LongTensor(self.data_cat[idx]),0

## Model

In [45]:
class RidModel(nn.Module):
    def __init__(self,emb_dims,no_of_cont):
        super(RidModel, self).__init__()
        
        self.emb = nn.ModuleList([nn.Embedding(x,y) for x,y in emb_dims])
        
        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont
        
        
        self.batch_norm1 = nn.BatchNorm1d(self.no_of_cont)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(no_of_cont, 128))
        
        self.batch_norm2 = nn.BatchNorm1d(128+no_of_embs)
        self.dense2 = nn.utils.weight_norm(nn.Linear(128+no_of_embs, 32))
         
        self.batch_norm3 = nn.BatchNorm1d(32)
        self.dense3 = nn.utils.weight_norm(nn.Linear(32, 16))
        
        self.batch_norm4 = nn.BatchNorm1d(16)
        self.dense4 = nn.utils.weight_norm(nn.Linear(16, 1))
        
       
    def forward(self, cont,cat):
         
        ## cat data part
        x_cat = [emb_layer(cat[:,i]) for i,emb_layer in enumerate(self.emb)]
        x_cat = torch.cat(x_cat,1)#把所有类别变量的emb按行合并
        x_cat = self.dropout1(x_cat)
        ##cont data
        x = self.batch_norm1(cont)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        
        ##concat
        x = torch.cat([x,x_cat],1)#残差结构，把经过FC层的全部变量结果和类别变量embedding之后的结构合并
        
        ##rest of NN
        x = self.batch_norm2(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = F.relu(self.dense3(x))
        
        
        x = self.batch_norm4(x)
        x = F.sigmoid(self.dense4(x))
        
        return x

In [46]:
X_train,X_valid,y_train,y_valid = train_test_split(X[features],Y,test_size=0.15)

In [47]:
del X,Y,train
gc.collect()

76301

In [48]:
assert X_train.shape[0]==y_train.shape[0]
assert X_valid.shape[0]==y_valid.shape[0]


## Train

In [49]:
nepochs=5
train_set = RidDataset(X_train,y_train,cat_columns,cont_columns,mode="train")
valid_set = RidDataset(X_valid,y_valid,cat_columns,cont_columns,mode="train")
val_auc=[]
dataloaders = {'train':DataLoader(train_set,batch_size=2**15,shuffle=True),
              "val":DataLoader(valid_set,batch_size=2**15,shuffle=True)}

model = RidModel(cat_embs,len(cont_columns)).to(DEVICE)
checkpoint_path = 'rid_model.pt'
optimizer = optim.Adam(model.parameters())
#如果几次内没有下降loss，自动减少学习率
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, eps=1e-4, verbose=True) 
criterion = nn.BCELoss()
best_loss = {'train':np.inf,'val':np.inf}
auc_score = {'train':0,'val':0.0}

for epoch in range(nepochs):
            epoch_loss = {'train': 0.0, 'val': 0.0}
            
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()
                
                running_loss = 0.0
                auc=0.0
                
                for i,(x,y,z) in enumerate(dataloaders[phase]):
                    x, y, z = x.to(DEVICE), y.to(DEVICE),z.to(DEVICE)
                    optimizer.zero_grad()
                    
                    with torch.set_grad_enabled(phase=='train'):
                        preds = model(x,y)
                        loss = criterion(preds, z)
                        auc = roc_auc_score(z.detach().cpu().numpy(),preds.detach().cpu().numpy())
                        
                        if phase=='train':
                            loss.backward()
                            optimizer.step()
                    
                    running_loss += loss.item() / len(dataloaders[phase])
                    auc += auc/len(dataloaders[phase])
                
                epoch_loss[phase] = running_loss
                auc_score[phase]=auc
                
            print("Epoch {}/{}   - loss: {:5.5f}   - val_loss: {:5.5f} -- AUC {:5.4f} --val AUC {:5.4f}".format(epoch+1,
                    nepochs, epoch_loss['train'], epoch_loss['val'],auc_score['train'],auc_score['val']))
            val_auc.append(auc_score['val'])
            scheduler.step(epoch_loss['val'])
            
            if epoch_loss['val'] < best_loss['val']:
                best_loss = epoch_loss
                torch.save(model.state_dict(), checkpoint_path)
                
 



Epoch 1/5   - loss: 0.56193   - val_loss: 0.53608 -- AUC 0.7519 --val AUC 0.7654
Epoch 2/5   - loss: 0.53575   - val_loss: 0.53475 -- AUC 0.7567 --val AUC 0.7733
Epoch 3/5   - loss: 0.53492   - val_loss: 0.53466 -- AUC 0.7614 --val AUC 0.7666
Epoch 4/5   - loss: 0.53452   - val_loss: 0.53488 -- AUC 0.7555 --val AUC 0.7779
Epoch 5/5   - loss: 0.53423   - val_loss: 0.53440 -- AUC 0.7526 --val AUC 0.7721


In [50]:
print(f'Final validation AUC Score {np.mean(val_auc):5.4f}')

Final validation AUC Score 0.7711
