## Data preprocessing for LightFM

In [1]:
import numpy as np
import pandas as pd

# all lightfm imports 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# imports re for text cleaning 
import re
from datetime import datetime, timedelta

# silence all Python warnings (not only pandas') to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')

from main import *

from scipy.sparse import csr_matrix



In [2]:
# Load the three raw CSV files in one pass. Every file carries an
# "Unnamed: 0" column (presumably a saved row index — confirm) that is
# dropped immediately after reading.
df_fst_tests, df_snd_tests, df_personal_attributes = (
    pd.read_csv(csv_path).drop(columns="Unnamed: 0")
    for csv_path in (
        "data/init_tests.csv",
        "data/second_tests.csv",
        "data/personal_attributes.csv",
    )
)

In [3]:
# Build the combined frame: personal attributes + per-skill score change
# (second test minus first test) + the action taken.
#
# Work on a copy: a plain assignment would only alias df_personal_attributes,
# so the diff_*/action columns would silently be added to the raw attributes
# frame as well (hidden state across cells).
df_all = df_personal_attributes.copy()

# The first column of the test frames is skipped (presumably the student id —
# confirm); every remaining column is treated as a skill score.
# NOTE(review): the subtraction aligns rows by index, so this assumes all
# three frames list students in the same order.
skill_cols = df_fst_tests.columns[1:]
for col in skill_cols:
    df_all[f'diff_{col}'] = df_snd_tests[col] - df_fst_tests[col]

df_all['action'] = df_snd_tests['action']

In [4]:
# Show the combined frame: personal attributes, the diff_* columns and 'action'.
df_all

Unnamed: 0,student_id,gender,age,prior_education,course_of_studies,year_of_study,nationality,parent_occupation,previous_international_experience,languages,diff_openness,diff_cultural_empathy,diff_openmindness,diff_adaptability,diff_flexibility,diff_emotional_stability,diff_social_initiative,action
0,0,Male,18,Gymnasium,International business,0,Chinese,Doctor,Exchange student,"['English', 'Mandarin']",-0.009194,-0.041855,0.236599,0.472443,0.143547,-0.076754,-0.038781,Language courses
1,1,I would rather not say,20,VWO,International business,1,South-African,Doctor,Internship in a foreign country,"['Dutch', 'English']",0.340767,0.439234,0.545912,0.901928,0.401027,0.721604,0.356028,Extra-curricular Courses
2,2,I would rather not say,19,HAVO,International business,2,USA,Taxi driver,Gap year,['English'],0.005678,-0.013036,0.013779,0.062889,-0.154355,-0.154360,0.137602,Blogging
3,3,Female,24,HAVO,International business,3,Japanese,Doctor,Worked in a foreign country,"['English', 'Japanese']",0.145423,0.290494,0.443662,0.436655,0.172129,0.711872,0.322262,Extra-curricular Courses
4,4,I would rather not say,17,Gymnasium,International business,1,Belgium,,Internship in a foreign country,"['English', 'French']",0.532262,0.351150,0.285170,0.424398,0.974405,0.710348,0.408109,IVC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,I would rather not say,18,HAVO,International business,3,German,Gardener,Gap year,"['English', 'German']",0.162114,0.232295,0.066447,0.088428,0.680674,0.234471,0.296292,IVC
996,996,I would rather not say,19,Gymnasium,International business,0,Chinese,Life coach,Exchange student,"['English', 'Mandarin']",-0.078027,-0.069645,0.097114,0.207007,-0.041772,-0.200032,-0.108070,Language courses
997,997,I would rather not say,21,VWO,International business,1,Dutch,Doctor,Exchange student,"['Dutch', 'English', 'German']",0.111530,0.081480,0.138906,0.113138,-0.031199,0.117021,0.115254,CCCC
998,998,I would rather not say,20,HAVO,International business,2,Japanese,Taxi driver,Internship in a foreign country,"['English', 'Japanese']",0.389337,0.291267,0.233677,0.427105,0.349768,0.614151,0.300658,CCCC


## LightFM

In [6]:
# Hyperparameters for the LightFM recommender, gathered in one place so they
# are easy to inspect and tune. random_state is fixed for reproducible runs.
model_hyperparams = dict(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2019,
)

# Instantiate the (still untrained) model with the settings above.
model = LightFM(**model_hyperparams)