## Data preprocessing for LightFM

In this notebook we will create and test a recommendation engine which will recommend courses based upon how mu

In [1]:
import numpy as np
import pandas as pd

# Silence all Python warnings (note: this hides every warning, not only pandas ones)
import warnings
warnings.filterwarnings('ignore')

# Import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Import the recommendation model
from python_recommender_files.recommender import recommend_course

In [2]:
# Load the three source tables and discard the "Unnamed: 0" column from each
# (presumably a row index written out by an earlier to_csv call -- TODO confirm).
df_fst_tests = pd.read_csv("csv/init_tests.csv").drop(columns="Unnamed: 0")
df_snd_tests = pd.read_csv("csv/second_tests.csv").drop(columns="Unnamed: 0")
df_personal_attributes = pd.read_csv("csv/personal_attributes.csv").drop(columns="Unnamed: 0")

In [3]:
# Create the user features.
# Start from a copy of the personal attributes so the raw frame loaded above is
# not modified when the extra columns are attached (plain assignment would only
# create a second name for the same DataFrame).
df_user_features = df_personal_attributes.copy()

# Attach every first-test column except the first one
# (presumably an identifier column -- TODO confirm against the CSV).
# Column assignment aligns rows on the index.
for col in df_fst_tests.columns[1:]:
    df_user_features[col] = df_fst_tests[col]

# Attach the course column taken from the second tests table.
df_user_features['course'] = df_snd_tests['course']

In [5]:
# Create the item features: the second tests table without its 'course' column.
# A non-mutating drop is used so df_snd_tests keeps 'course'; dropping in place
# through an alias made this cell (and the user-features cell, which reads
# df_snd_tests['course']) fail with a KeyError when re-run.
item_features = df_snd_tests.drop(columns='course')

In [6]:
# One-hot encode the user features with pandas' get_dummies so the resulting
# frame can be fed to the linear regression below.
user_features = pd.get_dummies(data=df_user_features)

# Linear Regression

In [7]:
# Before training the linear regression model, separate the data into train and
# test sets: user features are the inputs, item features are the targets.
# The fixed random_state keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    user_features.values.tolist(),
    item_features.values.tolist(),
    random_state=16,
)

In [8]:
# Instantiate the regressor with default settings and fit it on the training split.
# The fit call is left as the last expression so the cell displays the estimator.
linreg = LinearRegression()
linreg.fit(X=x_train, y=y_train)

LinearRegression()

In [9]:
# Evaluate the fitted model on the held-out test split.
# NOTE: `model` holds the predictions for x_test (not an estimator); the name is
# kept unchanged in case other cells refer to it.
model = linreg.predict(x_test)

# Root mean squared error: square root of the MSE between targets and predictions.
# Arguments follow scikit-learn's documented (y_true, y_pred) order.
RMSE = np.sqrt(mean_squared_error(y_test, model))

# Score returned by the regressor's own score() method on the test split.
score = linreg.score(x_test, y_test)

# The label now matches the quantity printed (the value is the RMSE, not the MSE).
print("Root mean squared error: "+str(RMSE))
print("Score: "+str(score))

Mean squared error: 0.21828550733818408
Score: 0.8497538795267328


In [10]:
# Ask the recommender for course suggestions for a single user.
# (The former `linreg = linreg` self-assignment was a no-op and has been removed.)

# Every distinct course name present in the user-feature table.
unique_courses = df_user_features['course'].unique()

# The encoded feature row with index label 1.
row = user_features.loc[1]

# Skill names handed to the recommender
# (presumably column names of the test tables -- TODO confirm).
skills = [
    'openness',
    'cultural_empathy',
    'openmindness',
    'adaptability',
    'flexibility',
    'emotional_stability',
    'social_initiative',
]

# Last expression of the cell, so the returned recommendations are displayed.
recommend_course(linreg, unique_courses, row, skills)

['Basics of psychology', 'Basics of psychology']