## Data preprocessing for LightFM

In this notebook we will create and test a recommendation engine which will recommend actions based upon how mu

In [128]:
import numpy as np
import pandas as pd

# we will ignore pandas warning 
import warnings
warnings.filterwarnings('ignore')


# Import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [129]:
# Load the three CSV exports; each one carries a leftover "Unnamed: 0" column that is discarded right away
df_fst_tests = pd.read_csv("csv/init_tests.csv").drop(columns="Unnamed: 0")
df_snd_tests = pd.read_csv("csv/second_tests.csv").drop(columns="Unnamed: 0")
df_personal_attributes = pd.read_csv("csv/personal_attributes.csv").drop(columns="Unnamed: 0")

In [130]:
# Create the user features: personal attributes + first-test results + the action taken.
# Start from a copy so the raw `df_personal_attributes` frame is not modified through an alias.
user_features = df_personal_attributes.copy()
# The first column of df_fst_tests is skipped (presumably an identifier column — TODO confirm).
for col in df_fst_tests.columns[1:]:
    user_features[col] = df_fst_tests[col]
user_features['action'] = df_snd_tests['action']

In [131]:
# Create the item features: the second-test results without the `action` column.
# `drop` returns a new frame here, so `df_snd_tests` keeps its `action` column and
# re-running this cell no longer raises a KeyError.
item_features = df_snd_tests.drop(columns='action')

In [132]:
# One-hot encode the non-numeric columns so the data can be fed to the linear regression
user_features = pd.get_dummies(data=user_features)

# Linear Regression

In [133]:
# Before training the linear regression, separate the data into a train and a test part
x_train, x_test, y_train, y_test = train_test_split(
    user_features.values.tolist(),
    item_features.values.tolist(),
    random_state=16,
)

In [134]:
# Instantiate the regressor, then fit it on the training split
linreg = LinearRegression()
linreg.fit(X=x_train, y=y_train)

LinearRegression()

In [175]:
# Evaluate the fitted regression on the held-out split.
# NOTE: despite its name, `model` holds the predictions for x_test, not an estimator;
# the name is kept so that any other cell reading it keeps working.
model = linreg.predict(x_test)
# mean_squared_error expects (y_true, y_pred): ground truth first, predictions second.
RMSE = np.sqrt(mean_squared_error(y_test, model))
score = linreg.score(x_test, y_test)
print("RMSE: "+str(RMSE))
print("Score: "+str(score))

RMSE: 0.1976129819412988
Score: 0.8801846938979734


In [176]:
def create_actions(row, unique_actions):
    """Return a copy of ``row`` in which every ``action_<name>`` entry is set to 0.

    A flag the user already had is cleared, and a flag is added for every
    action that was not present yet. The input ``row`` itself is left
    untouched, so the caller's data is not modified as a side effect.

    Parameters
    ----------
    row : pandas.Series or dict
        One user's features; must support ``.copy()`` and item assignment.
    unique_actions : iterable
        Action names; each one maps to the key ``f'action_{name}'``.

    Returns
    -------
    Same type as ``row``
        The copy with all listed action flags set to 0.
    """
    zeroed = row.copy()
    for col in unique_actions:
        zeroed[f'action_{col}'] = 0
    return zeroed

In [177]:
def predict_scores(row, unique_actions, linreg):
    """Predict the outcome of every candidate action for one user.

    Each action is evaluated in isolation: all ``action_<name>`` flags are
    first cleared on a private copy of ``row``, then exactly one flag is
    switched on per prediction. This keeps the result independent of the
    order of ``unique_actions`` and of any flag that was already set in
    ``row``, and leaves the caller's ``row`` unmodified.

    Parameters
    ----------
    row : pandas.Series
        One user's features, including (or to be extended with) the
        ``action_<name>`` flags.
    unique_actions : iterable
        Action names; each one maps to the key ``f'action_{name}'``.
    linreg : object
        Fitted estimator exposing ``predict``; it is called with a single-row
        list of feature values.

    Returns
    -------
    list
        One entry per action, in the order of ``unique_actions``; each entry
        is whatever ``linreg.predict`` returned for that single row.
    """
    candidate = row.copy()
    flags = [f'action_{col}' for col in unique_actions]

    # Start from a clean slate so a previously chosen action cannot leak into the predictions.
    for flag in flags:
        candidate[flag] = 0

    all_actions = []
    for flag in flags:
        candidate[flag] = 1
        all_actions.append(linreg.predict([candidate.values.tolist()]))
        candidate[flag] = 0  # switch the flag off again before trying the next action
    return all_actions

In [182]:
def find_best_actions(all_actions, unique_actions):
    """Pick the action whose predicted scores add up to the highest total.

    ``all_actions`` holds one prediction per action, in the same order as
    ``unique_actions``; only the first row of each prediction is summed.
    When several actions share the highest total, the first one is returned.
    """
    totals = []
    for prediction in all_actions:
        totals.append(sum(prediction[0]))
    winner = np.argmax(totals)
    return unique_actions[winner]

In [183]:
def worst_skill_index(row, skills):
    """Return the position within ``skills`` of the entry with the lowest value in ``row``.

    On a tie, the position of the first lowest value is returned.
    """
    return np.argmin([row[name] for name in skills])

In [184]:
def best_action_for_worst_skill(row, skills, effect_actions, unique_actions):
    """Return the action predicted to raise the user's weakest skill the most.

    The weakest skill is located with ``worst_skill_index``. For every
    prediction in ``effect_actions`` the gain is the predicted value at that
    position (first row of the prediction) minus the user's current score, and
    the action with the largest gain is returned.

    NOTE(review): the prediction is indexed by the skill's position in
    ``skills``, so this assumes the model's output columns follow the same
    order as ``skills`` — TODO confirm.
    """
    weakest = worst_skill_index(row, skills)
    current_score = row[skills[weakest]]
    gains = [prediction[0][weakest] - current_score for prediction in effect_actions]
    return unique_actions[np.argmax(gains)]

In [187]:
def recommend_action(linreg, unique_actions, row, skills):
    """Recommend two actions for one user.

    Returns a two-element list: first the action with the highest summed
    predicted scores, then the action predicted to improve the user's worst
    skill the most.
    """
    # Reset every action flag, then predict the effect of each action separately
    blank_row = create_actions(row, unique_actions)
    predicted_effects = predict_scores(blank_row, unique_actions, linreg)

    return [
        find_best_actions(predicted_effects, unique_actions),
        best_action_for_worst_skill(blank_row, skills, predicted_effects, unique_actions),
    ]

In [188]:
# Recover the action names from the one-hot columns that pd.get_dummies created for `action`.
# The previous `df['action'].unique()` relied on a frame `df` that this notebook never defines,
# so the cell failed with a NameError on a fresh kernel.
# NOTE(review): assumes only the encoded `action` column yields names starting with 'action_' — confirm.
unique_actions = [
    str(col)[len('action_'):]
    for col in user_features.columns
    if str(col).startswith('action_')
]
# Work on a copy of the user's row so the helpers cannot modify `user_features`.
row = user_features.loc[1].copy()
skills = ['openness', 'cultural_empathy', 'openmindness', 'adaptability',  'flexibility', 'emotional_stability', 
          'social_initiative']

recommend_action(linreg, unique_actions, row, skills)

['Extra-curricular Courses', 'Extra-curricular Courses']