# Collaborative filtering recommendations

## Preparation

In [1]:
import pandas as pd
import sklearn as sk
import json

In [2]:
#Import experiences
df_exp = pd.read_csv(r"Files/Student_experiences_sector.csv", sep =';', engine ='python', encoding='utf-8', error_bad_lines=False)

In [3]:
df_exp.head()
df_exp_map = df_exp[df_exp.columns[[0, 1, 4]]]
df_exp_map.head()

Unnamed: 0,Student ID,Activiteit,Beoordeling
0,162450,Administratie,3
1,162458,Administratie,4
2,162431,Agrarisch,3
3,162462,Analist,3
4,162469,Analist,3


In [4]:
df_exp_map.sort_values('Student ID')

Unnamed: 0,Student ID,Activiteit,Beoordeling
47,162426,Management,2
86,162426,Verpleegkunde,2
53,162426,Mediavormgeving,4
84,162429,Trainer,4
73,162429,Sportopleiding,4
...,...,...,...
64,199461,Scheepvaart,4
34,199462,Horeca,3
52,199465,Marketing & Communicatie,5
26,199468,Handhaver,4


In [5]:
df_exp_map.index.is_unique

True

In [6]:
item_ratings_table = df_exp_map.reset_index().pivot_table(index='Student ID', columns='Activiteit', values='Beoordeling ')

# Inspect the transformed table
item_ratings_table.head()

Activiteit,Administratie,Agrarisch,Analist,Architect,Atoomwetenschapper,Basisonderwijs,Chefkok,Docent,Elektrotechniek,Financieel,...,Sales,Scheepvaart,Schoonheidsspecialist,Sport & Business,Sportopleiding,Tandarts,Techniek,Toerisme,Trainer,Verpleegkunde
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,,,,,,,,,,,...,,,,,,,,,,2.0
162429,,,,,,,,,,,...,,,,,4.0,,,,4.0,
162431,,3.0,,,,,,,,,...,,,,,4.0,,,,,
162437,,,,,,,,2.0,,,...,,,,,,,,3.0,,5.0
162438,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Get the average rating for each user 
avg_ratings = item_ratings_table.mean(axis=1)

# Center each users ratings around 0
item_ratings_table_centered = item_ratings_table.sub(avg_ratings, axis=0)

# Fill in the missing data with 0s
item_ratings_table_normed = item_ratings_table_centered.fillna(0)

In [8]:
item_ratings_table_normed.head()

Activiteit,Administratie,Agrarisch,Analist,Architect,Atoomwetenschapper,Basisonderwijs,Chefkok,Docent,Elektrotechniek,Financieel,...,Sales,Scheepvaart,Schoonheidsspecialist,Sport & Business,Sportopleiding,Tandarts,Techniek,Toerisme,Trainer,Verpleegkunde
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667
162429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
162437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,0.0,1.5
162438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Item based

In [9]:
item_ratings_subset = item_ratings_table

item_ratings_table = item_ratings_subset.T

# Get the average rating for each user 
avg_ratings = item_ratings_table.mean(axis=1)

# Center each users ratings around 0
item_ratings_table_centered = item_ratings_table.sub(avg_ratings, axis=0)

# Fill in the missing data with 0s
item_ratings_table_normed = item_ratings_table_centered.fillna(0)

In [10]:
item_ratings_table_normed.head(10)

Student ID,162426,162429,162431,162437,162438,162439,162441,162442,162443,162444,...,188495,199416,199423,199455,199459,199460,199461,199462,199465,199468
Activiteit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Administratie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Agrarisch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Analist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Architect,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Atoomwetenschapper,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Basisonderwijs,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chefkok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Docent,0.0,0.0,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
Elektrotechniek,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Financieel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Assign the arrays to variables
item_1 = item_ratings_table_normed.loc['Basisonderwijs', :].values.reshape(1, -1)
item_2 = item_ratings_table_normed.loc['Verpleegkunde', :].values.reshape(1, -1)

# Find the similarity between two elements
similarity_A = cosine_similarity(item_1, item_2)
print(similarity_A)

[[0.15894353]]


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Assign the arrays to variables
item_3 = item_ratings_table_normed.loc['Mediavormgeving', :].values.reshape(1, -1)
item_4 = item_ratings_table_normed.loc['Management', :].values.reshape(1, -1)

# Find the similarity between two elements
similarity_A = cosine_similarity(item_3, item_4)
print(similarity_A)

[[-0.36927447]]


Due to the less extensive dataset, the numbers will often return a low value. It it expected that the similarity scores will become more correct, when more experiences are appended to the dataset. 

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Generate the similarity matrix
similarities = cosine_similarity(item_ratings_table_normed)

# Wrap the similarities in a DataFrame
cosine_similarity_df = pd.DataFrame(similarities, index=item_ratings_table_normed.index, columns=item_ratings_table_normed.index)

# Find the similarity values for a specific movie
cosine_similarity_series = cosine_similarity_df.loc['Verpleegkunde']

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

print(ordered_similarities)

Activiteit
Verpleegkunde               1.000000
Management                  0.220575
Horeca                      0.181221
Havo                        0.169732
Basisonderwijs              0.158944
Scheepvaart                 0.000000
Marketing & Communicatie    0.000000
Paarden                     0.000000
Sales                       0.000000
Sport & Business            0.000000
Kweker                      0.000000
Sportopleiding              0.000000
Tandarts                    0.000000
Techniek                    0.000000
Toerisme                    0.000000
Trainer                     0.000000
Maatschappelijk zorg        0.000000
Administratie               0.000000
Agrarisch                   0.000000
Fotografie                  0.000000
Analist                     0.000000
Architect                   0.000000
Atoomwetenschapper          0.000000
Chefkok                     0.000000
Elektrotechniek             0.000000
Financieel                  0.000000
Games                      

## User-User based

In [14]:
user_ratings_subset = item_ratings_subset
avg_user_ratings = user_ratings_subset.mean(axis=1)
user_ratings_table_centered = user_ratings_subset.sub(avg_user_ratings, axis=0)
user_ratings_table_normed = user_ratings_table_centered.fillna(0)
user_ratings_table_normed.head()

Activiteit,Administratie,Agrarisch,Analist,Architect,Atoomwetenschapper,Basisonderwijs,Chefkok,Docent,Elektrotechniek,Financieel,...,Sales,Scheepvaart,Schoonheidsspecialist,Sport & Business,Sportopleiding,Tandarts,Techniek,Toerisme,Trainer,Verpleegkunde
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667
162429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
162437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,0.0,1.5
162438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
similarities_user = cosine_similarity(user_ratings_table_normed)
user_similarities = pd.DataFrame(similarities_user, index= user_ratings_table_normed.index, columns=user_ratings_table_normed.index)
user_similarities.head(10)

Student ID,162426,162429,162431,162437,162438,162439,162441,162442,162443,162444,...,188495,199416,199423,199455,199459,199460,199461,199462,199465,199468
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,1.0,0.0,0.0,-0.273861,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.57735,0.0,0.0,0.0,0.288675
162429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162431,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162437,-0.273861,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.474342
162438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162441,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.436436,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.436436
162443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.436436,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
162444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
user_similarity_series = user_similarities.loc[162437]
ordered_similarities = user_similarity_series.sort_values(ascending=False)

# Find the top 10 most similar users
nearest_neighbors = ordered_similarities[1:11].index

# Extract the ratings of the neighbors
neighbor_ratings = user_ratings_table_normed.reindex(nearest_neighbors)

# Calculate the mean rating given by the users nearest neighbors
print(neighbor_ratings['Mediavormgeving'].mean())

-0.033333333333333305


In [17]:
# Drop the column you are trying to predict
users_to_ratings = user_ratings_table_normed
user_ratings_table = user_ratings_subset
users_to_ratings.drop("Mediavormgeving", axis=1, inplace=True)

# Get the data for the user you are predicting for
target_user_x = users_to_ratings.loc[[162437]]

# Get the target data from user_ratings_table
other_users_y = user_ratings_table["Mediavormgeving"]

# Get the data for only those that have seen the movie
other_users_x = users_to_ratings[other_users_y.notnull()]

# Remove those that have not seen the movie from the target
other_users_y.dropna(inplace=True)
print(other_users_y)

Student ID
162426    4.0
162445    3.0
162464    4.0
199460    2.0
Name: Mediavormgeving, dtype: float64


In [18]:
from sklearn.neighbors import KNeighborsRegressor

# Instantiate the user KNN model
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=3)

# Fit the model and predict the target user
user_knn.fit(other_users_x, other_users_y)
user_user_pred = user_knn.predict(target_user_x)

print(user_user_pred)

[3.]


### With user portfolio data

In [19]:
# Import the User portfolio

df_port = pd.read_csv(r"Files/portfolio_data_anonymized.csv", sep =',', engine ='python', encoding='utf-8', error_bad_lines=False)
df_port.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 24 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Student ID                                 50 non-null     int64  
 1   School ID                                  50 non-null     int64  
 2   Regio                                      50 non-null     int64  
 3   Leeftijd                                   50 non-null     int64  
 4   Leerjaar                                   50 non-null     int64  
 5   Leerweg                                    50 non-null     int64  
 6   Foto                                       50 non-null     int64  
 7   Plaatje                                    50 non-null     int64  
 8   Video                                      50 non-null     int64  
 9   Mijn vervolgstap / mijn vervolgkeuze is.1  47 non-null     object 
 10  Over 5 jaar zou ik het liefs

In [20]:
df_port = df_port[df_port.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 20, 21, 22, 23]]]
df_port = df_port.reset_index().pivot_table(index='Student ID')
df_port.head()

Unnamed: 0_level_0,Foto,Leeftijd,Leerjaar,Leerweg,Plaatje,Regio,School ID,Video,avg_word,char_count,index,sentiment_score,word_count
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
162426,0,15,4,3,0,7,145,1,3.956522,113,0,0,23
162429,0,16,4,3,0,7,145,1,3.75,94,1,0,20
162431,0,15,4,3,0,7,145,1,3.8625,388,2,2,80
162437,0,15,4,3,0,7,145,1,3.951613,306,3,1,62
162438,0,15,4,3,0,7,145,1,4.357143,75,4,0,15


In [23]:
user_ratings_subset = item_ratings_subset
avg_user_ratings = user_ratings_subset.mean(axis=1) # prev. user_ratings_subset
user_ratings_table_centered = user_ratings_subset.sub(avg_user_ratings, axis=0)
user_ratings_table_normed = user_ratings_table_centered.fillna(0)
user_ratings_table_normed.head()

Activiteit,Administratie,Agrarisch,Analist,Architect,Atoomwetenschapper,Basisonderwijs,Chefkok,Docent,Elektrotechniek,Financieel,...,Sales,Scheepvaart,Schoonheidsspecialist,Sport & Business,Sportopleiding,Tandarts,Techniek,Toerisme,Trainer,Verpleegkunde
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667
162429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
162437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,0.0,1.5
162438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
df_merged = pd.merge(df_port, user_ratings_table_normed, on="Student ID")
df_merged.head()

Unnamed: 0_level_0,Foto,Leeftijd,Leerjaar,Leerweg,Plaatje,Regio,School ID,Video,avg_word,char_count,...,Sales,Scheepvaart,Schoonheidsspecialist,Sport & Business,Sportopleiding,Tandarts,Techniek,Toerisme,Trainer,Verpleegkunde
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,0,15,4,3,0,7,145,1,3.956522,113,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667
162429,0,16,4,3,0,7,145,1,3.75,94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162431,0,15,4,3,0,7,145,1,3.8625,388,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
162437,0,15,4,3,0,7,145,1,3.951613,306,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,0.0,1.5
162438,0,15,4,3,0,7,145,1,4.357143,75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
similarities_user_2 = cosine_similarity(df_merged)
user_similarities_2 = pd.DataFrame(similarities_user_2, index= df_merged.index, columns=df_merged.index)
user_similarities_2.head(5)

Student ID,162426,162429,162431,162437,162438,162439,162441,162442,162443,162444,...,188495,199416,199423,199455,199459,199460,199461,199462,199465,199468
Student ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162426,1.0,0.996163,0.852838,0.893747,0.982431,0.897,0.858686,0.853919,0.997141,0.96784,...,0.833379,0.922031,0.961938,0.807932,0.878569,0.94857,0.955852,0.952414,0.88451,0.935463
162429,0.996163,1.0,0.804481,0.85164,0.994863,0.855484,0.81123,0.805739,0.99862,0.98572,...,0.783984,0.941002,0.949287,0.755259,0.837222,0.952664,0.947592,0.938892,0.84521,0.940991
162431,0.852838,0.804481,1.0,0.996403,0.741543,0.995672,0.999867,0.999689,0.821412,0.697879,...,0.994176,0.664036,0.885474,0.992305,0.985299,0.776131,0.855063,0.883906,0.97906,0.758028
162437,0.893747,0.85164,0.996403,1.0,0.795481,0.999897,0.997287,0.996437,0.866478,0.755772,...,0.988769,0.720476,0.916773,0.983061,0.988654,0.821023,0.889885,0.91401,0.984373,0.803392
162438,0.982431,0.994863,0.741543,0.795481,1.0,0.800062,0.74936,0.743361,0.991144,0.997537,...,0.721297,0.957075,0.928339,0.689146,0.783777,0.951693,0.932397,0.917463,0.794265,0.942035


In [26]:
user_similarity_series_2 = user_similarities_2.loc[162426]
ordered_similarities_2 = user_similarity_series_2.sort_values(ascending=False)

# Find the top 10 most similar users
nearest_neighbors_2 = ordered_similarities_2[1:11].index

# Extract the ratings of the neighbors
neighbor_ratings_2 = user_ratings_table_normed.reindex(nearest_neighbors_2)

# Calculate the mean rating given by the users nearest neighbors
print(neighbor_ratings_2['Mediavormgeving'].mean())

-0.033333333333333305


In [27]:
from sklearn.neighbors import KNeighborsRegressor

# Drop the column you are trying to predict
users_to_ratings_2 = user_ratings_table_normed
user_ratings_table_2 = user_ratings_subset
users_to_ratings_2.drop("Verpleegkunde", axis=1, inplace=True)

# Get the data for the user you are predicting for
target_user_x_2 = users_to_ratings_2.loc[[162437]]

# Get the target data from user_ratings_table
other_users_y_2 = user_ratings_table_2["Verpleegkunde"]

# Get the data for only those that have seen the movie
other_users_x_2 = users_to_ratings_2[other_users_y_2.notnull()]

# Remove those that have not seen the movie from the target
other_users_y_2.dropna(inplace=True)

# Instantiate the user KNN model
user_knn_2 = KNeighborsRegressor(metric='cosine', n_neighbors=3)

# Fit the model and predict the target user
user_knn_2.fit(other_users_x_2, other_users_y_2)
user_user_pred_2 = user_knn_2.predict(target_user_x_2)

print(user_user_pred_2)

[3.83333333]


In [29]:
# Drop the column you are trying to predict
users_to_ratings_2 = user_ratings_table_normed
user_ratings_table_2 = user_ratings_subset
users_to_ratings_2.drop("Mediavormgeving", axis=1, inplace=True)

# Get the data for the user you are predicting for
target_user_x_2 = users_to_ratings_2.loc[[162437]]

# Get the target data from user_ratings_table
other_users_y_2 = user_ratings_table_2["Mediavormgeving"]

# Get the data for only those that have seen the movie
other_users_x_2 = users_to_ratings_2[other_users_y_2.notnull()]

# Remove those that have not seen the movie from the target
other_users_y_2.dropna(inplace=True)
print(other_users_y_2)

Student ID
162426    4.0
162445    3.0
162464    4.0
199460    2.0
Name: Mediavormgeving, dtype: float64


In [30]:
from sklearn.neighbors import KNeighborsRegressor

# Instantiate the user KNN model
user_knn_2 = KNeighborsRegressor(metric='cosine', n_neighbors=3)

# Fit the model and predict the target user
user_knn_2.fit(other_users_x_2, other_users_y_2)
user_user_pred_2 = user_knn_2.predict(target_user_x_2)

print(user_user_pred_2)

[3.66666667]
