In [7]:
# I import all the packages I need
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import tkinter as tk
from tkinter import messagebox
from tkinter import ttk

In [23]:
# I upload the dataset related to the ratings that contains the rating (out of 5) by each user for some movies

ratings= pd.read_csv('https://raw.githubusercontent.com/ValentinaCoppi/IT-coding-project/main/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [24]:
# I upload the dataset that contains the title of the movies and their genre

movies= pd.read_csv('https://raw.githubusercontent.com/ValentinaCoppi/IT-coding-project/main/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


My aim is to create a collaborative filtering based on the cosine distance between the users, in order to fill in the gaps of the rating for the unseen movies, and propose the user the best movies he may like.

In [10]:
# I calculate the mean rating for each user
def meanbyuser(user):
    return ratings.groupby('userId')[['rating']].mean().rating[user]

In [11]:
mean_rating= ratings.groupby('userId')[['rating']].mean()

In [12]:
#I change the names at the original dataset and the userId column

x = ratings.copy()
y = ratings['userId']


In [13]:
# I need to transform the training dataset in a matrix that has users has row index and movie as column index, and fill it in with the ratings

df_ratings = x.pivot(index='userId', columns= 'movieId', values='rating')

In [14]:
# I have a look at the dataset
df_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [15]:
# I create a copy of the dataset, in order to modify it without causing problems
df_ratings0= df_ratings.copy()

In [16]:
# I want to normalize all the ratings, subtracting the mean value for each user, in order to make the ratings more fair

for i in range(1,len(mean_rating['rating']+1)):
    df_ratings0.loc[i,:]=np.array(df_ratings0.loc[i,:])-mean_rating['rating'][i] 


In [17]:
# This is the new dataframe with the normalized ratings
df_ratings0

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


In [18]:
# I want that NaN are filled with 0, or I can't work on the matrix
# For doing that I create a copy of df_ratings0 

df_ratings1 = df_ratings0.copy().fillna(0)
df_ratings1.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,-0.366379,0.0,0.0,-0.366379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# I calculate the cosine similarity of the users and I create a dataframe that contains it

similarity_matrix = cosine_similarity(df_ratings1, df_ratings1)
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=df_ratings.index, columns=df_ratings.index)

In [20]:
# Let's have a look at the similarity matrix
similarity_matrix_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.0062,0.047013,0.01951,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.019612
2,0.001265,1.0,0.0,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.0,0.003012,...,-0.050551,-0.031581,-0.001688,0.0,0.0,0.006226,-0.020504,-0.006001,-0.060091,-0.004277
3,0.000553,0.0,1.0,-0.01126,-0.031539,0.0048,0.0,-0.032471,0.0,0.0,...,-0.004904,-0.016117,0.017749,0.0,-0.001431,-0.037289,-0.007789,-0.013001,0.0,0.011649
4,0.048419,-0.017164,-0.01126,1.0,-0.02962,0.013956,0.058091,0.002065,-0.005874,0.05159,...,-0.037687,0.063122,0.02764,-0.013782,0.040037,0.02059,0.014628,-0.037569,-0.017884,0.003355
5,0.021847,0.021796,-0.031539,-0.02962,1.0,0.009111,0.010117,-0.012284,0.0,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.003841


Now, all the main elements have been created, so I can write the main function of the program, the one that actually recommand the movies.
The recommandation system is user based, so the funtion requires to isert one of the UserId and will give back both the liste of the movies seen by the User chosen and the first 20 movies per possible rating, basing on similar users ratings.

In [21]:
def Recommand(picked_userid):   
    # I remove picked UserId from the candidate list in the similarity matrix previously created
    similarity_matrix_df.drop(index=picked_userid, inplace=False)
   
    # I decide that the number of similar user that I want consider is 10
    n = 10
    # Anyway I don't want users that are not similar at all, so I put a User similarity threashold
    user_similarity_threshold = 0.09
    # Here, I get top n similar users that exceed the threshold
    similar_users = similarity_matrix_df[similarity_matrix_df[picked_userid]>user_similarity_threshold][picked_userid].sort_values(ascending=False)[:n]

    #I make a dataframe with the movies that the target user has watched (I actually make two, one with the normalized data and one with the real one)
    picked_userid_watched = df_ratings0[df_ratings0.index == picked_userid].dropna(axis=1, how='all')
    picked_userid_watched1 = df_ratings[df_ratings.index == picked_userid].dropna(axis=1, how='all')
    #I make a dataframe with the movies that similar users watched and I remove movies that none of the similar users have watched (drop the columns with all NA)
    similar_user_movies = df_ratings0[df_ratings0.index.isin(similar_users.index)].dropna(axis=1, how='all')
    #Now, I remove the watched movie by the chose User from the movie list. 
    #In this way I got a dataframe with all the movies that the picked user did't watch, but all the similar users did 
    similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
    
    #I create a dictionary to store item scores
    item_score = {}
    #I loop through movies
    for i in similar_user_movies.columns:
      # I get the ratings for movie i
        movie_rating = similar_user_movies[i]
      # I create a variable to store the score
        total = 0
      # I create a variable to store the number of scores
        count = 0
      # I loop through similar users
        for u in similar_users.index:
        # If the movie has rating
            if pd.isna(movie_rating[u]) == False:
          # Score is the sum of user similarity score multiply by the movie rating, becuase I want that more similar users weight more in the movies score
              score = similar_users[u] * movie_rating[u]
          # I add the score to the total score for the movie so far
              total += score
          # I increase the count
              count +=1
      # Now, I get the average score for the item and I add back the mean score for the user picked
        item_score[i] = total / count+ meanbyuser(picked_userid)
    # I convert the dictionary to pandas dataframe and I format the avarage rate with 2 decimal digits
    item_score = pd.DataFrame(item_score.items(), columns=['movieId', 'movie_score'])
    item_score['movie_score'] = item_score['movie_score'].apply(lambda x: float("{:.2f}".format(x)))
    # I sort the movies by score
    ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)
    # I create a dataframe with the movieId, the estimated rate, the movie title, and the movie genre that is the recommandation datatset
    best_movies= pd.merge( ranked_item_score, movies, on= 'movieId')
    # I creeate a dataframe with movieId, rate, movie title and genre, that is the the datafram with the movies watched
    picked_userid_watched1= pd.merge(picked_userid_watched1.T, movies, on= 'movieId')
    # I sort the previous dataset by rate
    picked_userid_watched2=picked_userid_watched1.sort_values(by=[picked_userid], ascending=False)
    # I rename one of the columns because it create some problems in the exection, since it wasn't a string
    picked_userid_watched2.rename(columns = {picked_userid:'Rate'}, inplace = True)
    # I select top m movies: I decided to recommand just 20 movies
    m = 20
    # I write the first 20 recommanded movies in an excel file and even the first 20 watche dmovies by rate in another excel file (I decided to take 20 even in this case, just to get the preferred by the user)
    best_movies.head(m).to_excel("output.xlsx")
    picked_userid_watched2.head(m).to_excel('watched.xlsx')
    # I ask to return them in order to have a look when I just call the function
    return best_movies.head(m)
    return picked_userid_watched2.head(m)

I decided to create a simple GUI in which you can insert the UserId and it has two buttons: one have a look at the movies the user liked the most (in order that you can make an idea of the user preferences) and the other shows you the recommanded movies for that User.
If you insert a non valid userId a messagebox i shown, telling you that your userId is invalid, if you close it you can try again. 
The dataframe are shown in toplevel windows that can be closed and you ca start from the beginning without exectuing the code again.


In [22]:
# I create the main window with a title,a size and a beckground color
window = tk.Tk()
window.title("Choose UserId")
window.geometry('600x400')
window.configure(bg='#333333')

# I define the function that checks if the userId is valid or not. If it is valid it open the recommand movies window, if it's not it opens the error messagebox
def login():
    username = range(1,len(df_ratings)+1)
    try:
        if int(username_entry.get()) in username:
            open_secondary_window()
    except:
        messagebox.showinfo(title="Error", message="Invalid UserId.")
        
 # I define the function that checks if the userId is valid or not. If it is valid it open the watched movies window, if it's not it opens the error messagebox      
def look():
    username = range(1,len(df_ratings)+1)
    try:
        if int(username_entry.get()) in username:
            open_third_window()
    except:
        messagebox.showinfo(title="Error", message="Invalid UserId.")

# I create a frame with a beckground color

frame = tk.Frame(bg='#333333')

# This is the funtion that creates the secondary window with the recommandations    
def open_secondary_window():
    # Create secondary window with a title and a beckground color
    secondary_window = tk.Toplevel()
    secondary_window.title("Reccomanded movies")
    secondary_window.config(bg='#333333')
    
    # I call the Recommand function with argument the UserId I entered in the main window
    Recommand(int(username_entry.get()))
    # I write the excel file as pandas dataframe, with only the necessary columns
    df= pd.read_excel("output.xlsx", usecols= 'C, D,E')
    # eI xtract number of rows and columns from the dataframe
    n_rows = df.shape[0]
    n_cols = df.shape[1]

    # I extract columns from the data and I create a text widget with some background color
    # I didn't make a loop, but I repeated the code because I wanted to have different width of the columns
    column_names = df.columns
    i=0
    text = tk.Text(secondary_window, width=15, height=1, bg='#00CCCC', font=("Arial", 16))
    text.grid(row=i,column=0)
    text.insert(tk.INSERT, 'Possible rate')

    text = tk.Text(secondary_window, width=40, height=1, bg='#00CCCC', font=("Arial", 16))
    text.grid(row=i,column=1)
    text.insert(tk.INSERT, 'Title')
    
    text = tk.Text(secondary_window, width=40, height=1, bg='#00CCCC', font=("Arial", 16))
    text.grid(row=i,column=2)
    text.insert(tk.INSERT, 'Genre')

    # NOw, I add all the other rows into the grid, mantaining the same width as the title
    for i in range(n_rows):

        text = tk.Text(secondary_window, width=15, height=1, bg='#333333', fg="#FFFFFF", font=("Arial", 16))
        text.grid(row=i+1,column=0)
        text.insert(tk.INSERT, df.loc[i][0])

        text = tk.Text(secondary_window, width=40, height=1, bg='#333333', fg="#FFFFFF", font=("Arial", 16))
        text.grid(row=i+1,column=1)
        text.insert(tk.INSERT, df.loc[i][1])

        text = tk.Text(secondary_window, width=40, height=1, bg='#333333', fg="#FFFFFF", font=("Arial", 16))
        text.grid(row=i+1,column=2)
        text.insert(tk.INSERT, df.loc[i][2])
      
    # This command is needed because when the secondary window is open, you cannot interact with the main one
    secondary_window.grab_set() 
    
# This function is exactly the same as the previous one, with the only difference that the file opened is the watched movies one
# I do not repeat the comments, since are identical   
def open_third_window():

    secondary_window = tk.Toplevel()
    secondary_window.title("Watched movies")
    secondary_window.config(bg='#333333')
    
    Recommand(int(username_entry.get()))
    df= pd.read_excel("watched.xlsx", usecols= 'C, D,E')
    
    n_rows = df.shape[0]
    n_cols = df.shape[1]

    column_names = df.columns
    i=0
    text = tk.Text(secondary_window, width=15, height=1, bg='#00CCCC', font=("Arial", 16))
    text.grid(row=i,column=0)
    text.insert(tk.INSERT, 'Possible rate')

    text = tk.Text(secondary_window, width=40, height=1, bg='#00CCCC', font=("Arial", 16))
    text.grid(row=i,column=1)
    text.insert(tk.INSERT, 'Title')
    
    text = tk.Text(secondary_window, width=40, height=1, bg='#00CCCC', font=("Arial", 16))
    text.grid(row=i,column=2)
    text.insert(tk.INSERT, 'Genre')
    

    for i in range(n_rows):

        text = tk.Text(secondary_window, width=15, height=1, bg='#333333', fg="#FFFFFF", font=("Arial", 16))
        text.grid(row=i+1,column=0)
        text.insert(tk.INSERT, df.loc[i][0])

        text = tk.Text(secondary_window, width=40, height=1, bg='#333333', fg="#FFFFFF", font=("Arial", 16))
        text.grid(row=i+1,column=1)
        text.insert(tk.INSERT, df.loc[i][1])

        text = tk.Text(secondary_window, width=40, height=1, bg='#333333', fg="#FFFFFF", font=("Arial", 16))
        text.grid(row=i+1,column=2)
        text.insert(tk.INSERT, df.loc[i][2])
    
    
    secondary_window.grab_set()  # Modal.

# Here, I Create the widgets
# Label with the request with a certain color and with a certain font and size
login_label = tk.Label(
    frame, text="  Choose the UserId desired", bg='#333333', fg='#00CCCC', font=("Arial", 30))
# Label with the username with a certain color and with a certain font and size
username_label = tk.Label(
    frame, text="Username", bg='#333333', fg="#FFFFFF", font=("Arial", 16))
# Here you can enter the chosen UserId
username_entry= tk.Entry(frame, font=("Arial", 16))
# I create the button that call the login function
login_button = tk.Button(
    frame, text="Recommand", bg="#00CCCC", fg="#FFFFFF", font=("Arial", 16), command=login)
# I create the button that call the look funtion 
liked_button = tk.Button(
    frame, text="My movies", bg="#00CCCC", fg="#FFFFFF", font=("Arial", 16), command=look)

# I place widgets on the screen
login_label.grid(row=0, column=0, columnspan=3, sticky="news", pady=40)
username_label.grid(row=1, column=0)
username_entry.grid(row=1, column=1,columnspan=2, pady=20)
login_button.grid(row=3, column=1, columnspan=1, pady=10)
liked_button.grid(row=4, column=1, columnspan=1, pady=10)

# I show my GUI
frame.pack()

window.mainloop()

TclError: no display name and no $DISPLAY environment variable