In [191]:
import pandas as pd
import numpy as np
import plotly.express as px # data visualisation
from sklearn.model_selection import train_test_split


# Import the dataset MovieLens

In [13]:
def clean_movie_title(movie_title):
    """
    Cleans up a movie title string by removing unnecessary characters and formatting.
    
    This function performs several operations on the input movie title:
    
    1. Removes the year enclosed in parentheses at the end of the title if present.
    2. Corrects the capitalization of articles ('The', 'A') when they precede the main title.
    3. Converts the entire title to lowercase to ensure consistency in case sensitivity.
    
    Parameters:
    - movie_title (str): The original movie title string that needs cleaning.
    
    Returns:
    - str: The cleaned-up movie title string.
    """
    if movie_title.split(" ")[-1].startswith("("):
        # remove year from the title, e.g. Toy Story (1995) --> Toy Story
        movie_title = (" ".join(movie_title.split(" ")[:-1])).strip()

    if movie_title.title().split(',')[-1].strip() in ['The', 'A']:
        # article + movie title, e.g. Saint, The --> The Saint
        movie_title = (movie_title.title().split(',')[-1].strip() + " " + " ".join(movie_title.title().split(',')[:-1])).strip()

    # otherwise, it was converting The Devil's Advocate to The Devil'S Advocate
    movie_title = movie_title.lower()
    return movie_title

In [66]:
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('data/ml-1m/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('data/ml-1m/ratings.dat', sep='::',
 header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/ml-1m/movies.dat', sep='::',
 header=None, names=mnames, engine='python', encoding='latin-1')

movies["release"] = movies["title"].apply(lambda row: row.split("(")[-1].strip(")"))
movies["title"] = movies["title"].apply(lambda row : clean_movie_title(row))

In [68]:
print(movies)

Unnamed: 0,movie_id,title,genres,release
0,1,toy story,Animation|Children's|Comedy,1995
1,2,jumanji,Adventure|Children's|Fantasy,1995
2,3,grumpier old men,Comedy|Romance,1995
3,4,waiting to exhale,Comedy|Drama,1995
4,5,father of the bride part ii,Comedy,1995


In [67]:
data = pd.merge(pd.merge(ratings, users), movies)

In [22]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,one flew over the cuckoo's nest,Drama
1,2,1193,5,978298413,M,56,16,70072,one flew over the cuckoo's nest,Drama
2,12,1193,4,978220179,M,25,12,32793,one flew over the cuckoo's nest,Drama
3,15,1193,4,978199279,M,25,7,22903,one flew over the cuckoo's nest,Drama
4,17,1193,5,978158471,M,50,1,95350,one flew over the cuckoo's nest,Drama


# Import Dataset IMDb

In [69]:
imdb_name = pd.read_table('data/name.basics.tsv')
imdb_movies = pd.read_table('data/title.basics.tsv')

  imdb_movies = pd.read_table('data/title.basics.tsv')


In [70]:
imdb_movies = imdb_movies[imdb_movies['titleType'] == 'movie']
imdb_movies.drop('endYear',axis = 1,inplace = True)
imdb_movies.drop('titleType',axis = 1,inplace = True)
imdb_movies.drop('isAdult',axis = 1,inplace = True)
imdb_movies.drop('primaryTitle',axis = 1,inplace = True)
imdb_movies.rename(columns={"originalTitle": "title"}, inplace=True)
imdb_movies.rename(columns={"startYear": "release"}, inplace=True)
imdb_movies["title"] = imdb_movies["title"].astype(str)
imdb_movies["title"] = imdb_movies["title"].apply(lambda row : row.lower())

imdb_movies.head()

Unnamed: 0,tconst,title,release,runtimeMinutes,genres
8,tt0000009,miss jerry,1894,45,Romance
144,tt0000147,the corbett-fitzsimmons fight,1897,100,"Documentary,News,Sport"
498,tt0000502,bohemios,1905,100,\N
570,tt0000574,the story of the kelly gang,1906,70,"Action,Adventure,Biography"
587,tt0000591,l'enfant prodigue,1907,90,Drama


In [71]:
print(imdb_movies.shape)
print(data.shape)

(685520, 5)
(1000209, 11)


# Merge Datasets

In [72]:
df_movies = pd.merge(imdb_movies, movies, on=["title", "release"])

In [210]:
#df_movies.drop('genres_y',axis = 1,inplace = True)
df_movies.rename(columns={"genres_x": "genres"}, inplace=True)
df_movies.head()

Unnamed: 0,tconst,title,release,runtimeMinutes,genres,movie_id
0,tt0010418,male and female,1919,116,"Adventure,Drama",2821
1,tt0011652,the saphead,1920,77,"Comedy,Drama,Romance",3231
2,tt0012349,the kid,1921,68,"Comedy,Drama,Family",3310
3,tt0013662,tess of the storm country,1922,137,"Drama,Romance",3195
4,tt0015693,cobra,1925,70,"Drama,Romance",2777


In [211]:
df_movies.shape

(3075, 6)

# Data Visualisation

In [212]:
year = df_movies['release'].value_counts().sort_values(ascending = False).to_frame()
year.reset_index(inplace = True)
year.columns.values[0] = 'year'
year.columns.values[1] = 'num_of_films'
year = year.drop(0)
year

Unnamed: 0,year,num_of_films
1,1995,263
2,1997,253
3,1998,251
4,1999,232
5,1994,181
...,...,...
75,1928,3
76,1920,1
77,1922,1
78,1921,1


In [213]:
# ploting the data 
px.bar(year, x= 'year', y='num_of_films',color = 'num_of_films',
              labels={'year': 'Release Year', 'num_of_films': 'Total Movies'},
              title='Distribution of Movies by Release Year',template = 'presentation')

In [214]:
df_ratings = pd.merge(ratings, df_movies, on=['movie_id'])

# Model Building

In [218]:
df_train, df_test = train_test_split(df_ratings, test_size=0.2, random_state=42)

print(df_train.shape)
print(df_test.shape)

(716048, 9)
(179012, 9)


In [199]:
df_matrix_train = df_train.pivot_table(values='rating', columns='movie_id', index='user_id').fillna(0)
df_matrix_test = df_test.pivot_table(values='rating', columns='movie_id', index='user_id').fillna(0)

In [200]:
print(df_matrix_train.shape)
print(df_matrix_test.shape)

(6040, 2941)
(6034, 2825)


# Data combination for couple user

In [223]:
def combine_user(user_id, user_id2, matrix):
    combine = [0] * matrix.shape[1]
    for i in range(matrix.shape[1]):
        if matrix[user_id][i] == 0:
            combine[i] = matrix[user_id2][i]
        elif matrix[user_id2][i] == 0:
            combine[i] = matrix[user_id][i]
        else:
            combine[i] = (matrix[user_id][i] + matrix[user_id2]) / 2
    return combine

In [228]:
user_couple = combine_user(1, 2, df_matrix_train.values)
len(user_couple)

2941

# Matrix factorization recommendation

In [202]:
def als(
    user_item_matrix: np.ndarray, num_features: int, lambda_: float, iterations: int
) -> tuple:
    """
    Perform matrix factorization using Alternating Least Squares (ALS).

    Parameters
    ----------
    user_item_matrix : np.ndarray
        User-item rating matrix with users as rows and items as columns.
    num_features : int
        Number of latent features.
    lambda_ : float
        Regularization parameter.
    iterations : int
        Number of iterations to run the algorithm.

    Returns
    -------
    tuple
        Tuple of user_features and item_features matrices.
    """
    num_users, num_items = user_item_matrix.shape
    user_features = np.random.normal(
        scale=1.0 / num_features, size=(num_users, num_features)
    )
    item_features = np.random.normal(
        scale=1.0 / num_features, size=(num_items, num_features)
    )

    # Create a mask of observed entries
    observed = ~np.isnan(user_item_matrix)
    user_item_matrix = np.nan_to_num(user_item_matrix)

    # Precompute Iλ (λ times the identity matrix of dimension k x k)
    Iλ = lambda_ * np.eye(num_features)

    for _ in range(iterations):
        # Update user features
        for i in range(num_users):
            Vj = item_features[observed[i], :]  # Filter to only items rated by user i
            YTY = np.dot(Vj.T, Vj)  # Vj.T * Vj
            user_features[i] = np.linalg.solve(
                YTY + Iλ, np.dot(Vj.T, user_item_matrix[i, observed[i]])
            )

        # Update item features
        for j in range(num_items):
            Ui = user_features[
                observed[:, j], :
            ]  # Filter to only users that rated item j
            UTU = np.dot(Ui.T, Ui)  # Ui.T * Ui
            item_features[j] = np.linalg.solve(
                UTU + Iλ, np.dot(Ui.T, user_item_matrix[observed[:, j], j])
            )

    return user_features, item_features

user_features, item_features = als(df_matrix_train.values, num_features=20, lambda_=0.1, iterations=10)
print("User Features:\n", user_features)
print("Item Features:\n", item_features)

User Features:
 [[ 0.14766394  0.0551009   0.42189477 ...  0.27383483 -0.21708234
   0.38855082]
 [ 0.09093568  0.34615196 -0.3271519  ... -0.27428747 -0.15292649
   0.22284294]
 [ 0.02866594  0.16613097  0.19028363 ... -0.11904295 -0.15148862
   0.18457103]
 ...
 [-0.08450423  0.06432483  0.10649798 ... -0.0475926   0.02319975
   0.08081976]
 [-0.87980083 -0.24072826  0.34616324 ...  0.54805643  0.24940402
   0.12707459]
 [ 0.09582926  0.32561052 -1.04348941 ...  0.77080014  0.30585507
  -0.07428837]]
Item Features:
 [[ 0.19562775  1.64904839  0.27643938 ...  0.25879943  0.788741
   0.88418251]
 [ 0.11681521  0.05223485  0.06435617 ... -0.18655685  0.56171388
   0.17473464]
 [-0.19620932  0.11720268  0.22090584 ... -0.34530425  0.3221862
   0.03941749]
 ...
 [-0.00501933  0.07941186  0.01870306 ...  0.03007644  0.01158929
  -0.05586295]
 [ 0.02470086  0.0171268   0.02719511 ...  0.0413527   0.02848263
   0.02405479]
 [ 0.149952    0.47248314  0.24593368 ...  0.10367926 -0.04312796
  -

In [206]:
def generate_recommendations(
    user_features: np.ndarray,
    item_features: np.ndarray,
    user_item_matrix: np.ndarray,
    top_n: int = 5,
) -> dict:
    """
    Generate recommendations for each user.

    Parameters
    ----------
    user_features : np.ndarray
        Matrix containing the latent features for each user.
    item_features : np.ndarray
        Matrix containing the latent features for each item.
    user_item_matrix : np.ndarray
        Original matrix of users and items with missing values.
    top_n : int
        Number of recommendations to return for each user.

    Returns
    -------
    dict
        A dictionary where keys are user indices and values are lists of recommended item indices.
    """
    # Compute full matrix of predicted ratings
    predicted_ratings = np.dot(user_features, item_features.T)

    # Generate recommendations
    recommendations = {}
    for user_index in range(1, user_item_matrix.shape[0]):
        # Get top N item indices
        recommended_items = np.argsort(predicted_ratings[user_index - 1])[-top_n:][::-1]
        recommendations[user_index] = recommended_items.tolist()

    return recommendations


recommendations = generate_recommendations(
    user_features, item_features, df_matrix_train.values, top_n=5
)
print("Recommendations (user: [items]):", recommendations)

Recommendations (user: [items]): {1: [2316, 281, 0, 475, 469], 2: [91, 1481, 421, 245, 471], 3: [2125, 91, 922, 1481, 875], 4: [876, 874, 886, 906, 630], 5: [2228, 2125, 2198, 1937, 1773], 6: [1774, 2534, 2792, 274, 2316], 7: [91, 470, 379, 1918, 358], 8: [91, 2125, 1481, 1255, 245], 9: [245, 421, 1255, 473, 44], 10: [763, 672, 770, 667, 653], 11: [2125, 245, 232, 484, 1255], 12: [630, 872, 92, 484, 912], 13: [876, 874, 1481, 91, 886], 14: [2125, 2228, 2056, 1774, 1937], 15: [2056, 2663, 2342, 2125, 2793], 16: [1741, 2316, 2056, 2009, 2000], 17: [435, 676, 1155, 1918, 470], 18: [91, 875, 281, 1481, 908], 19: [1016, 876, 949, 2056, 939], 20: [91, 1481, 470, 2125, 379], 21: [2316, 1741, 0, 2055, 874], 22: [2125, 232, 875, 872, 922], 23: [1278, 435, 877, 676, 889], 24: [245, 2125, 922, 421, 875], 25: [1918, 874, 886, 470, 2823], 26: [477, 433, 468, 10, 259], 27: [665, 666, 675, 915, 661], 28: [665, 875, 245, 876, 895], 29: [630, 889, 876, 1016, 906], 30: [245, 473, 484, 2125, 232], 31: [2

# Matrix factorization Rating Prediction

In [229]:
def matrix_factorization(R, P, Q, K, steps=10, alpha=0.0002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter
    '''
    Q = Q.T

    for step in range(steps):
        print(f'step {step}')
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))

        # 0.001: local minimum
        if e < 0.001:
            break

    return P, Q.T

In [204]:
nP, nQ = matrix_factorization(df_matrix_train.values, user_features, item_features, 20)

step 0
step 1
step 2
step 3
step 4
step 5
step 6
step 7
step 8
step 9


In [205]:
nR = np.dot(nP, nQ.T)
print(nR)

[[4.21894678 2.20567027 1.24939046 ... 0.6130792  0.39590729 1.52663447]
 [2.98310849 2.52426446 2.26517352 ... 0.76308519 0.33861339 1.66322228]
 [2.81438319 1.91831974 1.57376331 ... 0.39890701 0.19117785 1.08234458]
 ...
 [1.25694956 0.5486279  0.60756261 ... 0.19632009 0.12638608 0.42093436]
 [3.25003271 1.9453285  1.70200207 ... 0.62457283 0.35222086 1.13186761]
 [3.13895777 1.45550542 0.18751011 ... 1.22208847 0.809031   1.94767856]]
