In [1]:
import pandas as pd
import warnings

# Suppressing all warnings
warnings.filterwarnings("ignore")


# Loading movies.dat
movies_cols = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv('movies.dat', sep='::', engine='python', header=None, names=movies_cols, encoding='latin-1')

# Loading ratings.dat
ratings_cols = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', header=None, names=ratings_cols, encoding='latin-1')

# Loading users.dat
users_cols = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
users = pd.read_csv('users.dat', sep='::', engine='python', header=None, names=users_cols, encoding='latin-1')

# Merging data
movie_ratings = pd.merge(ratings, movies, on='MovieID')
full_data = pd.merge(movie_ratings, users, on='UserID')

# Displaying the first few rows of the merged DataFrame
print(full_data.head())


   UserID  MovieID  Rating  Timestamp                                   Title  \
0       1     1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1      661       3  978302109        James and the Giant Peach (1996)   
2       1      914       3  978301968                     My Fair Lady (1964)   
3       1     3408       4  978300275                  Erin Brockovich (2000)   
4       1     2355       5  978824291                    Bug's Life, A (1998)   

                         Genres Gender  Age  Occupation Zip-code  
0                         Drama      F    1          10    48067  
1  Animation|Children's|Musical      F    1          10    48067  
2               Musical|Romance      F    1          10    48067  
3                         Drama      F    1          10    48067  
4   Animation|Children's|Comedy      F    1          10    48067  


In [2]:
full_data.tail()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code
1000204,4211,3791,2,965319075,Footloose (1984),Drama,M,45,5,77662
1000205,4211,3806,3,965319138,MacKenna's Gold (1969),Western,M,45,5,77662
1000206,4211,3840,4,965319197,Pumpkinhead (1988),Horror,M,45,5,77662
1000207,4211,3766,2,965319138,Missing in Action (1984),Action|War,M,45,5,77662
1000208,4211,3834,2,965318885,Bronco Billy (1980),Adventure|Drama|Romance,M,45,5,77662


In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming you have already loaded and merged your data into the 'full_data' DataFrame

# Dropping null values from the full_data DataFrame
full_data = full_data.dropna()

# Creating user-item matrix
user_item_matrix = full_data.pivot_table(index='UserID', columns='MovieID', values='Rating')

# Filling missing values with 0 (assuming missing values mean the user hasn't rated the movie)
user_item_matrix = user_item_matrix.fillna(0)

# Splitting the data into training and testing sets
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)

# Creating user-item matrix for training data
train_user_item_matrix = train_data.pivot_table(index='UserID', columns='MovieID', values='Rating').fillna(0)

# Calculating cosine similarity between users
user_similarity = cosine_similarity(train_user_item_matrix)

# Making predictions on the test set
def predict_ratings(user_similarity, user_item_matrix):
    mean_user_rating = user_item_matrix.mean(axis=1)
    ratings_diff = (user_item_matrix - mean_user_rating[:, np.newaxis])
    pred = mean_user_rating[:, np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
    return pred

# Predicting ratings for the test set
user_item_pred = predict_ratings(user_similarity, train_user_item_matrix)

# Flatten the predictions and true ratings for evaluation
pred_ratings = []
for index, row in test_data.iterrows():
    user_id = row['UserID']
    movie_id = row['MovieID']
    if user_id in train_user_item_matrix.index and movie_id in train_user_item_matrix.columns:
        pred_ratings.append(user_item_pred[train_user_item_matrix.index == user_id].flatten()[train_user_item_matrix.columns == movie_id].flatten()[0])
    else:
        pred_ratings.append(np.nan)

# Converting the list to a NumPy array
pred_ratings = np.array(pred_ratings)

# Removing NaN values from the true ratings and predictions
true_ratings = test_data['Rating'].values[~np.isnan(pred_ratings)]
pred_ratings = pred_ratings[~np.isnan(pred_ratings)]

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(true_ratings, pred_ratings))
print("RMSE:", rmse)


RMSE: 3.0826285317524964


In [4]:
# Calculating Mean Absolute Error (MAE)
mae = np.abs(true_ratings - pred_ratings).mean()
print("MAE:", mae)



MAE: 2.8839691920658113


In [5]:
# Dropping NaN values from the predictions and true ratings
pred_ratings_no_nan = pred_ratings[~np.isnan(pred_ratings)]
true_ratings_no_nan = true_ratings[~np.isnan(pred_ratings)]

# Creating a DataFrame with non-NaN predictions and true ratings
example_predictions = pd.DataFrame({'UserID': test_data['UserID'].values[:len(pred_ratings_no_nan)],
                                     'MovieID': test_data['MovieID'].values[:len(pred_ratings_no_nan)],
                                     'TrueRating': true_ratings_no_nan,
                                     'PredRating': pred_ratings_no_nan})

# Displaying some example predictions
print("\nExample Predictions:")
print(example_predictions.head())



Example Predictions:
   UserID  MovieID  TrueRating  PredRating
0    3466     1968           5    0.919202
1    5437     1610           4    1.184454
2     770      445           3    0.376651
3     889     2696           2    0.829001
4    2203     2013           5    0.718088


In [6]:
# Assuming 'movies' DataFrame has columns 'MovieID' and 'Title'
# If not, replace it with the actual DataFrame containing movie information

# Drop NaN values from the predictions and true ratings
pred_ratings_no_nan = pred_ratings[~np.isnan(pred_ratings)]
true_ratings_no_nan = true_ratings[~np.isnan(pred_ratings)]

# Creating a DataFrame with non-NaN predictions and true ratings
example_predictions = pd.DataFrame({'UserID': test_data['UserID'].values[:len(pred_ratings_no_nan)],
                                     'MovieID': test_data['MovieID'].values[:len(pred_ratings_no_nan)],
                                     'TrueRating': true_ratings_no_nan,
                                     'PredRating': pred_ratings_no_nan})

# Merging with the 'movies' DataFrame to include movie names
example_predictions = example_predictions.merge(movies[['MovieID', 'Title']], on='MovieID', how='left')

# Displaying some example predictions with movie names
print("\nExample Predictions with Movie Names:")
print(example_predictions.head())



Example Predictions with Movie Names:
   UserID  MovieID  TrueRating  PredRating  \
0    3466     1968           5    0.919202   
1    5437     1610           4    1.184454   
2     770      445           3    0.376651   
3     889     2696           2    0.829001   
4    2203     2013           5    0.718088   

                                        Title  
0                  Breakfast Club, The (1985)  
1            Hunt for Red October, The (1990)  
2                       Fatal Instinct (1993)  
3  Dinner Game, The (Le Dîner de cons) (1998)  
4              Poseidon Adventure, The (1972)  
