In [2]:
import pandas as pd

# Load ratings
ratings = pd.read_csv('../ml-1m/ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])

ratings.head() # First 5 rows of the ratings DataFrame

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Load movies
movies = pd.read_csv('../ml-1m/movies.dat', sep='::', engine='python', names=['movieId', 'title', 'genres'], encoding='latin-1')

movies.head() # First 5 rows of the movies DataFrame

# encoding = latin-1 is used to handle special characters in movie titles as UTF-8 may not work correctly with some characters in the dataset.

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load users
users = pd.read_csv('../ml-1m/users.dat', sep='::', engine='python', names=["userId", "gender", "age", "occupation", "zipCode"])

users.head() # First 5 rows of the users DataFrame

Unnamed: 0,userId,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [8]:
# Handling missing values/null values
'''In 1M MovieLens dataset, there are no missing values in the ratings, movies, and users datasets.
However, if we were to check for missing values, we could use the following code:'''
# Missing values
print("Missing Values:\n")
print("Movies:\n", movies.isnull().sum())
print("Ratings:\n", ratings.isnull().sum())
print("Users:\n", users.isnull().sum())

# If any missing values are found, we can handle them by either dropping the rows or filling them with appropriate values.
movies.fillna('Unknowm', inplace=True)  # Fill missing values in movies with Unknown

Missing Values:

Movies:
 movieId    0
title      0
genres     0
dtype: int64
Ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Users:
 userId        0
gender        0
age           0
occupation    0
zipCode       0
dtype: int64


In [9]:
# Checking for duplicates
print("\nDuplicates:\n")
print("Movies:\n", movies.duplicated().sum())
print("Ratings:\n", ratings.duplicated().sum())
print("Users:\n", users.duplicated().sum())

# If duplicates are found, we can drop them
movies.drop_duplicates(inplace=True)
ratings.drop_duplicates(inplace=True)
users.drop_duplicates(inplace=True)


Duplicates:

Movies:
 0
Ratings:
 0
Users:
 0


In [11]:
# Meriging ratings with movies
movie_data = pd.merge(ratings, movies, on='movieId')

movie_data.head() # First 5 rows of the merged movie_data DataFrame

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [15]:
# To csv
movies.to_csv('../data/movies.csv', index=False)
ratings.to_csv('../data/ratings.csv', index=False)
users.to_csv('../data/users.csv', index=False)
movie_data.to_csv('../data/movie_data.csv', index=False)

print("\nData saved to CSV files in the '../data/' directory.")


Data saved to CSV files in the '../data/' directory.
