## Movie Recommendation Analysis

### Loading the Datasets

In [1]:
import pandas as pd
# Read the two CSV files from the working directory: movies first, then ratings.
movies, ratings = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

### Inspecting Data

In [None]:
#Preview the leading rows of each dataset, movies first and ratings second
for frame in (movies, ratings):
    print(frame.head())

In [None]:
#Get an overview of column names, data types and memory usage
#DataFrame.info() writes its report straight to stdout and returns None,
#so it is called bare: wrapping it in print() appends a stray "None" line.
movies.info()
ratings.info()

In [12]:
#Count rows that are exact copies of an earlier row in the same dataset
movie_dupes = movies.duplicated().sum()
rating_dupes = ratings.duplicated().sum()

print(f"Duplicate rows in movies: {movie_dupes}")
print(f"Duplicated rows in ratings: {rating_dupes}")

Duplicate rows in movies: 0
Duplicated rows in ratings: 0


In [7]:
#Capture (rows, columns) of each dataset before any cleaning step runs
movies_shape, ratings_shape = movies.shape, ratings.shape
print(f"Movies dataset size before removing duplicates: {movies_shape}")
print(f"Ratings dataset size before removing duplicates: {ratings_shape}")

Movies dataset size before removing duplicates: (87585, 3)
Ratings dataset size before removing duplicates: (32000204, 4)


### Data Cleaning

Checking for missing values

In [None]:
#Report the number of missing values in every column of each dataset
for frame in (movies, ratings):
    print(frame.isna().sum())

In [4]:
#Discard every row that contains at least one missing value
#(rebinding the name rather than mutating the frame in place)
movies = movies.dropna()
ratings = ratings.dropna()

Remove Duplicates

In [None]:
#Keep only the first occurrence of each fully identical row
movies = movies.drop_duplicates()
ratings = ratings.drop_duplicates()

#Verifying changes
movies_shape, ratings_shape = movies.shape, ratings.shape
print(f"Movies dataset size after removing duplicates: {movies_shape}")
print(f"Ratings dataset size after removing duplicates: {ratings_shape}")

Summary of Cleaned Data

In [None]:
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids the extra "None" line that print(df.info()) would emit.
movies.info()
ratings.info()

# Distinct identifiers remaining after cleaning
print(f"Number of unique movies: {movies['movieId'].nunique()}")
print(f"Number of unique users: {ratings['userId'].nunique()}")

### Exploratory Data Analysis

Distribution of Ratings

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#Histogram of all individual rating values, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))  #figsize is (width, height)
sns.histplot(ratings['rating'], bins=10, kde=False, ax=ax)  #counts only, no density curve
ax.set_title('Distribution of Movie Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

Most Rated Movies

In [None]:
# Count ratings for each movie, ordered from most-rated to least-rated
rating_counts = ratings.groupby('movieId')['rating'].count().sort_values(ascending=False)

# Display top 10 most-rated movies.
# Filtering `movies` with isin() would return the rows in `movies` order and
# drop the counts, so the ranking would be lost. Merging the ten counts onto
# the movie metadata keeps the count order (an inner merge preserves the order
# of the left keys) and lets the count be shown next to each title.
top_rated_movies = (
    rating_counts.head(10)
    .rename('rating_count')
    .reset_index()
    .merge(movies, on='movieId', how='inner')
)
print(top_rated_movies[['movieId', 'title', 'rating_count']])


Average Ratings

In [None]:
# Mean rating per movie, as a Series indexed by movieId
average_ratings = ratings.groupby('movieId')['rating'].mean()

# Attach the mean to the movie metadata (left join on movieId, so movies
# with no ratings keep a NaN average)
avg_column = average_ratings.rename('average_rating')
movie_avg_ratings = movies.set_index('movieId').join(avg_column)

# NOTE: no minimum rating count is applied here, so a movie with a single
# rating can appear at the top of this list.
ranked_by_average = movie_avg_ratings[['title', 'average_rating']].sort_values(
    by='average_rating', ascending=False
)
print(ranked_by_average.head(10))


Ratings by Genre

In [None]:
# Explode genres into individual rows.
# DataFrame.explode only expands list-like cells; a plain string such as
# "Adventure|Animation|Children" is left untouched, which would make the
# average below be per genre *combination* instead of per genre.
# The string is therefore split on '|' first.
# (assumes the MovieLens convention of pipe-separated genres — confirm against movies.csv)
movies_exploded = movies.assign(genres=movies['genres'].str.split('|')).explode('genres')

# Merge with ratings and calculate average ratings by genre
genre_ratings = pd.merge(ratings, movies_exploded, on='movieId')
average_genre_ratings = genre_ratings.groupby('genres')['rating'].mean().sort_values(ascending=False)
print(average_genre_ratings)


### Recommendation System

Popularity-Based Recommendation

In [None]:
# Set a threshold for the minimum number of ratings
min_ratings = 50

# Keep only the ratings of movies that reach the threshold.
# groupby(...).filter(lambda ...) calls a Python function once per movie,
# which is very slow on a ratings table of this size; mapping each row's
# movieId to its precomputed row count selects the same rows vectorised.
ratings_per_movie = ratings['movieId'].value_counts()
popular_movies = ratings[ratings['movieId'].map(ratings_per_movie) >= min_ratings]

# Calculate average ratings for these movies
popular_movies_avg = popular_movies.groupby('movieId')['rating'].mean()

# Combine with movie titles.
# An inner join keeps only movies that passed the threshold, so movies with
# a NaN average can never pad out the top 10 when fewer than ten qualify.
popular_movies_df = movies.set_index('movieId').join(
    popular_movies_avg.rename('average_rating'), how='inner'
)
recommended_movies = popular_movies_df.sort_values(by='average_rating', ascending=False).head(10)

print(recommended_movies[['title', 'average_rating']])


Genre-Based Recommendation

In [None]:
# Genre to recommend for
genre = 'Action'

# Select movies whose genres string contains the chosen genre (substring test)
has_genre = movies['genres'].map(lambda genre_field: genre in genre_field)
action_movies = movies[has_genre]

# Attach the per-movie mean rating computed in the "Average Ratings" cell
avg_column = average_ratings.rename('average_rating')
action_movies_ratings = action_movies.set_index('movieId').join(avg_column)

# Take the ten highest averages within the genre
ranked_action = action_movies_ratings.sort_values(by='average_rating', ascending=False)
recommended_action_movies = ranked_action.head(10)

print(recommended_action_movies[['title', 'average_rating']])


Visualizing the results

In [None]:
# Plot the top-rated movies.
# recommended_movies is indexed by movieId, so plotting the column directly
# would label each bar with a numeric id even though the axis is titled
# 'Movie Title'. Re-indexing by title makes the tick labels match the axis.
fig, ax = plt.subplots(figsize=(8, 5))
recommended_movies.set_index('title')['average_rating'].plot(kind='bar', ax=ax)
ax.set_title('Top 10 Highly Rated Movies')
ax.set_xlabel('Movie Title')
ax.set_ylabel('Average Rating')
# Titles are long; angle them and tighten the layout so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

#### Author
Ujaan Banerjee