In [1]:
import pandas as pd
import numpy as np

### Loading the Datasets

In [2]:
df_links=pd.read_csv('raw_data/links.csv')
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [3]:
df_movie=pd.read_csv('raw_data/movies.csv')
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df_rating=pd.read_csv('raw_data/ratings.csv')
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
df_tag=pd.read_csv('raw_data/tags.csv')
df_tag.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### 1. Finding the Total Number of Ratings in the Dataset

In [6]:
row,col=df_rating.shape
print(f"The total number of ratings in the ratings.csv file is : {row}")

The total number of ratings in the ratings.csv file is : 100836


### 2. Identifying the Movie with the Highest Average Rating 
#### (with at least 50 ratings)

In [7]:
grouped=df_rating.groupby('movieId')
movie_ratings=grouped['rating'].agg(['count','mean'])
print(movie_ratings)

         count      mean
movieId                 
1          215  3.920930
2          110  3.431818
3           52  3.259615
4            7  2.357143
5           49  3.071429
...        ...       ...
193581       1  4.000000
193583       1  3.500000
193585       1  3.500000
193587       1  3.500000
193609       1  4.000000

[9724 rows x 2 columns]


In [8]:
# Filter out movies with fewer than 50 ratings.

rating_50=movie_ratings[movie_ratings['count']>50]
rating_50 = rating_50.reset_index()  # Move movieId from index to column
print(rating_50)

     movieId  count      mean
0          1    215  3.920930
1          2    110  3.431818
2          3     52  3.259615
3          6    102  3.946078
4          7     54  3.185185
..       ...    ...       ...
431   106782     54  3.916667
432   109374     52  3.778846
433   109487     73  3.993151
434   112852     59  4.050847
435   122904     54  3.833333

[436 rows x 3 columns]


In [9]:
# Identifying the movie with the highest average rating.

high_rate = rating_50.loc[rating_50['mean'].idxmax(), "movieId"]
print(high_rate)


318


In [10]:
movie=df_movie[df_movie["movieId"]==high_rate]['title'].values[0]
print("movie with the highest rating is :" ,movie)

movie with the highest rating is : Shawshank Redemption, The (1994)


### 3. Determining the Most Common Rating Given by Users

In [11]:
most_common=df_rating['rating'].mode()
print("Most common ratings given by Users is : " , most_common)

Most common ratings given by Users is :  0    4.0
Name: rating, dtype: float64


### 4. Retrieving the IMDb Rating of the Highest-Rated Movie

In [12]:
# identified highest-rated movie

print("movie with the highest rating is :" ,movie)

movie with the highest rating is : Shawshank Redemption, The (1994)


In [13]:
#Merging the movie and links dataset

movie_link=pd.merge(df_movie,df_links)
movie_link.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [14]:
#Extracting the IMDb ID of the highest-rated movie

imdb=movie_link[movie_link["title"]==movie]['imdbId'].values[0]
print(" IMDb ID of",movie ,"is  : " ,imdb)

 IMDb ID of Shawshank Redemption, The (1994) is  :  111161


In [15]:
#Scrapping code

import requests
from bs4 import BeautifulSoup

def scrapper(imdbId):
 id = str(int(imdbId))
 n_zeroes = 7 - len(id)
 new_id = "0" * n_zeroes + id
 URL = f"https://www.imdb.com/title/tt{new_id}/"
 print(f"Accessing URL: {URL}") # Debug print

 request_header = {
 'Content-Type': 'text/html; charset=UTF-8',
 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0)Gecko/20100101 Firefox/119.0',
 'Accept-Encoding': 'gzip, deflate, br'
 }

 response = requests.get(URL, headers=request_header)
 print(f"Response status code: {response.status_code}") # Debug print

 soup = BeautifulSoup(response.text, 'html.parser')
 imdb_rating = soup.find('span', attrs={'class': 'sc-eb51e184-1ljxVSS'})

 if imdb_rating:
     print(f"Found rating: {imdb_rating.text}")
 else:
     print("Rating not found")

 return imdb_rating.text if imdb_rating else np.nan

In [16]:
# Scrape IMDb rating
imdb_rating = scrapper(imdb)
print(f"IMDb Rating of {movie}: {imdb_rating}")

Accessing URL: https://www.imdb.com/title/tt0111161/
Response status code: 200
Rating not found
IMDb Rating of Shawshank Redemption, The (1994): nan


### 5. Counting Sci-Fi Movies with More Than 100 Ratings

In [17]:
#Filter movies that belong to the "Sci-Fi" genre

movie_rating=pd.merge(df_movie,df_rating)
movie_rating.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [18]:
#Filter movies that belong to the "Sci-Fi" genre

sci_fi=movie_rating[movie_rating["genres"]=='Sci-Fi']
sci_fi

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
41853,2311,2010: The Year We Make Contact (1984),Sci-Fi,27,2.0,965151086
41854,2311,2010: The Year We Make Contact (1984),Sci-Fi,42,4.0,996259554
41855,2311,2010: The Year We Make Contact (1984),Sci-Fi,45,4.0,950740918
41856,2311,2010: The Year We Make Contact (1984),Sci-Fi,57,4.0,965798823
41857,2311,2010: The Year We Make Contact (1984),Sci-Fi,182,4.0,1055152324
...,...,...,...,...,...,...
100589,176371,Blade Runner 2049 (2017),Sci-Fi,515,5.0,1513602051
100590,176371,Blade Runner 2049 (2017),Sci-Fi,567,5.0,1525282063
100591,176371,Blade Runner 2049 (2017),Sci-Fi,586,2.5,1529899177
100592,176371,Blade Runner 2049 (2017),Sci-Fi,599,3.5,1508605770


In [19]:
# Count the number of ratings for each Sci-Fi movie

sci_fi_counts = sci_fi.groupby("movieId")["rating"].count().reset_index()
sci_fi_counts

Unnamed: 0,movieId,rating
0,2311,17
1,2526,3
2,2661,1
3,2665,1
4,2698,1
5,3354,21
6,4198,1
7,4813,1
8,4942,1
9,5468,1


In [20]:
#filter Sci-Fi movies that have received more than 100 ratings.

popular_sci_fi = sci_fi_counts[sci_fi_counts["rating"] > 100]
popular_sci_fi

Unnamed: 0,movieId,rating


In [21]:
# Print the number of such movies
print("Number of Sci-Fi movies with more than 100 ratings:", len(popular_sci_fi))

Number of Sci-Fi movies with more than 100 ratings: 0


# 2nd Set 

### 1. How many CSV files are in the dataset directory?

In [22]:
import os

directory='raw_data'
list= os.listdir(directory)
print(list)
c=0
for file in list:
    if file.endswith(".csv"):
        c+=1
        
print("The number of CSV files in Directory is : ",c)

['links.csv', 'movies.csv', 'ratings.csv', 'README.txt', 'tags.csv']
The number of CSV files in Directory is :  4


### 2. What does movies_df.shape return?

In [23]:
df_movie.shape

(9742, 3)

it returns number of rows i.e 9742 and columns i.e 3

### 3. What does ratings_df['userId'].nunique() return?

In [24]:
df_rating['userId'].nunique()

610

 it returns the number of unique values present in the column userId

### 4. How is the most-rated movie identified in the dataset?

In [25]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [26]:
df_rating['movieId'].value_counts()

356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: movieId, Length: 9724, dtype: int64

In [27]:
df_rating['movieId'].value_counts().max()

329

### 5. What type of data does tags_df[tags_df['movieId'] == matrix_movie_id]['tag'].unique() return?

In [28]:
matrix_movie_id = df_movie[df_movie["title"] == "Matrix, The (1999)"]["movieId"].values[0]
print("Id of Matrix, The (1999) movie is :" , matrix_movie_id)

df_tag[df_tag['movieId'] == matrix_movie_id]['tag'].unique()

Id of Matrix, The (1999) movie is : 2571


array(['martial arts', 'sci-fi', 'alternate universe', 'philosophy',
       'post apocalyptic'], dtype=object)

It returns the unique tags assigned to The Matrix (1999).

### 6. If mean_rating = sum(ratings) / len(ratings), what does it calculate?

In [29]:
sum=df_rating["rating"].sum()

In [30]:
length=len(df_rating["rating"])
length

100836

In [31]:
mean_rating=sum/length
print("The average : ",mean_rating)

The average :  3.501556983616962


 It gives the average of all the ratings.

### 7. What does stats.skew(fight_club_ratings) measure?


In [32]:
from scipy.stats import skew

Fight_id=df_movie[df_movie["title"]=="Fight Club (1999)"]["movieId"].values[0]
print("Fight Club movie ID : ",Fight_id)

rating=df_rating[df_rating['movieId']==Fight_id]['rating']
print("Fight Club movie rating : \n",rating)

# Calculate skewness
skewness = skew(rating)

print(f"Skewness of Fight Club ratings: {skewness}")

Fight Club movie ID :  2959
Fight Club movie rating : 
 192      5.0
458      2.0
1134     0.5
1479     2.5
1635     3.5
        ... 
95893    5.0
96788    4.0
97835    5.0
99107    5.0
99699    5.0
Name: rating, Length: 218, dtype: float64
Skewness of Fight Club ratings: -1.8474937360359363


 as rating is in negative values its means it is left-skewed.

### 8. What filter is applied to filtered_df before sorting popular movies?

In [33]:
grouped=df_rating.groupby("movieId")
count=grouped["rating"].count().reset_index()
print(count)

      movieId  rating
0           1     215
1           2     110
2           3      52
3           4       7
4           5      49
...       ...     ...
9719   193581       1
9720   193583       1
9721   193585       1
9722   193587       1
9723   193609       1

[9724 rows x 2 columns]


In [34]:
print(count[count["rating"]>50])

      movieId  rating
0           1     215
1           2     110
2           3      52
5           6     102
6           7      54
...       ...     ...
8287   106782      54
8354   109374      52
8358   109487      73
8457   112852      59
8673   122904      54

[436 rows x 2 columns]


### 11. What is the total number of ratings in the dataset?

In [35]:
print("The toral number of rows in the rating dataset is : ",df_rating.shape[0])

The toral number of rows in the rating dataset is :  100836
