<a href="https://colab.research.google.com/github/VijayS-001/hotstar-viewership-analytics-recommender/blob/main/Hotstar_Viewership_Analytics_%26_Genre_Recommendation_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyzing Viewing Trends and Generating Genre-Based Recommendations on Hotstar

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the Hotstar CSV from the Colab /content directory and preview the first rows.
df = pd.read_csv("/content/hotstar.csv")
df.head()

Unnamed: 0,hotstar_id,title,description,genre,year,age_rating,running_time,seasons,episodes,type
0,1000087439,Sambha - Aajcha Chawa,A young man sets off on a mission to clean up ...,Action,2012,U/A 16+,141.0,,,movie
1,1260023113,Cars Toon: Mater And The Ghostlight,Mater is haunted by a mysterious blue light th...,Animation,2006,U,7.0,,,movie
2,1260103188,Kanmani Rambo Khatija,"Unlucky since birth, Rambo finds hope when he ...",Romance,2022,U/A 16+,157.0,,,movie
3,1260126754,Butterfly,While trying to rescue her sister's kids from ...,Thriller,2022,U/A 16+,136.0,,,movie
4,1260018228,Sister Act,"Rene, a lounge singer, decides to stay at a Ch...",Comedy,1992,U/A 7+,100.0,,,movie


In [3]:
# Number of columns = length of the column index.
print(f'The total number of columns is {len(df.columns)}')


The total number of columns is 10


In [4]:
# Number of rows = length of the frame.
print(f'The total number of rows is {len(df)}')

The total number of rows is 6874


In [5]:
# Missing-value count for every column.
df.isna().sum()

Unnamed: 0,0
hotstar_id,0
title,0
description,0
genre,0
year,0
age_rating,0
running_time,2306
seasons,4568
episodes,4568
type,0


In [6]:
# Count rows that repeat an earlier row in every column.
df.duplicated().sum()

np.int64(0)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,hotstar_id,year,running_time,seasons,episodes
count,6874.0,6874.0,4568.0,2306.0,2306.0
mean,1059077000.0,2011.71865,98.746716,2.661752,127.366869
std,481266600.0,11.936894,49.411142,4.942716,258.138186
min,3.0,1928.0,1.0,1.0,1.0
25%,1000088000.0,2009.0,70.0,1.0,6.0
50%,1260008000.0,2016.0,116.0,1.0,22.0
75%,1260099000.0,2019.0,135.0,2.0,130.75
max,1837059000.0,2023.0,229.0,73.0,3973.0


In [8]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6874 entries, 0 to 6873
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   hotstar_id    6874 non-null   int64  
 1   title         6874 non-null   object 
 2   description   6874 non-null   object 
 3   genre         6874 non-null   object 
 4   year          6874 non-null   int64  
 5   age_rating    6874 non-null   object 
 6   running_time  4568 non-null   float64
 7   seasons       2306 non-null   float64
 8   episodes      2306 non-null   float64
 9   type          6874 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 537.2+ KB


In [9]:
# Number of distinct non-null values in each column.
df.nunique()

Unnamed: 0,0
hotstar_id,6874
title,6677
description,6815
genre,37
year,78
age_rating,6
running_time,187
seasons,40
episodes,495
type,2


# First Five Action Titles in the Dataset

In [10]:
# First five rows, in file order, whose genre is exactly 'Action' (no ranking is applied).
top5_action = df.loc[df['genre'].eq('Action')].head(5)
top5_action

Unnamed: 0,hotstar_id,title,description,genre,year,age_rating,running_time,seasons,episodes,type
0,1000087439,Sambha - Aajcha Chawa,A young man sets off on a mission to clean up ...,Action,2012,U/A 16+,141.0,,,movie
13,1000228200,Maari 2,"Maari, a local don, in his new naughty-turned-...",Action,2018,U/A 13+,140.0,,,movie
14,1260109393,FIR,Irfan’s world comes crashing down when he gets...,Action,2022,U/A 16+,148.0,,,movie
22,1770015155,Pulimurugan,Murugan protects the villagers from deadly tig...,Action,2016,U/A 13+,153.0,,,movie
26,1260014568,Poysa Usul,RAW hires a convict to deal with a high-profil...,Action,2017,U/A 13+,125.0,,,movie


# Longest-Running Movies (running_time > 185)

In [11]:
# Titles whose running_time exceeds 185; rows with a missing running_time never match.
top_runtime = df.loc[df['running_time'].gt(185)]
top_runtime

Unnamed: 0,hotstar_id,title,description,genre,year,age_rating,running_time,seasons,episodes,type
1628,1260117918,Daana Veera Soora Karna,"After facing many hardships in life, a valiant...",Fantasy,1977,U,226.0,,,movie
2969,1000101168,Kerala Varma Pazhassiraja,"Kerala Varma Pazhassiraja is a Malayalam film,...",Action,2009,U/A 13+,190.0,,,movie
3513,1260127955,The 2022 Rock & Roll Hall of Fame Induction Ce...,The Rock & Roll Hall of Fame inducts Pat Benat...,Concert Film,2022,U/A 16+,229.0,,,movie
3584,1770001166,Titanic,Rose is engaged to marry Caledon Hockley but f...,Historical,1997,U/A 13+,194.0,,,movie
4234,1000108860,Dubai,"After being framed for murder, Major Ravi move...",Thriller,2001,U/A 16+,199.0,,,movie
4322,1260117075,Lajja,"Tormented by her abusive husband, Vaidehi deci...",Drama,2001,U/A 16+,186.0,,,movie


# Recommending Movies Based on Genre

In [22]:
def recommend_movies_by_genre(dataframe, genre):
    """Return the rows of ``dataframe`` whose ``genre`` value matches ``genre``.

    The comparison ignores letter case and leading/trailing whitespace on both
    sides, so ``"adventure"``, ``"Adventure"`` and ``" Adventure "`` all select
    the same rows. Exact matches behave as before.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains a ``genre`` column.
    genre : str
        Genre to look for.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order with their original index;
        empty when nothing matches.
    """
    wanted = str(genre).strip().casefold()

    def _normalize(value):
        # Non-string cells (e.g. NaN) are passed through so they never match.
        return value.strip().casefold() if isinstance(value, str) else value

    genre_keys = dataframe['genre'].map(_normalize)
    return dataframe[genre_keys == wanted]

# Get genre input from the user; stray whitespace around the answer is ignored.
genre_input = input("Enter the genre you want recommendations for: ").strip()

# Map a case-folded key to the genre exactly as it is spelled in the dataset,
# so "adventure" and "ADVENTURE" both resolve to "Adventure".
known_genres = df['genre'].dropna().unique()
genre_lookup = {str(g).strip().casefold(): g for g in known_genres}
matched_genre = genre_lookup.get(genre_input.casefold())

if matched_genre is not None:
    # Pass the dataset's own spelling so the lookup cannot miss on case.
    recommended_movies = recommend_movies_by_genre(df, matched_genre)
    print(f"\nRecommended movies for the genre '{matched_genre}':")
    display(recommended_movies['title'])
else:
    print(f"\nSorry, the genre '{genre_input}' was not found in the dataset. Please try one of the following genres:")
    print(known_genres)

Enter the genre you want recommendations for: Adventure

Recommended movies for the genre 'Adventure':


Unnamed: 0,title
107,Night at the Museum: Kahmunrah Rises Again
140,Tooth Fairy 2
327,Pinocchio
342,Percy Jackson & The Olympians: The Lightning T...
542,Alice Through The Looking Glass
895,Home Alone
926,The Bob's Burgers Movie
1204,Herbie Rides Again
1275,The Chronicles Of Narnia: The Voyage Of The Da...
1455,Pete's Dragon


In [15]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

# Load dataset (adjust if filename is different)
# Re-reading the CSV replaces `df` with a fresh copy of the file contents.
df = pd.read_csv("/content/hotstar.csv")  # or use your exact uploaded file name

# The cleaning step below is disabled. Its dropna would remove every row that
# has no running_time before the genre plots are drawn.
# NOTE(review): presumably disabled to keep those rows in the plots — confirm
# before re-enabling or deleting it.
# # Clean data
# df['running_time'] = pd.to_numeric(df['running_time'], errors='coerce')
# df['year'] = pd.to_numeric(df['year'], errors='coerce')
# df = df.dropna(subset=['genre', 'running_time', 'year'])

# Plot 1: number of titles per genre, most frequent genre first
genre_counts = (
    df['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='titles_count')
)

fig1 = px.bar(
    genre_counts,
    x='genre',
    y='titles_count',
    title="Titles Count by Genre",
)
fig1.update_xaxes(tickangle=45)
fig1.show()

# Plot 2: number of titles for every (genre, year) pair, one line per genre
genre_year = (
    df
    .groupby(['genre', 'year'])
    .size()
    .reset_index(name='titles_count')
)

fig2 = px.line(
    genre_year,
    x='year',
    y='titles_count',
    color='genre',
    title="Genre Trends Over Years",
)
# Linear tick mode places evenly spaced ticks on the year axis.
fig2.update_layout(xaxis={'tickmode': 'linear'})
fig2.show()

# Plot 3: running_time box plots per genre, split by age rating
fig3 = px.box(
    df,
    x='genre',
    y='running_time',
    color='age_rating',
    title="Running Time Distribution by Genre and Age Rating",
)
fig3.update_xaxes(tickangle=45)
fig3.show()

# Interactive Plot: Yearly Distribution for Selected Genre
def update_plot(genre):
    """Show a histogram of the `year` column for the rows of one genre."""
    genre_rows = df.loc[df['genre'] == genre]
    year_hist = px.histogram(
        genre_rows,
        x='year',
        title=f"Yearly Distribution for {genre}",
    )
    year_hist.update_layout(xaxis={'tickmode': 'linear'})
    year_hist.show()


# Dropdown listing every genre in alphabetical order.
genre_widget = widgets.Dropdown(
    description='Select Genre:',
    options=sorted(df['genre'].unique()),
)

# Redraw the histogram whenever the dropdown selection changes.
interactive_plot = widgets.interactive(update_plot, genre=genre_widget)
display(interactive_plot)


In [16]:
import pandas as pd
import plotly.express as px

# Load dataset. Re-reading keeps this cell repeatable: the cleaning below
# always starts from the file contents, not from an already-modified frame.
df = pd.read_csv("/content/hotstar.csv")  # or your actual file name

# Clean data: convert to numeric; titles without seasons/episodes count as 0
df['seasons'] = pd.to_numeric(df['seasons'], errors='coerce').fillna(0)
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce').fillna(0)
df['running_time'] = pd.to_numeric(df['running_time'], errors='coerce').replace(0, 1)  # avoid division by zero

# A missing running_time must be neutralised too, not only a zero one.
# Dividing by NaN yields NaN, and the null counts in this notebook indicate
# that the titles with seasons/episodes are the ones without a running_time.
# Without this step every such title scores NaN and every other title scores
# 0, so the per-genre averages carry no information.
# The divisor is kept separate so the stored running_time column keeps its
# missing values.
runtime_divisor = df['running_time'].fillna(1)

# Calculate binge_watch_score
df['binge_score'] = (df['seasons'] * df['episodes']) / runtime_divisor

# Show top binge-worthy genres (titles with no seasons/episodes contribute 0)
top_binge = (
    df.groupby('genre')['binge_score']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot the top binge-watch genres
fig = px.bar(top_binge, x='genre', y='binge_score',
             title="Average Binge-Watch Score by Genre",
             labels={'binge_score': 'Average Binge Score', 'genre': 'Genre'})
fig.update_xaxes(tickangle=45)
fig.show()
