In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv("TMKOC_Enhanced_Dataset_v2.0.csv")

df.head()

Unnamed: 0,episode no.,Episode_title,description,Episode_runtime,Released_on,Lead Cast Matches,Supporting Cast Matches,place_matches_cleaned,channel_title,publish_date,...,has_tapu,has_daya,has_taarak,has_bhide,has_champaklal,like_to_view_ratio,comment_to_view_ratio,comment_to_like_ratio,engagement_score,view_category
0,1,Taarak Mehta Introduces Himself And Society Me...,Taarak Mehta introduces himself and the societ...,21 mins,28-Jul-08,Mehta,"Anjali, Komal, Madhavi, Roshan",Gokuldham Society,Sony SAB,2017-06-12 10:28:16+00:00,...,False,False,False,False,False,0.013988,0.000514,0.036735,8.598616,High
1,2,Jethalal's Son Tapu Decides To Spend His Holiday,Jethalal's son Tapu decides to spend his holid...,18 mins,29-Jul-08,"Jethalal, Tapu","Anjali, Inspector, Komal, Madhavi, Roshan",Gokuldham Society,Sony SAB,2017-06-12 10:29:12+00:00,...,True,False,False,False,False,0.008341,0.000229,0.027487,5.096581,High
2,3,Tapu Has Been The Worry Of Jethalal,"More than the society members, Tapu has been t...",19 mins,30-Jul-08,"Jethalal, Tapu","Abdul, Anjali, Bagha, Komal, Madhavi, Nattu, R...",Gokuldham Society,Sony SAB,2017-06-13 08:50:12+00:00,...,True,False,False,False,False,0.00828,0.000259,0.031327,5.072004,High
3,4,Daya Approaches Taarak Mehta For Help,Tapu lands Jethalal Gada into yet another trou...,18 mins,31-Jul-08,"Daya, Mehta","Abdul, Bagha, Inspector, Nattu",Gokuldham Society,Sony SAB,2017-06-13 08:50:30+00:00,...,False,True,False,False,False,0.007839,0.000175,0.022348,4.773243,High
4,5,Jethalal In Huge Trouble,Tapu has decided to take full revenge from his...,18 mins,01-Aug-08,Jethalal,Inspector,Gokuldham Society,Sony SAB,2017-06-14 09:51:07+00:00,...,False,False,False,False,False,0.008267,0.000189,0.0229,5.036182,High


In [4]:
# Create log-transformed columns for highly skewed data for better visualization
df['log_view_count'] = np.log1p(df['view_count'])
df['log_like_count'] = np.log1p(df['like_count'])
df['log_engagement_score'] = np.log1p(df['engagement_score'])

In [5]:
# --- 1. Plot Class Balance (view_category) ---
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='view_category', order=df['view_category'].value_counts().index, palette='viridis')
plt.title('Class Balance of View Category')
plt.xlabel('View Category')
plt.ylabel('Number of Episodes')
plt.savefig('view_category_balance.png')
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='view_category', order=df['view_category'].value_counts().index, palette='viridis')


In [6]:
# --- 2. Histograms for Data Distribution ---
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Distribution of Key Numerical Features', fontsize=16)

# Histogram 1: Log View Count
sns.histplot(df['log_view_count'], kde=True, ax=axes[0, 0], color='skyblue')
axes[0, 0].set_title('Log(View Count + 1) Distribution')
axes[0, 0].set_xlabel('Log(View Count + 1)')
axes[0, 0].set_ylabel('Frequency')

# Histogram 2: Log Like Count
sns.histplot(df['log_like_count'], kde=True, ax=axes[0, 1], color='salmon')
axes[0, 1].set_title('Log(Like Count + 1) Distribution')
axes[0, 1].set_xlabel('Log(Like Count + 1)')
axes[0, 1].set_ylabel('Frequency')

# Histogram 3: Runtime Minutes
sns.histplot(df['runtime_minutes'], kde=True, ax=axes[1, 0], color='lightgreen')
axes[1, 0].set_title('Runtime Minutes Distribution')
axes[1, 0].set_xlabel('Runtime Minutes')
axes[1, 0].set_ylabel('Frequency')

# Histogram 4: Engagement Score
sns.histplot(df['engagement_score'], kde=True, ax=axes[1, 1], color='gold')
axes[1, 1].set_title('Engagement Score Distribution')
axes[1, 1].set_xlabel('Engagement Score')
axes[1, 1].set_ylabel('Frequency')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('numerical_features_histograms.png')
plt.close()

In [7]:
# --- 2. Boxplots for Data Distribution (cont.) ---
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
fig.suptitle('Boxplots of Log-Transformed Counts by View Category', fontsize=14)

# Boxplot 1: Log View Count by View Category
sns.boxplot(x='view_category', y='log_view_count', data=df, order=df['view_category'].value_counts().index, ax=axes[0], palette='viridis')
axes[0].set_title('Log(View Count + 1) by View Category')
axes[0].set_xlabel('View Category')
axes[0].set_ylabel('Log(View Count + 1)')

# Boxplot 2: Log Like Count by View Category
sns.boxplot(x='view_category', y='log_like_count', data=df, order=df['view_category'].value_counts().index, ax=axes[1], palette='viridis')
axes[1].set_title('Log(Like Count + 1) by View Category')
axes[1].set_xlabel('View Category')
axes[1].set_ylabel('Log(Like Count + 1)')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('log_counts_boxplots.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='view_category', y='log_view_count', data=df, order=df['view_category'].value_counts().index, ax=axes[0], palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='view_category', y='log_like_count', data=df, order=df['view_category'].value_counts().index, ax=axes[1], palette='viridis')
