In [4]:
import pandas as pd

# Load the raw film-viewing records (path is relative to the notebook's
# working directory).
df = pd.read_csv('../data/Film_Dataset.csv')

# Display basic info
print("Original Dataset:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# ====================
# 1. Data Cleaning
# ====================
# Drop rows missing any of the fields the later steps rely on. The explicit
# copy makes df_clean an independent frame, so the column assignment below
# cannot trigger SettingWithCopyWarning or touch `df`.
df_clean = df.dropna(subset=['Film_Name', 'Language', 'Category', 'Viewing_Month']).copy()

# Convert Viewing_Month to datetime. Plain column assignment replaces the
# whole column, so it takes the datetime64 dtype; `.loc[:, col] = ...` writes
# into the existing column in place (pandas >= 2.0) and can leave it as
# object dtype. errors='coerce' turns unparseable values into NaT instead of
# raising.
df_clean['Viewing_Month'] = pd.to_datetime(df_clean['Viewing_Month'], errors='coerce')

# ====================
# 2. Filter for relevant time period
# ====================
# Keep only rows whose Viewing_Month falls on or before the cutoff. Any NaT
# values compare as False and are therefore dropped as well.
cutoff_date = pd.to_datetime('2025-12-31')
df_clean = df_clean.loc[lambda frame: frame['Viewing_Month'] <= cutoff_date]

# ====================
# 3. Handle duplicate movies with different languages/categories
# ====================
# Build a composite "name | language | category" key so the same title in a
# different language or category stays distinguishable. assign() returns a
# new frame, which avoids writing into a filtered selection of another frame
# (the `.loc[:, ...] = ...` form can raise SettingWithCopyWarning there).
df_clean = df_clean.assign(
    Unique_Movie=(
        df_clean['Film_Name'] + ' | '
        + df_clean['Language'] + ' | '
        + df_clean['Category']
    )
)

# ====================
# 4. Viewing aggregation
# ====================
# Count the rows in each (Language, Category) pair and flatten the grouped
# result into a regular frame with a View_Count column.
# NOTE(review): View_Count is a row count, not a sum of Number_of_Views --
# confirm that this is the intended metric.
viewing_summary = (
    df_clean
    .groupby(['Language', 'Category'])
    .size()
    .rename('View_Count')
    .reset_index()
)

# ====================
# 5. Save cleaned dataset
# ====================
# Write the cleaned rows into the same directory as the source CSV;
# index=False keeps the row index out of the file.
df_clean.to_csv(
    path_or_buf='../data/Film_Dataset_Cleaned.csv',
    index=False,
)

# ====================
# 6. Display cleaned info
# ====================
print("\n✅ Cleaned Dataset Info:")
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(df_clean.info()) emits.
df_clean.info()
print("\n✅ Cleaned Dataset Head:")
print(df_clean.head())
print("\n✅ Viewing Summary (Language & Category):")
print(viewing_summary)


Original Dataset:
         Film_Name Release_Date     Category Language  Viewer_Rate  \
0  Chennai Express    9/12/2021      Romance    Hindi          4.5   
1   Mountain Trail    4/27/2020       Comedy  English          3.5   
2     Eternal Hope     7/4/2020      Romance  English          4.6   
3  Seoul Heartbeat    4/17/2020       Comedy  English          4.1   
4      Shadow Pact    2/24/2022  Documentary  English          4.3   

   Number_of_Views Viewing_Month  
0            36395       2022-09  
1            93162       2021-07  
2            98663       2021-04  
3           112635       2020-07  
4            29496       2023-05  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Film_Name        460 non-null    object 
 1   Release_Date     460 non-null    object 
 2   Category         460 non-null    object 
 3