In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
import unicodedata
import re

In [2]:
# Load the Rotten Tomatoes dataset (adjust the filename if needed)
rt_df = pd.read_csv('RottenTomatoes/rotten_tomatoes_movies.csv')

# Display first few rows
rt_df.head()


Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [3]:
# Create an explicit copy to avoid SettingWithCopyWarning
rt_df_cleaned = rt_df[[
    "movie_title", "genres",
    "tomatometer_rating", "tomatometer_count",
    "audience_rating", "audience_count",
    "tomatometer_fresh_critics_count", "tomatometer_rotten_critics_count"
]].copy()


In [4]:
# Convert columns to numeric safely
rt_df_cleaned.loc[:, "tomatometer_rating"] = pd.to_numeric(rt_df_cleaned["tomatometer_rating"], errors="coerce")
rt_df_cleaned.loc[:, "audience_rating"] = pd.to_numeric(rt_df_cleaned["audience_rating"], errors="coerce")
rt_df_cleaned.loc[:, "tomatometer_count"] = pd.to_numeric(rt_df_cleaned["tomatometer_count"], errors="coerce").fillna(0).astype(int)
rt_df_cleaned.loc[:, "audience_count"] = pd.to_numeric(rt_df_cleaned["audience_count"], errors="coerce").fillna(0).astype(int)

rt_df_cleaned["tomatometer_rating"] = rt_df_cleaned["tomatometer_rating"].fillna(rt_df_cleaned["tomatometer_rating"].median())
rt_df_cleaned["audience_rating"] = rt_df_cleaned["audience_rating"].fillna(rt_df_cleaned["audience_rating"].median())

# Fill missing genres
rt_df_cleaned["genres"] = rt_df_cleaned["genres"].fillna("Unknown")

In [7]:
# Standardize movie title function
# def clean_movie_title(title):
#     if pd.isna(title):
#         return ""
#     title = str(title).strip().lower()  # Convert to lowercase and remove leading/trailing spaces
#     title = unicodedata.normalize('NFKD', title).encode('ASCII', 'ignore').decode('utf-8')  # Remove accents
#     title = re.sub(r'[^a-zA-Z0-9\s]', '', title)  # Remove special characters
#     return title
def normalize_text(text):
    if pd.isna(text):  # Handle missing values
        return ""
    text = str(text)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')  # Remove accents
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.strip()

rt_df_cleaned["movieTitle"] = rt_df_cleaned["movie_title"].apply(normalize_text)

# Drop the 'movie_title' column
rt_df_cleaned = rt_df_cleaned.drop(columns=["movie_title"])

# Move 'movieTitle' to the first column position
cols = list(rt_df_cleaned.columns)
cols.insert(0, cols.pop(cols.index("movieTitle")))
rt_df_cleaned = rt_df_cleaned[cols]

In [8]:
rt_df_cleaned.head()

Unnamed: 0,movieTitle,genres,tomatometer_rating,tomatometer_count,audience_rating,audience_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,Percy Jackson the Olympians The Lightning Thief,"Action & Adventure, Comedy, Drama, Science Fic...",49.0,149.0,53.0,254421.0,73,76
1,Please Give,Comedy,87.0,142.0,64.0,11574.0,123,19
2,10,"Comedy, Romance",67.0,24.0,53.0,14684.0,16,8
3,12 Angry Men Twelve Angry Men,"Classics, Drama",100.0,54.0,97.0,105386.0,54,0
4,20000 Leagues Under The Sea,"Action & Adventure, Drama, Kids & Family",89.0,27.0,74.0,68918.0,24,3


In [10]:
rt_df_cleaned.to_csv("cleaned/rottentomatoMovies2.csv", index=False)

In [9]:
print(rt_df_cleaned.dtypes)

movieTitle                           object
genres                               object
tomatometer_rating                  float64
tomatometer_count                   float64
audience_rating                     float64
audience_count                      float64
tomatometer_fresh_critics_count       int64
tomatometer_rotten_critics_count      int64
dtype: object
