In [36]:
import os
import ast
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [37]:
df = pd.read_csv('../../data/movie-review-data/plot-movies-reviews.csv')
df_tmdb = pd.read_csv('../../data/movie-data-csv/TMDB_movie_dataset_v11.csv', low_memory=False)

In [38]:
df.sample(1).T

Unnamed: 0,6695
id,future_weather
originalScore,"[nan, nan, '4/5', '2.5/4', '3.5/5', '3/5', 'B-..."
reviewText,"{'Nicholas Bell': 'For her feature film debut,..."
PositiveCount,13
NegativeCount,1
TotalReviews,14
title,Future Weather
releaseDateStreaming,2013-04-16
runtimeMinutes,100.0
genre,['drama']


In [39]:
df.shape

(17769, 14)

In [40]:
df_tmdb.sample(1).T

Unnamed: 0,753480
id,1317588
title,RevPro Presents CMLL Fantastica Mania UK 2024
vote_average,0.0
vote_count,0
status,Released
release_date,2024-05-19
revenue,0
runtime,0
adult,False
backdrop_path,/6dY3Xr6gnBOZFYC3JWUgj4pAiT5.jpg


In [41]:
df_tmdb.shape

(1269226, 24)

In [42]:
common_titles = set(df['title']) & set(df_tmdb['title'])
print(f"Number of common titles: {len(common_titles)}")

Number of common titles: 16273


In [43]:
# Merge df and df_tmdb on the 'title' column
df = df.merge(
    df_tmdb[['title', 'id']],  # keep only the tmdb 'id' and 'title'
    on='title',
    how='left'
)

# Rename 'id_y' (from tmdb) to 'tmdbId' and keep original df id as is
df = df.rename(columns={'id_y': 'tmdbId', 'id_x': 'id'})
df.sample(1).T


Unnamed: 0,79066
id,victoria_2015
originalScore,"['4/5', '3.5/5', nan, '3.5/5', '8/10', nan, '3..."
reviewText,"{'Kat Halstead': 'Filmed in one shot, this low..."
PositiveCount,113
NegativeCount,24
TotalReviews,137
title,Victoria
releaseDateStreaming,2016-03-08
runtimeMinutes,138.0
genre,"['mystery & thriller', 'crime', 'drama']"


In [44]:
df.shape

(81865, 15)

In [45]:
df.isnull().sum()

id                          0
originalScore               0
reviewText                  0
PositiveCount               0
NegativeCount               0
TotalReviews                0
title                       0
releaseDateStreaming     6774
runtimeMinutes            845
genre                     538
originalLanguage         1446
director                  308
writer                  10600
plot                        0
tmdbId                    218
dtype: int64

In [46]:
df = df.dropna()
df.isnull().sum()

id                      0
originalScore           0
reviewText              0
PositiveCount           0
NegativeCount           0
TotalReviews            0
title                   0
releaseDateStreaming    0
runtimeMinutes          0
genre                   0
originalLanguage        0
director                0
writer                  0
plot                    0
tmdbId                  0
dtype: int64

In [47]:
df.shape

(65482, 15)

In [48]:
df = df.drop_duplicates(subset='id', keep='first')
df.shape

(14050, 15)

In [49]:
# Select a random index
random_index = random.randint(0, len(df) - 1)

# Fetch the row
random_row = df.iloc[random_index]

# Print the full row neatly
print(f"TMDB ID: {random_row['tmdbId']}")
print(f"Title: {random_row['title']}")
print(f"Streaming Release Date: {random_row['releaseDateStreaming']}")
print(f"Runtime (minutes): {random_row['runtimeMinutes']}")
print(f"Genre: {random_row['genre']}")
print(f"Language: {random_row['originalLanguage']}")
print(f"Director: {random_row['director']}")
print(f"Writer: {random_row['writer']}\n")
print(f"Review Texts: {random_row['reviewText']}\n")
print(f"Movie Plot: {random_row['plot']}\n")
print(f"Positive Count: {random_row['PositiveCount']}")
print(f"Negative Count: {random_row['NegativeCount']}")
print(f"Total Review Count: {random_row['TotalReviews']}")

TMDB ID: 3114.0
Title: The Searchers
Streaming Release Date: 2006-06-06
Runtime (minutes): 119.0
Genre: ['western']
Language: English
Director: John Ford
Writer: Alan Le May,Frank S. Nugent

Review Texts: {'Edward Porter': '... Its boldness is compelling.', 'Victor Pineyro': 'A road movie disguised as a western. Ford deals with revenge, redemption, love, violence and obsession in a beautifully shot film by veteran cinematographer William C. Hoch. Full review in Spanish', 'Francois Truffaut': 'John Ford symbolizes an age of Hollywood, the one when good health prevailed over intelligence, craftiness over sincerity. This age has gone; Elia Kazan’s and Nicholas Ray’s movies make more money than John Ford’s, poetry triumphs over entertainment.', 'Brian Eggert': "The motion picture Ford considered his own masterpiece confronts prior standards, meets issues of revenge and discrimination within a ponderous text, and revises the director's Western model forevermore.", 'Don Shanahan': 'Gorgeous 

In [50]:
df.to_csv('../../data/final-data/data.csv', index=False)