In [3]:
import pandas as pd

# Load plot summaries
plot_summaries = pd.read_csv('data/plot_summaries.txt', sep='\t', header=None, names=['Wikipedia_movie_ID', 'Plot'])
print(plot_summaries.head())

   Wikipedia_movie_ID                                               Plot
0            23890098  Shlykov, a hard-working taxi driver and Lyosha...
1            31186339  The nation of Panem consists of a wealthy Capi...
2            20663735  Poovalli Induchoodan  is sentenced for six yea...
3             2231378  The Lemon Drop Kid , a New York City swindler,...
4              595909  Seventh-day Adventist Church pastor Michael Ch...


In [4]:
# Load metadata
metadata = pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, names=[
    'Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name', 'Release_date', 
    'Box_office_revenue', 'Runtime', 'Languages', 'Countries', 'Genres'
])
print(metadata.head())

   Wikipedia_movie_ID Freebase_movie_ID  \
0              975900         /m/03vyhn   
1             3196793         /m/08yl5d   
2            28463795        /m/0crgdbh   
3             9363483        /m/0285_cd   
4              261236         /m/01mrr1   

                                          Movie_name Release_date  \
0                                     Ghosts of Mars   2001-08-24   
1  Getting Away with Murder: The JonBenét Ramsey ...   2000-02-16   
2                                        Brun bitter         1988   
3                                   White Of The Eye         1987   
4                                  A Woman in Flames         1983   

   Box_office_revenue  Runtime                           Languages  \
0          14010832.0     98.0  {"/m/02h40lc": "English Language"}   
1                 NaN     95.0  {"/m/02h40lc": "English Language"}   
2                 NaN     83.0  {"/m/05f_3": "Norwegian Language"}   
3                 NaN    110.0  {"/m/02h40lc":

In [5]:
# Merge datasets
merged_data = pd.merge(plot_summaries, metadata, on='Wikipedia_movie_ID')
print(merged_data.head())

   Wikipedia_movie_ID                                               Plot  \
0            23890098  Shlykov, a hard-working taxi driver and Lyosha...   
1            31186339  The nation of Panem consists of a wealthy Capi...   
2            20663735  Poovalli Induchoodan  is sentenced for six yea...   
3             2231378  The Lemon Drop Kid , a New York City swindler,...   
4              595909  Seventh-day Adventist Church pastor Michael Ch...   

  Freebase_movie_ID          Movie_name Release_date  Box_office_revenue  \
0        /m/076w2lb          Taxi Blues   1990-09-07                 NaN   
1        /m/0gkz15s    The Hunger Games   2012-03-12         686533290.0   
2        /m/051zjwb          Narasimham         2000                 NaN   
3         /m/06xtz3  The Lemon Drop Kid   1951-03-08           2300000.0   
4         /m/02tqm5   A Cry in the Dark   1988-11-03           6908797.0   

   Runtime                           Languages  \
0    110.0    {"/m/06b_j": "Russian 

In [6]:
import ast

# Parse the Genres column
merged_data['Genres'] = merged_data['Genres'].apply(ast.literal_eval)
merged_data['Genres'] = merged_data['Genres'].apply(lambda x: [genre[1] for genre in x])

# Explode the Genres column into multiple rows
exploded_data = merged_data.explode('Genres')
print(exploded_data.head())

   Wikipedia_movie_ID                                               Plot  \
0            23890098  Shlykov, a hard-working taxi driver and Lyosha...   
0            23890098  Shlykov, a hard-working taxi driver and Lyosha...   
1            31186339  The nation of Panem consists of a wealthy Capi...   
1            31186339  The nation of Panem consists of a wealthy Capi...   
1            31186339  The nation of Panem consists of a wealthy Capi...   

  Freebase_movie_ID        Movie_name Release_date  Box_office_revenue  \
0        /m/076w2lb        Taxi Blues   1990-09-07                 NaN   
0        /m/076w2lb        Taxi Blues   1990-09-07                 NaN   
1        /m/0gkz15s  The Hunger Games   2012-03-12         686533290.0   
1        /m/0gkz15s  The Hunger Games   2012-03-12         686533290.0   
1        /m/0gkz15s  The Hunger Games   2012-03-12         686533290.0   

   Runtime                           Languages  \
0    110.0    {"/m/06b_j": "Russian Language"}  

In [7]:
# Drop rows with missing values
cleaned_data = exploded_data.dropna(subset=['Plot', 'Genres'])
print(cleaned_data.head())

   Wikipedia_movie_ID                                               Plot  \
0            23890098  Shlykov, a hard-working taxi driver and Lyosha...   
0            23890098  Shlykov, a hard-working taxi driver and Lyosha...   
1            31186339  The nation of Panem consists of a wealthy Capi...   
1            31186339  The nation of Panem consists of a wealthy Capi...   
1            31186339  The nation of Panem consists of a wealthy Capi...   

  Freebase_movie_ID        Movie_name Release_date  Box_office_revenue  \
0        /m/076w2lb        Taxi Blues   1990-09-07                 NaN   
0        /m/076w2lb        Taxi Blues   1990-09-07                 NaN   
1        /m/0gkz15s  The Hunger Games   2012-03-12         686533290.0   
1        /m/0gkz15s  The Hunger Games   2012-03-12         686533290.0   
1        /m/0gkz15s  The Hunger Games   2012-03-12         686533290.0   

   Runtime                           Languages  \
0    110.0    {"/m/06b_j": "Russian Language"}  

In [8]:
import re
import nltk
from nltk.corpus import stopwords

# Ensure nltk is installed
%pip install nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

cleaned_data['Cleaned_Plot'] = cleaned_data['Plot'].apply(clean_text)

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['Cleaned_Plot'] = cleaned_data['Plot'].apply(clean_text)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
X = vectorizer.fit_transform(cleaned_data['Cleaned_Plot']).toarray()
y = cleaned_data['Genres']