In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two raw tables (movie metadata and credits) from the working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
# Summarize the movies table: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
# Summarize the credits table: column names, non-null counts and dtypes.
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [5]:
# movies  - title, genres, keywords, overview, production_companies, tagline
# credits - cast, crew
# The two dataframes can be joined on movies.id == credits.movie_id.

In [6]:
# Inner join of movies and credits on movies.id == credits.movie_id.
merged_df = movies.merge(
    credits,
    how="inner",
    left_on="id",
    right_on="movie_id",
)

In [7]:
# Check the merged table: row count, combined columns and null counts.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [8]:
# Keep the id, the title and the columns that are later combined into tags.
selected_columns = [
    "id",
    "genres",
    "keywords",
    "overview",
    "title_x",
    "cast",
    "tagline",
    "crew",
    "production_companies",
]
df = merged_df[selected_columns]

In [9]:
# Give the selected title_x column the plain name "title".
column_renames = {"title_x": "title"}
df = df.rename(columns=column_renames)

In [10]:
# Peek at one raw genres value: a string holding a list of {"id", "name"} records.
df.loc[0, 'genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [11]:
import ast
import json


def get_names(value):
    """Collapse a serialized list of records into a comma-separated string of names.

    Parameters
    ----------
    value : str
        Text encoding a list of dicts that each carry a ``"name"`` key,
        e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    str
        The unique names in first-seen order, each stripped of surrounding
        whitespace, lowercased and with inner spaces replaced by ``_``,
        joined with ``","``. An empty list yields ``""``.

    Raises
    ------
    ValueError, SyntaxError
        If ``value`` is neither valid JSON nor a valid Python literal.
    KeyError
        If a record has no ``"name"`` key.
    """
    try:
        # The values are JSON text; json.loads also understands null/true/false,
        # which ast.literal_eval rejects.
        records = json.loads(value)
    except ValueError:
        # Fall back for Python-literal text (e.g. single-quoted keys).
        records = ast.literal_eval(value)

    # Strip before replacing spaces: once padding has become "_" a later
    # strip() can no longer remove it.
    names = (record["name"].strip().replace(" ", "_").lower() for record in records)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return ",".join(dict.fromkeys(names))

In [12]:
# Sanity-check get_names on the first row's genres value.
get_names(df.loc[0, "genres"])

'action,adventure,fantasy,science_fiction'

In [13]:
# Replace each serialized genres value with its comma-separated names.
df["genres"] = df["genres"].map(get_names)

In [14]:
# Replace each serialized keywords value with its comma-separated names.
df["keywords"] = df["keywords"].map(get_names)

In [15]:
# Replace each serialized cast value with its comma-separated names.
df["cast"] = df["cast"].map(get_names)

In [16]:
# Replace each serialized crew value with its comma-separated names.
df["crew"] = df["crew"].map(get_names)

In [17]:
# Replace each serialized production_companies value with its comma-separated names.
df["production_companies"] = df["production_companies"].map(get_names)

In [18]:
# Turn the overview text into comma-separated lowercase tokens: drop commas,
# then use each space as a separator.
# Missing overviews become "" first; str(NaN) would otherwise inject a literal
# "nan" token that unrelated movies would share.
df["overview"] = (
    df["overview"]
    .fillna("")
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace(" ", ",", regex=False)
    .str.lower()
)

In [19]:
# Turn the tagline text into comma-separated lowercase tokens: drop commas,
# then use each space as a separator.
# Any missing tagline becomes "" first; str(NaN) would otherwise inject a
# literal "nan" token that unrelated movies would share.
df["tagline"] = (
    df["tagline"]
    .fillna("")
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace(" ", ",", regex=False)
    .str.lower()
)

In [20]:
# Combine all text features into one comma-separated tag string per movie.
# str.cat inserts "," between every pair of columns, including between crew
# and production_companies, so the last crew name and the first company stay
# separate tokens. A missing value in any column still yields NaN for the row.
tag_columns = [
    "genres",
    "keywords",
    "overview",
    "cast",
    "tagline",
    "crew",
    "production_companies",
]
df["tags"] = df[tag_columns[0]].str.cat(df[tag_columns[1:]], sep=",")

In [21]:
# Carry forward only the identifier, the title and the combined tags.
output_columns = ["id", "title", "tags"]
final_df = df[output_columns]

In [22]:
# Discard every row that has a missing value in any remaining column.
final_df = final_df.dropna(axis=0, how="any")

In [23]:
# Remove fully duplicated rows. Reassigning (rather than inplace=True) keeps
# the cell idempotent and avoids mutating a frame derived from a selection.
final_df = final_df.drop_duplicates()

In [24]:
# Confirm the final frame's row count, columns and null counts.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      4803 non-null   int64 
 1   title   4803 non-null   object
 2   tags    4803 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.7+ KB


In [25]:
# NLTK (Natural Language Toolkit) library
# Used below to remove English stop words from the tags.

In [26]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources; nltk.download reports a package as up-to-date
# when it is already present locally.
# 'stopwords' backs stopwords.words('english'). 'punkt' and 'punkt_tab' are
# presumably for word_tokenize, which the visible cells never call -- TODO
# confirm whether these two downloads and that import are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Sahil Patil
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sahil Patil
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Sahil Patil
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [27]:
# Build the English stop-word lookup as a set and show its contents.
english_words = stopwords.words('english')
stop_words = set(english_words)
print(stop_words)

{'doesn', 'against', 'on', 'y', 'didn', 'this', "they're", 'having', 't', 'mightn', 'no', "weren't", 'both', 'yourself', 'mustn', 'these', 'only', 'just', 'some', "haven't", 'herself', 'do', 'how', "i'd", "needn't", 'so', 'll', 'own', "he's", 'wasn', 'being', 'while', 'after', 'am', "they'd", 'd', "doesn't", 'that', "hadn't", 'will', 'they', 'not', 'over', 'through', 'off', 'should', "you'd", 'she', 'further', 'be', 'than', 'during', 'there', 'into', "i've", "they'll", "he'll", 'about', 'themselves', 'between', 'other', 'at', 'his', 'by', 'ours', 'myself', 'such', 'weren', 's', 'i', "they've", 'shouldn', 'hers', "shan't", 'have', 'is', 'before', 'has', 'few', "you're", 'and', "couldn't", 'haven', 'he', "mustn't", 'as', 'again', 'itself', "you'll", 'me', 'where', "i'll", 'down', "that'll", 'yourselves', "aren't", 'shan', 'did', 'all', 'needn', 'can', 'the', "mightn't", "we've", 'it', 'our', 'nor', 'yours', 'more', 'm', 'isn', "it'd", 'ma', 'theirs', 'are', 'doing', 'won', 'you', 'most',

In [28]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load NLTK's English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(text, stop_words=None):
    """Remove stop words and periods from a comma-separated tag string.

    Parameters
    ----------
    text : str
        Tokens separated by ``","``.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and cached instead of being re-read per call.

    Returns
    -------
    str
        The surviving tokens, with every ``"."`` removed, joined by ``","``.
        Tokens keep their original case; empty tokens are dropped.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for raw_token in text.split(','):
        # Remove periods before the lookup so that "the." is recognized as
        # the stop word "the" instead of slipping through the filter.
        token = raw_token.replace(".", "")
        if token and token.lower() not in stop_words:
            kept.append(token)
    return ",".join(kept)

In [29]:
# Run every tag string through the stop-word filter.
final_df["tags"] = final_df["tags"].map(remove_stop_words)

In [30]:
# Inspect the cleaned tag string of the first row.
final_df.loc[0, 'tags']

"action,adventure,fantasy,science_fiction,culture_clash,future,space_war,space_colony,society,space_travel,futuristic,romance,space,alien,tribe,alien_planet,cgi,marine,soldier,battle,love_affair,anti_war,power_relations,mind_and_soul,3d,22nd,century,paraplegic,marine,dispatched,moon,pandora,unique,mission,becomes,torn,following,orders,protecting,alien,civilization,sam_worthington,zoe_saldana,sigourney_weaver,stephen_lang,michelle_rodriguez,giovanni_ribisi,joel_david_moore,cch_pounder,wes_studi,laz_alonso,dileep_rao,matt_gerald,sean_anthony_moran,jason_whyte,scott_lawrence,kelly_kilgour,james_patrick_pitt,sean_patrick_murphy,peter_dillon,kevin_dorman,kelson_henderson,david_van_horn,jacob_tomuri,michael_blain-rozgay,jon_curry,luke_hawker,woody_schultz,peter_mensah,sonia_yee,jahnel_curfman,ilram_choi,kyla_warren,lisa_roumain,debra_wilson,chris_mala,taylor_kibby,jodie_landau,julie_lamm,cullen_b_madden,joseph_brady_madden,frankie_torres,austin_wilson,sara_wilson,tamica_washington-miller,luc

In [31]:
# Persist the cleaned table; the row index is written as the first, unnamed column.
final_df.to_csv("cleaned_data.csv", index=True)