In [1]:
import numpy as np
import pandas as pd
import ast

In [2]:
# Load the movie metadata and credits tables from the working directory
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Normalise header whitespace right away so that every later cell can address
# columns by their clean names (a no-op when the headers are already clean).
movies.columns = movies.columns.str.strip()
credits.columns = credits.columns.str.strip()

In [3]:
# Parse dates with different formats
def parse_date(date_str):
    """Parse a date written either as day/month/year or as year-month-day.

    The formats '%d/%m/%Y' and '%Y-%m-%d' are tried in that order and the
    first one that pandas accepts wins.

    Returns:
        The value produced by ``pd.to_datetime`` for the first matching
        format, or ``pd.NaT`` when both attempts raise ``ValueError``.
    """
    parsed = pd.NaT  # fallback when no format matches
    for candidate_format in ('%d/%m/%Y', '%Y-%m-%d'):
        try:
            parsed = pd.to_datetime(date_str, format=candidate_format)
        except ValueError:
            # This layout does not fit; move on to the next one.
            continue
        break
    return parsed

In [4]:
# Run every raw 'release_date' value through parse_date, one element at a time
movies['release_date'] = movies['release_date'].map(parse_date)

In [5]:
# Derive an integer year from the parsed dates; rows without a date get 0.
# NOTE(review): 'release_year' is not among the columns kept by the column
# selection further down, so the year is derived again later on.
movies['release_year'] = (
    movies['release_date']
    .dt.year
    .fillna(0)
    .astype(int)
)

In [6]:
# Remove leading/trailing whitespace from every column label
movies.columns = [label.strip() for label in movies.columns]

In [7]:
# Join the credits onto the movie metadata, matching rows by 'title'.
# NOTE(review): 'title' is not guaranteed to be unique; if two films share a
# title the join multiplies those rows — TODO confirm, and consider joining on
# an ID column instead.
movies = pd.merge(movies, credits, on='title')

In [8]:
# Select necessary columns
# .copy() makes the subset an independent frame, so the edits applied to it in
# the following cells cannot be mistaken for writes into a slice of the merged
# table (the situation pandas reports as SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'release_date']].copy()

In [9]:
# Drop rows with missing values in any of the selected columns.
# Rebinding the name (rather than inplace=True) leaves the previous frame
# untouched and avoids in-place mutation of a derived frame.
movies = movies.dropna()

In [10]:
# Convert JSON-like columns
def convert(obj):
    """Return the 'name' entry of every record in a stringified list.

    ``obj`` is evaluated with ``ast.literal_eval``, so it must be the textual
    form of a Python literal: an iterable of mappings that each provide a
    'name' key.
    """
    records = ast.literal_eval(obj)
    return [record['name'] for record in records]

In [11]:
# Replace the serialized 'genres' and 'keywords' entries with plain lists of names
for json_column in ('genres', 'keywords'):
    movies[json_column] = movies[json_column].apply(convert)

In [12]:
def convertTop3(obj):
    """Return the 'name' entry of at most the first three records.

    ``obj`` is evaluated with ``ast.literal_eval`` and iterated in order;
    iteration stops as soon as three names have been collected, so later
    records are never inspected.
    """
    names = []
    for position, record in enumerate(ast.literal_eval(obj)):
        if position >= 3:
            break
        names.append(record['name'])
    return names

In [13]:
# Keep only the first three cast names for each film
movies['cast'] = movies['cast'].map(convertTop3)

In [14]:
# Preview the 'cast' column after it has been reduced to lists of names
movies['cast']

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4805, dtype: object

In [15]:
def fetch_director(obj):
    """Return a one-element list holding the first director's name.

    ``obj`` is evaluated with ``ast.literal_eval`` and scanned in order for
    the first record whose 'job' equals 'Director'. An empty list is
    returned when no record matches.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [16]:
# Reduce each crew listing to its first director (empty list when there is none)
movies['crew'] = movies['crew'].map(fetch_director)

In [17]:
# Tokenise each overview on whitespace so it becomes a list of words
movies['overview'] = movies['overview'].apply(str.split)

In [18]:
# Collapse multi-word entries into single tokens (e.g. "Sam Worthington" ->
# "SamWorthington") in each of the list-valued columns
for list_column in ('genres', 'keywords', 'cast', 'crew'):
    movies[list_column] = movies[list_column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [19]:
# Create 'tags' column
# Each operand holds one Python list per row, so `+` concatenates the lists
# row by row: overview words first, then genres, keywords, cast and director.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [21]:
# Build the final frame and flatten each row's 'tags' token list into a single
# space-separated string.
# .copy() gives new_df its own data, so the column assignments made here and in
# later cells are not flagged as writes into a slice of `movies`.
new_df = movies[['movie_id', 'title', 'release_date', 'genres', 'cast', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(" ".join)

In [23]:
# Convert 'release_date' to datetime and extract the year using .loc
# NOTE(review): 'release_date' was already parsed by parse_date in an earlier
# cell, so this conversion is presumably a no-op — TODO confirm. If the column
# ever held raw strings here, the single '%d/%m/%Y' format combined with
# errors='coerce' would turn every 'YYYY-MM-DD' value into NaT.
new_df.loc[:, 'release_date'] = pd.to_datetime(new_df['release_date'], format='%d/%m/%Y', errors='coerce')
# Year as an integer; any row whose date is NaT receives the placeholder 0
new_df.loc[:, 'release_year'] = new_df['release_date'].dt.year.fillna(0).astype(int)

In [26]:
# Store 'release_year' as text. Plain column assignment replaces the integer
# column outright; writing strings through .loc[:, ...] would instead try to
# set them into the existing int column in place, an incompatible-dtype upcast
# that pandas has deprecated.
new_df['release_year'] = new_df['release_year'].astype(str)

# Append the year to 'tags' as one more space-separated token
new_df['tags'] = new_df['tags'] + ' ' + new_df['release_year']

In [27]:
# Normalise the tag text to lower case
new_df.loc[:, 'tags'] = new_df['tags'].str.lower()

In [28]:
# Display the updated DataFrame
# print() renders the frame as plain text; head() limits it to the first rows
print(new_df.head())

   movie_id                                     title release_date  \
0     19995                                    Avatar   2009-12-10   
1       285  Pirates of the Caribbean: At World's End   2007-05-19   
2    206647                                   Spectre   2015-10-26   
3     49026                     The Dark Knight Rises   2012-07-16   
4     49529                               John Carter   2012-03-07   

                                         genres  \
0  [Action, Adventure, Fantasy, ScienceFiction]   
1                  [Adventure, Fantasy, Action]   
2                    [Action, Adventure, Crime]   
3              [Action, Crime, Drama, Thriller]   
4           [Action, Adventure, ScienceFiction]   

                                            cast  \
0  [SamWorthington, ZoeSaldana, SigourneyWeaver]   
1     [JohnnyDepp, OrlandoBloom, KeiraKnightley]   
2      [DanielCraig, ChristophWaltz, LéaSeydoux]   
3      [ChristianBale, MichaelCaine, GaryOldman]   
4    [Taylor

In [29]:
import pickle

In [30]:
# Serialize the frame as a plain dict of columns. The context manager
# guarantees the file handle is flushed and closed even if pickling raises.
with open('movies_dict.pkl', 'wb') as pickle_file:
    pickle.dump(new_df.to_dict(), pickle_file)

In [32]:
# Inspect the final 'release_year' column (stored as strings by an earlier cell)
new_df['release_year']

0       2009
1       2007
2       2015
3       2012
4       2012
        ... 
4804    1992
4805    2011
4806    2013
4807    2012
4808    2005
Name: release_year, Length: 4805, dtype: object