In [56]:
import pandas as pd
import numpy as np
import ast
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [57]:
# Load datasets
credits_df = pd.read_csv('credits.csv')
movies_df = pd.read_csv('movies.csv')

# Merge datasets
movies = movies_df.merge(credits_df, on='title')
# Keep required columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity']]

In [58]:
# Drop null values
movies.dropna(inplace=True)

# Convert stringified lists into Python lists
def convert_list(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

movies['genres'] = movies['genres'].apply(convert_list)
movies['keywords'] = movies['keywords'].apply(convert_list)

def fetch_lead_actors(obj):
    return [i['name'] for i in ast.literal_eval(obj)[:3]]

def fetch_director(obj):
    return [i['name'] for i in ast.literal_eval(obj) if i['job'] == 'Director']

movies['cast'] = movies['cast'].apply(fetch_lead_actors)
movies['crew'] = movies['crew'].apply(fetch_director)

In [59]:
# Normalize popularity
scaler = MinMaxScaler()
movies['popularity_log'] = np.log1p(movies['popularity'])
movies['popularity_log_norm'] = scaler.fit_transform(movies[['popularity_log']])

In [60]:
# Remove spaces in tags
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [61]:
def pre_processing(content):
    # Only keeping alphabets in the content 
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    # Removing spaces and converting to lowercase
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    return [word for word in stemmed_content if not word in stopwords.words('english')]

In [62]:
movies['tags'] = movies['overview'].apply(pre_processing).apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['genres'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['keywords'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['cast'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['crew'].apply(lambda x: ' '.join(x))

In [63]:
final_movies_df = movies[['movie_id', 'title', 'overview', 'popularity_log_norm', 'tags']]

In [64]:
final_movies_df['tags'] = final_movies_df['tags'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_movies_df['tags'] = final_movies_df['tags'].str.lower()


In [65]:
final_movies_df.to_csv('processed_movies.csv', index=False)