In [None]:
import pandas as pd

# Read the three zipped CSV inputs. The reads are independent of each other.
credits_df = pd.read_csv('/content/credits.csv.zip')
keywords_df = pd.read_csv('/content/keywords.csv.zip')
# low_memory=False makes pandas parse the metadata file in one pass rather than
# in chunks, which avoids mixed-dtype inference on its columns.
movies_metadata_df = pd.read_csv(
    '/content/movies_metadata.csv.zip',
    low_memory=False,
)

In [None]:
# Preview every raw frame: first rows, the info() summary, and describe() statistics.
for heading, frame in (
    ("Keywords DataFrame:", keywords_df),
    ("\nMovies Metadata DataFrame:", movies_metadata_df),
    ("\nCredits DataFrame:", credits_df),
):
    print(heading)
    display(frame.head())
    frame.info()
    display(frame.describe())

In [None]:
# Show, per column, the percentage of rows whose value is missing in each frame.
for heading, frame in (
    ("Missing values percentage in keywords_df:", keywords_df),
    ("\nMissing values percentage in movies_metadata_df:", movies_metadata_df),
    ("\nMissing values percentage in credits_df:", credits_df),
):
    print(heading)
    missing_counts = frame.isnull().sum()
    display(missing_counts * 100 / len(frame))

In [None]:
# Remove 'belongs_to_collection' and 'homepage' from the metadata frame.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been dropped.
movies_metadata_df = movies_metadata_df.drop(
    columns=['belongs_to_collection', 'homepage'],
    errors='ignore',
)
# Re-display the missing-value percentages for the remaining columns.
display(movies_metadata_df.isnull().sum() * 100 / len(movies_metadata_df))

In [None]:
# Convert columns to numeric, coercing unparseable entries to NaN
numeric_impute_cols = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
for col in numeric_impute_cols:
    movies_metadata_df[col] = pd.to_numeric(movies_metadata_df[col], errors='coerce')

# Impute missing values in movies_metadata_df after converting to numeric.
# Three strategies are used:
#   * free-text / identifier columns -> the placeholder string 'Unknown'
#   * categorical-like columns       -> the most frequent value (mode)
#   * numeric columns                -> the column mean (computed after conversion)
placeholder_cols = [
    'tagline', 'overview', 'poster_path', 'production_companies',
    'production_countries', 'spoken_languages', 'status', 'imdb_id',
]
mode_cols = ['release_date', 'original_language', 'title', 'video']

fill_values = {col: 'Unknown' for col in placeholder_cols}
fill_values.update({col: movies_metadata_df[col].mode()[0] for col in mode_cols})
fill_values.update({col: movies_metadata_df[col].mean() for col in numeric_impute_cols})

# Assign the filled column back instead of calling
# `movies_metadata_df[col].fillna(..., inplace=True)`. That form is chained
# assignment through an inplace method: pandas 2.2+ warns about it, and under
# Copy-on-Write (the default from pandas 3.0) it only modifies a temporary
# object, leaving the frame unchanged.
for col, value in fill_values.items():
    movies_metadata_df[col] = movies_metadata_df[col].fillna(value)

# Verify that missing values have been handled
print("\nMissing values percentage in movies_metadata_df after imputation:")
display(movies_metadata_df.isnull().sum() * 100 / len(movies_metadata_df))

print("\nMissing values percentage in keywords_df after checking:")
display(keywords_df.isnull().sum() * 100 / len(keywords_df))

print("\nMissing values percentage in credits_df after checking:")
display(credits_df.isnull().sum() * 100 / len(credits_df))

In [None]:
# For each frame: report the number of fully duplicated rows and, when there are
# any, drop them in place. Each entry is (count label, removal message, frame).
duplicate_checks = (
    ("Number of duplicate rows in keywords_df:",
     "Duplicate rows removed from keywords_df.",
     keywords_df),
    ("\nNumber of duplicate rows in movies_metadata_df:",
     "Duplicate rows removed from movies_metadata_df.",
     movies_metadata_df),
    ("\nNumber of duplicate rows in credits_df:",
     "Duplicate rows removed from credits_df.",
     credits_df),
)
for count_label, removed_label, frame in duplicate_checks:
    duplicate_count = frame.duplicated().sum()
    print(count_label, duplicate_count)
    if duplicate_count > 0:
        frame.drop_duplicates(inplace=True)
        print(removed_label)

# Row counts after the clean-up above.
print("\nNumber of rows after removing duplicates:")
print("keywords_df:", len(keywords_df))
print("movies_metadata_df:", len(movies_metadata_df))
print("credits_df:", len(credits_df))

In [None]:
# Parse release dates; values that cannot be parsed become NaT.
movies_metadata_df['release_date'] = pd.to_datetime(
    movies_metadata_df['release_date'], errors='coerce'
)

# Coerce the 'id' join key of every frame to a number (invalid ids become NaN),
# then discard the rows whose id could not be parsed.
id_frames = (movies_metadata_df, keywords_df, credits_df)
for frame in id_frames:
    frame['id'] = pd.to_numeric(frame['id'], errors='coerce')
for frame in id_frames:
    frame.dropna(subset=['id'], inplace=True)

print("Data types after conversion and dropping NaNs:")
for frame in id_frames:
    frame.info()

In [None]:
import ast

def safe_literal_eval(data):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        data: Usually a string such as "[{'id': 1, 'name': 'x'}]". A value that
            is already a list is returned unchanged, so applying this function
            again to an already-parsed column does not replace its contents
            with empty lists.

    Returns:
        The object produced by ``ast.literal_eval(data)``, ``data`` itself when
        it is already a list, or ``[]`` when the input cannot be parsed.
    """
    if isinstance(data, list):
        return data
    try:
        return ast.literal_eval(data)
    except (ValueError, SyntaxError, TypeError):
        # ValueError: input is not a literal (e.g. 'Unknown', NaN);
        # SyntaxError: malformed text; TypeError: e.g. an unhashable dict key.
        return []

# Turn the stringified list columns of each frame into Python objects via
# safe_literal_eval. Each entry pairs a frame with the columns to parse.
literal_columns = (
    (movies_metadata_df, ['genres', 'production_companies', 'production_countries', 'spoken_languages']),
    (credits_df, ['cast', 'crew']),
    (keywords_df, ['keywords']),
)
for frame, columns in literal_columns:
    for col in columns:
        frame[col] = frame[col].apply(safe_literal_eval)

print("DataFrames after applying safe_literal_eval:")
for frame in (movies_metadata_df, credits_df, keywords_df):
    display(frame.head())

In [None]:
def extract_names(list_of_dicts):
    """Collect the 'name' values from a list of dicts.

    Args:
        list_of_dicts: A list whose items are dicts carrying a 'name' key.
            Items that are already plain strings are kept as they are, so
            applying the function a second time to an already-extracted column
            returns the same names instead of an empty list.

    Returns:
        list: The names in input order. Items that are neither a dict with a
        'name' key nor a string are skipped. ``[]`` is returned when the input
        is not a list.
    """
    if not isinstance(list_of_dicts, list):
        return []

    names = []
    for item in list_of_dicts:
        if isinstance(item, dict):
            if 'name' in item:
                names.append(item['name'])
        elif isinstance(item, str):
            # Already-extracted name: pass it through unchanged.
            names.append(item)
    return names

# Reduce every parsed list-of-dicts column to a plain list of names.
# For 'cast' and 'crew' only the 'name' field is kept as well; a more detailed
# extraction would also retain the character (cast) or job (crew).
name_columns = (
    (movies_metadata_df, ['genres', 'production_companies', 'production_countries', 'spoken_languages']),
    (credits_df, ['cast', 'crew']),
    (keywords_df, ['keywords']),
)
for frame, columns in name_columns:
    for col in columns:
        frame[col] = frame[col].apply(extract_names)

print("DataFrames after extracting names:")
for frame in (movies_metadata_df, credits_df, keywords_df):
    display(frame.head())

In [None]:
# Select the numeric columns of the metadata frame and print the interquartile
# range (75th percentile minus 25th percentile) of each one.
numerical_cols = movies_metadata_df.select_dtypes(include=['number']).columns
print("Numerical columns in movies_metadata_df:", numerical_cols)

for col in numerical_cols:
    column_values = movies_metadata_df[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    print(f"\nIQR for {col}: {IQR}")

In [None]:
# Count, per numeric column, the rows lying outside the 1.5 * IQR fences
# (below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR).
for col in numerical_cols:
    column_values = movies_metadata_df[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    is_outlier = column_values.lt(lower_bound) | column_values.gt(upper_bound)
    outliers = movies_metadata_df[is_outlier]
    print(f"\nNumber of outliers in {col}: {len(outliers)}")

In [None]:
# Inner-join the metadata with the keywords, then with the credits, on 'id'.
# Only ids present in all three frames remain in the result.
movies_keywords = movies_metadata_df.merge(keywords_df, on='id', how='inner')
merged_df = movies_keywords.merge(credits_df, on='id', how='inner')

print("First few rows of the merged DataFrame:")
display(merged_df.head())

print("\nShape of the merged DataFrame:")
display(merged_df.shape)

In [None]:
# Lists are unhashable, so the list-valued columns are temporarily converted to
# tuples to let DataFrame.duplicated() compare whole rows, and converted back
# to lists afterwards.
list_cols = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'cast', 'crew']


def _as_tuple(value):
    """Return a tuple copy of a list; leave any other value untouched."""
    if isinstance(value, list):
        return tuple(value)
    return value


def _as_list(value):
    """Return a list copy of a tuple; leave any other value untouched."""
    if isinstance(value, tuple):
        return list(value)
    return value


for col in list_cols:
    merged_df[col] = merged_df[col].apply(_as_tuple)

num_duplicates = merged_df.duplicated().sum()
print(f"\nNumber of duplicate rows in merged_df after converting list columns: {num_duplicates}")

# Restore the list representation for the cells that follow.
for col in list_cols:
    merged_df[col] = merged_df[col].apply(_as_list)

print("\nSummary statistics for numerical columns in merged_df:")
display(merged_df.describe())


In [None]:
# Write the merged frame to CSV without the index column. List-valued columns
# are stored as their text representation in the file.
output_path = 'cleaned_movies_data.csv'
merged_df.to_csv(output_path, index=False)
print("Cleaned and merged DataFrame saved successfully to 'cleaned_movies_data.csv'")

In [None]:
# Budget-to-revenue ratio. Dividing by a zero revenue yields +/-inf, which is
# mapped to 0. The cleaned Series is assigned to the column directly rather than
# calling `merged_df[col].replace(..., inplace=True)`: that chained inplace call
# warns on pandas 2.2+ and, under Copy-on-Write (pandas 3.0 default), modifies
# only a temporary object so the infinities would stay in the frame.
budget_to_revenue = merged_df['budget'] / merged_df['revenue']
merged_df['budget_to_revenue_ratio'] = budget_to_revenue.replace([float('inf'), float('-inf')], 0)
# NOTE(review): rows where budget and revenue are both 0 produce NaN (0/0) and
# are left as NaN here, exactly as before — decide whether they should be 0 too.

# Calendar year of the release date (NaN where the date is missing).
merged_df['release_year'] = merged_df['release_date'].dt.year

# One count feature per list-valued column, e.g. num_genres, num_cast.
list_cols = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'cast', 'crew']
for col in list_cols:
    merged_df[f'num_{col}'] = merged_df[col].apply(len)

print("Merged DataFrame with new features:")
display(merged_df.head())

In [None]:
from collections import Counter

# Canonical, hashable key for a movie's set of genres
def get_genre_combinations(genre_list):
    """Return the given genres as a tuple in sorted order.

    Sorting makes the result independent of the input order, and the tuple is
    hashable, so it can be used as a key when counting genre combinations.

    Args:
        genre_list: Iterable of genre names for one movie.

    Returns:
        tuple: The genre names in ascending sort order (empty tuple for no genres).
    """
    ordered_genres = list(genre_list)
    ordered_genres.sort()
    return tuple(ordered_genres)

# Turn each movie's genre list into a sorted tuple and tally identical tuples.
genre_combinations = merged_df['genres'].apply(get_genre_combinations)
genre_combination_counts = Counter(genre_combinations.tolist())

# Ten most frequent combinations, most common first.
most_common_genre_combinations = genre_combination_counts.most_common(10)

print("Most common genre combinations:")
# An empty tuple (movie without genres) is reported as 'No Genres'.
for combination, count in most_common_genre_combinations:
    print(f"{', '.join(combination) if combination else 'No Genres'}: {count}")

In [None]:
# `plt` and `sns` are used below but are not imported in any visible cell of this
# notebook, so a fresh kernel (Restart & Run All) fails here with NameError.
# Import them where they are first needed.
import matplotlib.pyplot as plt
import seaborn as sns

# Flatten the list of production companies and count their occurrences
all_production_companies = [company for sublist in merged_df['production_companies'] for company in sublist]
production_company_counts = Counter(all_production_companies)

# Get the most common production companies
most_common_production_companies = production_company_counts.most_common(20)

print("Most common production companies:")
for company, count in most_common_production_companies:
    print(f"{company}: {count}")

# Visualize the top production companies as a horizontal bar chart
top_companies_df = pd.DataFrame(most_common_production_companies, columns=['Company', 'Number of Movies'])

plt.figure(figsize=(12, 8))
sns.barplot(x='Number of Movies', y='Company', data=top_companies_df, palette='viridis')
plt.title('Top 20 Production Companies by Number of Movies')
plt.xlabel('Number of Movies')
plt.ylabel('Production Company')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build the text document used for TF-IDF from a movie's list-valued columns
def combine_features(row):
    """Join a row's genres, keywords, cast and crew into one string.

    Each column's items are joined with single spaces, and the four resulting
    pieces are joined with single spaces in the order genres, keywords, cast,
    crew. An empty column contributes an empty piece, so separators are kept.

    Args:
        row: Mapping (e.g. a DataFrame row) with list-valued 'genres',
            'keywords', 'cast' and 'crew' entries containing strings.

    Returns:
        str: The combined text.
    """
    text_columns = ('genres', 'keywords', 'cast', 'crew')
    return ' '.join(' '.join(row[column]) for column in text_columns)

# One text document per movie, built row by row with combine_features.
merged_df['combined_features'] = merged_df.apply(combine_features, axis=1)

# Vectorise the documents with TF-IDF, ignoring English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['combined_features'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Pairwise cosine similarity between every pair of movies.
# NOTE(review): the result is a square matrix with one row and one column per
# movie, so memory grows quadratically with the number of rows — confirm it fits
# for the full dataset.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Cosine similarity matrix shape:", cosine_sim.shape)

In [None]:
# Function to get movie recommendations
def get_recommendations(movie_title, cosine_sim=cosine_sim, df=merged_df):
    """
    Generates movie recommendations based on cosine similarity.

    Rows of ``df`` and rows/columns of ``cosine_sim`` are matched by position,
    so ``cosine_sim[i][j]`` must be the similarity between the i-th and j-th
    row of ``df``. The lookup works for any index on ``df`` (it does not rely
    on a default 0..n-1 index).

    Note: the default values of ``cosine_sim`` and ``df`` are bound when this
    cell is executed; re-run the cell after rebuilding either object.

    Args:
        movie_title (str): The title of the movie to get recommendations for.
        cosine_sim (numpy.ndarray): The cosine similarity matrix.
        df (pandas.DataFrame): The DataFrame containing movie information.

    Returns:
        pandas.DataFrame: A DataFrame of the top 5 recommended movies (fewer if
        the dataset has fewer than 6 rows), or an empty DataFrame when the
        title is not found.
    """
    # Row positions (not index labels) of the movies matching the title;
    # cosine_sim and .iloc below are both addressed by position.
    title_matches = (df['title'] == movie_title).to_numpy().nonzero()[0]

    if len(title_matches) == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return pd.DataFrame()

    # Assuming the first match is the desired movie
    idx = int(title_matches[0])

    # Pair every other movie's position with its similarity to the query movie.
    # The query row is excluded explicitly instead of assuming it sorts first:
    # with tied scores (duplicates, all-zero vectors) it may not be at the top.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the 5 most similar movies
    movie_indices = [position for position, _ in sim_scores[:5]]

    # Return the top 5 most similar movies
    return df[['title', 'genres', 'vote_average', 'vote_count']].iloc[movie_indices]

# Demo: recommend movies to user 'x' based on a single liked title ('Toy Story'),
# using the default similarity matrix and DataFrame of get_recommendations.
sample_movie_for_user_x = 'Toy Story'
recommendations_for_user_x = get_recommendations(movie_title=sample_movie_for_user_x)

print(f"Top 5 movie recommendations for user 'x' (based on liking '{sample_movie_for_user_x}'):")
display(recommendations_for_user_x)