In [None]:
import numpy as np
from IPython.display import Image, display
import pandas as pd
import sqlalchemy as sa
import requests
import json

from pandas.core.interchange.dataframe_protocol import DataFrame


def _load_indexed(csv_path):
    """Read one dataset CSV and use its 'id' column as the index."""
    return pd.read_csv(csv_path).set_index('id')


# One dataframe per dataset table, each indexed by its 'id' column.
movies_df = _load_indexed('../../dataset/movies.csv')
actors_df = _load_indexed('../../dataset/actors.csv')
countries_df = _load_indexed('../../dataset/countries.csv')
crew_df = _load_indexed('../../dataset/crew.csv')
genres_df = _load_indexed('../../dataset/genres.csv')
languages_df = _load_indexed('../../dataset/languages.csv')
posters_df = _load_indexed('../../dataset/posters.csv')
releases_df = _load_indexed('../../dataset/releases.csv')
studios_df = _load_indexed('../../dataset/studios.csv')
themes_df = _load_indexed('../../dataset/themes.csv')

# Show the movies dataframe as this cell's output.
movies_df

Languages dataframe normalization and sql insertion

# 2.0 Data Analysis
## 2.1 Movies Dataframe Analysis

In [None]:
# Display the movies dataframe as this cell's output.
movies_df

#### 2.1.1 - Analyze the distribution of ratings to identify trends.

In [None]:
import matplotlib.pyplot as plt

# Keep rows that have a rating, then drop values outside the 1.5 * IQR fences.
filtered_df = movies_df.dropna(subset=['rating'])
q1 = filtered_df['rating'].quantile(0.25)
q3 = filtered_df['rating'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# between() is inclusive on both ends, matching ">= lower and <= upper".
filtered_df = filtered_df[filtered_df['rating'].between(lower_bound, upper_bound)]

print(f"Removed {len(movies_df) - len(filtered_df)} rows with nulls or outliers in 'rating'.")

# Histogram of the remaining ratings.
ax = filtered_df['rating'].plot(kind='hist', bins=10, title='Rating Distribution', edgecolor='black')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

#### 2.1.2 - Find the shortest and longest movies, analyze the distribution of movie durations, and investigate the relationship between duration and rating. There are evident outliers, and null values can also interfere with the results.

In [None]:
# Summary statistics over the raw 'minute' column (no outlier filtering in this cell).
print(f"Shortest movie: {movies_df['minute'].min()} minutes")
print(f"Longest movie: {movies_df['minute'].max()} minutes")
print(f"Average duration: {movies_df['minute'].mean():.2f} minutes")

# Horizontal box plot of all durations.
ax = movies_df['minute'].plot(kind='box', vert=False, title='Movie Duration Distribution')
ax.set_xlabel('Duration (minutes)')
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

#### 2.1.2.1 - If we remove those elements, we get a more accurate plot.

In [None]:
# Keep rows that have a duration, then drop values outside the 1.5 * IQR fences.
filtered_df = movies_df.dropna(subset=['minute'])
q1 = filtered_df['minute'].quantile(0.25)
q3 = filtered_df['minute'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# between() is inclusive on both ends, matching ">= lower and <= upper".
filtered_df = filtered_df[filtered_df['minute'].between(lower_bound, upper_bound)]

print(f"Removed {len(movies_df) - len(filtered_df)} rows with nulls or outliers in 'minute'.")

# Same summary as before, now over the filtered durations.
print(f"Shortest movie: {filtered_df['minute'].min()} minutes")
print(f"Longest movie: {filtered_df['minute'].max()} minutes")
print(f"Average duration: {filtered_df['minute'].mean():.2f} minutes")

# Horizontal box plot of the filtered durations.
ax = filtered_df['minute'].plot(kind='box', vert=False, title='Movie Duration Distribution')
ax.set_xlabel('Duration (minutes)')
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

#### 2.1.3 - Investigate how the duration of a movie (minute) correlates with its rating (rating). This can reveal if longer movies tend to have better ratings.

In [None]:
import numpy as np

# Rows must have both a duration and a rating.
filtered_df = movies_df.dropna(subset=['minute', 'rating'])

# 1.5 * IQR fences for duration.
q1_min = filtered_df['minute'].quantile(0.25)
q3_min = filtered_df['minute'].quantile(0.75)
iqr_min = q3_min - q1_min
lower_min = q1_min - 1.5 * iqr_min
upper_min = q3_min + 1.5 * iqr_min

# 1.5 * IQR fences for rating.
q1_rat = filtered_df['rating'].quantile(0.25)
q3_rat = filtered_df['rating'].quantile(0.75)
iqr_rat = q3_rat - q1_rat
lower_rat = q1_rat - 1.5 * iqr_rat
upper_rat = q3_rat + 1.5 * iqr_rat

# A row is kept only when both values sit inside their (inclusive) fences.
minute_in_range = filtered_df['minute'].between(lower_min, upper_min)
rating_in_range = filtered_df['rating'].between(lower_rat, upper_rat)
filtered_df = filtered_df[minute_in_range & rating_in_range]

print(f"Removed rows with nulls or outliers in 'minute' or 'rating': {len(movies_df) - len(filtered_df)}.")

# Scatter of rating against duration.
plt.scatter(filtered_df['minute'], filtered_df['rating'], alpha=0.5)
plt.title('Rating vs. Movie Duration')
plt.xlabel('Duration (minutes)')
plt.ylabel('Rating')
plt.grid(alpha=0.3)

# Degree-1 least-squares fit drawn over the scatter as a trend line.
fit = np.polyfit(filtered_df['minute'], filtered_df['rating'], 1)
slope, intercept = fit
plt.plot(filtered_df['minute'], slope * filtered_df['minute'] + intercept, color='red', alpha=0.7)
plt.show()

#### 2.1.4 - Analyze the variability in movie title lengths and identify movies with the shortest and longest titles.

In [None]:
# Work on an independent copy of the rows that have a title, so adding a column
# does not touch movies_df.
filtered_df = movies_df.dropna(subset=['name']).copy()
filtered_df['title_length'] = filtered_df['name'].str.len()

# Drop title lengths outside the 1.5 * IQR fences.
q1 = filtered_df['title_length'].quantile(0.25)
q3 = filtered_df['title_length'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# between() is inclusive on both ends, matching ">= lower and <= upper".
filtered_df = filtered_df[filtered_df['title_length'].between(lower_bound, upper_bound)]

print(f"Removed {len(movies_df) - len(filtered_df)} rows with nulls or outliers in 'title_length'.")

# Histogram of the remaining title lengths.
ax = filtered_df['title_length'].plot(kind='hist', bins=20, title='Distribution of Title Lengths', edgecolor='black')
ax.set_xlabel('Title Length (characters)')
ax.set_ylabel('Frequency')
ax.grid(alpha=0.3)
plt.show()

#### 2.1.5 - Explore potential correlations between numerical variables like minute and rating.

In [None]:
# Correlate rating with duration over rows that have both values.
# Only nulls are dropped here; outliers are left in.
filtered_df = movies_df.dropna(subset=['minute', 'rating'])
ratings = filtered_df['rating']
minutes = filtered_df['minute']
correlation = ratings.corr(minutes)
print(f"Correlation between Rating and Movie Duration: {correlation:.2f}")

#### 2.1.6 - Identify the highest-rated and lowest-rated movies.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Rows must have both a duration and a rating.
filtered_df = movies_df.dropna(subset=['minute', 'rating'])

# "Long" means strictly more than 120 minutes.
long_movies = filtered_df[filtered_df['minute'].gt(120)]

# The ten best-rated movies among the long ones.
top_long_movies = long_movies.nlargest(10, 'rating')

print("Top 10 Longest Highly Rated Movies:")
print(top_long_movies[['name', 'minute', 'rating']])

# Horizontal bar chart, one bar per movie name.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_long_movies['name'], top_long_movies['rating'], color='skyblue', edgecolor='black')
ax.set_xlabel('Rating')
ax.set_ylabel('Movie Name')
ax.set_title('Top 10 Longest Highly Rated Movies')
ax.invert_yaxis()  # first row of the table ends up at the top of the chart
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# The five lowest-rated movies among all rows that have a rating.
filtered_df = movies_df.dropna(subset=['rating'])
worst_movies = filtered_df.nsmallest(5, 'rating')

print("\nBottom 5 Movies:")
print(worst_movies[['name', 'rating']])

#### 2.1.7 - Explore how the average movie rating has changed over the years.

In [None]:
# Derive a 'year' column on movies_df from 'date'; unparseable values become NaN.
# NOTE(review): if 'date' already holds bare numeric years, to_datetime would read
# them as epoch offsets rather than years — confirm the column format.
movies_df['year'] = pd.to_datetime(movies_df['date'], errors='coerce').dt.year

# Rows must have both a year and a rating.
filtered_df = movies_df.dropna(subset=['year', 'rating'])

# Mean rating for each year.
average_rating_per_year = filtered_df.groupby('year')['rating'].mean()

# Line chart of the yearly averages.
fig, ax = plt.subplots(figsize=(10, 6))
average_rating_per_year.plot(ax=ax, kind='line', marker='o', title='Average Movie Rating Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Average Rating')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

#### 2.1.8 -  Relationship Between Description Length and Movie Rating.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Word count per description, stored on movies_df; rows without a description get NaN
# because the assignment aligns on the index.
description_words = movies_df['description'].dropna().str.split()
movies_df['description_length'] = description_words.apply(len)

# Rows must have both a word count and a rating.
filtered_df = movies_df.dropna(subset=['description_length', 'rating'])

# Correlation between rating and description length.
ratings = filtered_df['rating']
word_counts = filtered_df['description_length']
correlation = ratings.corr(word_counts)
print(f"Correlation between Description Length and Rating: {correlation:.2f}")

# Scatter of rating against description length.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(filtered_df['description_length'], filtered_df['rating'], alpha=0.5, color='skyblue', edgecolor='black')
ax.set_title('Relationship Between Description Length and Movie Rating')
ax.set_xlabel('Description Length (number of words)')
ax.set_ylabel('Rating')
ax.grid(alpha=0.3)

# Degree-1 least-squares fit drawn over the scatter as a trend line.
fit = np.polyfit(filtered_df['description_length'], filtered_df['rating'], 1)
slope, intercept = fit
ax.plot(filtered_df['description_length'], slope * filtered_df['description_length'] + intercept, color='red', alpha=0.7)
fig.tight_layout()
plt.show()

If the correlation is strong (positive or negative), it indicates a meaningful relationship between description length and rating.

#### 2.1.9 - Find movies with the most detailed or least detailed descriptions.

In [None]:
# Filter nulls for 'description' and explicitly create a copy
filtered_df = movies_df[movies_df['description'].notnull()].copy()
filtered_df['description_length'] = filtered_df['description'].str.len()

# Identify extremes
longest_descriptions = filtered_df.nlargest(5, 'description_length')
shortest_descriptions = filtered_df.nsmallest(5, 'description_length')

print("Movies with the Longest Descriptions:")
print(longest_descriptions[['name', 'description']])

print("\nMovies with the Shortest Descriptions:")
print(shortest_descriptions[['name', 'description']])

#### 2.1.10 - Explore the variability in the length of movie taglines and identify movies with the shortest or longest taglines.

In [None]:
# Filter nulls for 'tagline' and explicitly create a copy
filtered_df = movies_df[movies_df['tagline'].notnull()].copy()
filtered_df['tagline_length'] = filtered_df['tagline'].str.len()

# Plot
filtered_df['tagline_length'].plot(kind='hist', bins=20, title='Distribution of Tagline Lengths', edgecolor='black')
plt.xlabel('Tagline Length (characters)')
plt.ylabel('Frequency')
plt.grid(alpha=0.3)
plt.show()

## 2.2 Actors Dataframe Analysis

In [None]:
# Display the actors dataframe as this cell's output.
actors_df

#### 2.2.1 - Analyze the length of actor names and identify actors with the shortest and longest names.

In [None]:
# Calculate the length of each actor's name
actors_df['name_length'] = actors_df['name'].str.len()

# Remove outliers
q1, q3 = actors_df['name_length'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
filtered_actors = actors_df[(actors_df['name_length'] >= lower_bound) & (actors_df['name_length'] <= upper_bound)].copy()

print(f"Removed {len(actors_df) - len(filtered_actors)} rows with nulls or outliers in 'name_length'.")

# Plot
filtered_actors['name_length'].plot(kind='hist', bins=20, title='Distribution of Actor Name Lengths', edgecolor='black')
plt.xlabel('Name Length (characters)')
plt.ylabel('Frequency')
plt.grid(alpha=0.3)
plt.show()

# Identify extremes
shortest_names = filtered_actors.nsmallest(5, 'name_length')
longest_names = filtered_actors.nlargest(5, 'name_length')

print("\nActors with the Shortest Names:")
print(shortest_names[['name', 'name_length']])

print("\nActors with the Longest Names:")
print(longest_names[['name', 'name_length']])

#### 2.2.2 - Identify the most common roles played by actors and their frequency.

In [None]:
# Count the frequency of each role
role_counts = actors_df['role'].value_counts()

# Plot the top 10 most common roles
role_counts.head(10).plot(kind='bar', figsize=(10, 6), title='Top 10 Most Common Roles')
plt.xlabel('Role')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nTop 10 Most Common Roles:")
print(role_counts.head(10))

#### 2.2.3 - Analyze the length of role descriptions and identify the shortest and longest role descriptions.

In [None]:
# Calculate the length of each role description
actors_df['role_length'] = actors_df['role'].str.len()

# Remove outliers
q1, q3 = actors_df['role_length'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
filtered_actors = actors_df[(actors_df['role_length'] >= lower_bound) & (actors_df['role_length'] <= upper_bound)].copy()

print(f"Removed {len(actors_df) - len(filtered_actors)} rows with nulls or outliers in 'role_length'.")

# Plot
filtered_actors['role_length'].plot(kind='hist', bins=20, title='Distribution of Role Description Lengths', edgecolor='black')
plt.xlabel('Role Length (characters)')
plt.ylabel('Frequency')
plt.grid(alpha=0.3)
plt.show()

# Identify extremes
shortest_roles = filtered_actors.nsmallest(5, 'role_length')
longest_roles = filtered_actors.nlargest(5, 'role_length')

print("\nShortest Role Descriptions:")
print(shortest_roles[['role', 'role_length']])

print("\nLongest Role Descriptions:")
print(longest_roles[['role', 'role_length']])

#### 2.2.4 - Count the number of unique roles and display a few examples.

In [None]:
# Count unique roles
unique_roles = actors_df['role'].nunique()
print(f"Total number of unique roles: {unique_roles}")

# Display examples of unique roles
unique_role_examples = actors_df['role'].dropna().unique()[:10]
print("\nExamples of unique roles:")
print(unique_role_examples)

## 2.3 Crew Dataframe Analysis

In [None]:
# List the column labels of the crew dataframe (the 'id' index is not included).
crew_df.columns

#### 2.3.1 - Analyze the distribution of crew roles to see which roles are most common.

In [None]:
import matplotlib.pyplot as plt

# Keep only crew rows that have a role.
filtered_crew = crew_df[crew_df['role'].notna()]

# Occurrences of each role, most frequent first.
role_counts = filtered_crew['role'].value_counts()
top_roles = role_counts.head(10)

# Bar chart of the ten most frequent crew roles.
ax = top_roles.plot(kind='bar', figsize=(10, 6), color='skyblue', edgecolor='black', title='Top 10 Crew Roles')
ax.set_xlabel('Role')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nTop 10 Most Common Roles:")
print(top_roles)

#### 2.3.2 - Determine the size of the crew for each movie and identify movies with the largest or smallest crews.

In [None]:
# Remove nulls in 'movie_id'
filtered_crew = crew_df.dropna(subset=['movie_id'])

# Count crew members per movie
crew_size = filtered_crew.groupby('movie_id').size()

# Remove outliers using IQR
q1, q3 = crew_size.quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
filtered_crew_size = crew_size[(crew_size >= lower_bound) & (crew_size <= upper_bound)]

print(f"Removed {len(crew_size) - len(filtered_crew_size)} movies with crew sizes as outliers.")

# Plot the distribution of crew sizes
plt.figure(figsize=(10, 6))
filtered_crew_size.plot(kind='hist', bins=20, color='skyblue', edgecolor='black', title='Distribution of Crew Sizes per Movie')
plt.xlabel('Crew Size')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Display movies with the largest and smallest crews
largest_crews = crew_size.nlargest(5)
smallest_crews = crew_size.nsmallest(5)

print("\nMovies with the Largest Crews:")
print(largest_crews)

print("\nMovies with the Smallest Crews:")
print(smallest_crews)

#### 2.3.3 - Identify crew members who appear most frequently across movies (e.g., prolific directors or producers).

In [None]:
# Keep only crew rows that have a name.
filtered_crew = crew_df[crew_df['name'].notna()]

# Number of rows per crew member name, most frequent first.
crew_member_counts = filtered_crew['name'].value_counts()
top_members = crew_member_counts.head(10)

# Bar chart of the ten most frequent names.
ax = top_members.plot(kind='bar', figsize=(10, 6), color='skyblue', edgecolor='black', title='Top 10 Most Frequent Crew Members')
ax.set_xlabel('Crew Member')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nTop 10 Most Frequent Crew Members:")
print(top_members)

#### 2.3.4 - Analyze the most common roles assigned in movies, highlighting trends in crew composition.

In [None]:
# Remove nulls in 'movie_id' and 'role'
filtered_crew = crew_df.dropna(subset=['movie_id', 'role'])

# Count roles per movie
common_roles_per_movie = filtered_crew.groupby(['movie_id', 'role']).size().reset_index(name='count')

# Find the most common role for each movie
most_common_role = common_roles_per_movie.loc[common_roles_per_movie.groupby('movie_id')['count'].idxmax()]

print("\nMost Common Roles by Movie:")
print(most_common_role.head(10))

#### 2.3.5 - Analyze the diversity of roles taken on by individual crew members.

In [None]:
# Remove nulls in 'name' and 'role'
filtered_crew = crew_df.dropna(subset=['name', 'role'])

# Count the number of unique roles for each crew member
crew_role_diversity = filtered_crew.groupby('name')['role'].nunique().sort_values(ascending=False)

# Plot the top 10 crew members with the most diverse roles
crew_role_diversity.head(10).plot(kind='bar', figsize=(10, 6), color='skyblue', edgecolor='black', title='Top 10 Crew Members with Most Diverse Roles')
plt.xlabel('Crew Member')
plt.ylabel('Number of Unique Roles')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nTop 10 Crew Members with the Most Diverse Roles:")
print(crew_role_diversity.head(10))

#### 2.3.6 - Analyze how frequently crew members collaborate on the same movies.

In [None]:
from itertools import combinations
from collections import Counter

# Rows must identify both the movie and the crew member.
# NOTE(review): the loading cell makes 'id' the index of crew_df; this cell expects a
# separate 'movie_id' column — confirm it exists.
filtered_crew = crew_df.dropna(subset=['movie_id', 'name'])

# Every unordered pair of distinct crew members per movie.
# Names are de-duplicated first: someone credited under several roles on the same
# movie would otherwise produce self-pairs like (A, A) and make each of their
# collaborations on that movie count more than once. Sorting keeps (A, B) and (B, A)
# as the same key.
# The number of pairs grows quadratically with crew size, so very large crews are costly.
crew_collaborations = filtered_crew.groupby('movie_id')['name'].apply(
    lambda names: list(combinations(sorted(set(names)), 2))
)

# Count in how many movies each pair appears; a generator avoids building a second
# flattened list of all pairs.
collaboration_counts = Counter(pair for pairs in crew_collaborations for pair in pairs)

# One row per pair, most frequent collaborations first.
collaboration_df = pd.DataFrame(
    list(collaboration_counts.items()), columns=['Pair', 'Count']
).sort_values(by='Count', ascending=False)

print("\nTop 10 Crew Collaborations:")
print(collaboration_df.head(10))

## 2.4 Languages Dataframe Analysis

In [None]:
# Display the languages dataframe as this cell's output.
languages_df

In [None]:
# List the column labels of the languages dataframe (the 'id' index is not included).
languages_df.columns

#### 2.4.1 - Analyze the distribution of languages in movies to see which are most commonly used.

In [None]:
import matplotlib.pyplot as plt

# Keep only rows that have a language.
filtered_languages = languages_df[languages_df['language'].notna()]

# Occurrences of each language, most frequent first.
language_counts = filtered_languages['language'].value_counts()
top_languages = language_counts.head(10)

# Bar chart of the ten most frequent languages.
ax = top_languages.plot(kind='bar', figsize=(10, 6), color='skyblue', edgecolor='black', title='Top 10 Most Common Languages in Movies')
ax.set_xlabel('Language')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nTop 10 Most Common Languages:")
print(top_languages)

#### 2.4.2 - Explore how different types of languages (e.g., “Spoken”, “Subtitled”) are distributed in movies.

In [None]:
# Remove nulls in 'type'
filtered_types = languages_df.dropna(subset=['type'])

# Count the frequency of each type
type_counts = filtered_types['type'].value_counts()

# Plot the distribution of language types
type_counts.plot(kind='bar', figsize=(10, 6), color='orange', edgecolor='black', title='Distribution of Language Types')
plt.xlabel('Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nDistribution of Language Types:")
print(type_counts)

#### 2.4.3 - Calculate the average number of languages associated with each type (Spoken, Subtitled, etc.) and compare the results.

In [None]:
# Remove nulls in 'type' and 'movie_id'
filtered_languages = languages_df.dropna(subset=['type', 'movie_id'])

# Calculate the number of languages per movie
languages_per_movie = filtered_languages.groupby('movie_id').size().rename('language_count')  # Ensure the Series has a name

# Merge the language counts with the type of language
movie_types = filtered_languages[['movie_id', 'type']].drop_duplicates()
languages_with_type = pd.merge(movie_types, languages_per_movie, left_on='movie_id', right_index=True, how='inner')

# Calculate the average number of languages per type
average_languages_per_type = languages_with_type.groupby('type')['language_count'].mean().sort_values(ascending=False)

# Plot the results
plt.figure(figsize=(10, 6))
average_languages_per_type.plot(kind='bar', color='purple', edgecolor='black', title='Average Number of Languages Per Type')
plt.xlabel('Type')
plt.ylabel('Average Number of Languages')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nAverage Number of Languages Per Type:")
print(average_languages_per_type)

#### 2.4.4 - Identify the most frequent combinations of languages used together in movies.

In [None]:
from itertools import combinations
from collections import Counter

# Rows must identify both the movie and the language.
# NOTE(review): the loading cell makes 'id' the index of languages_df; this cell expects
# a separate 'movie_id' column — confirm it exists.
filtered_languages = languages_df.dropna(subset=['movie_id', 'language'])

# Every unordered pair of distinct languages per movie.
# Languages are de-duplicated first: a language listed under more than one type for the
# same movie would otherwise produce self-pairs like (X, X) and make its pairs count
# more than once for that movie. Sorting keeps (X, Y) and (Y, X) as the same key.
language_combinations = filtered_languages.groupby('movie_id')['language'].apply(
    lambda langs: list(combinations(sorted(set(langs)), 2))
)

# Count in how many movies each pair appears; a generator avoids building a second
# flattened list of all pairs.
combination_counts = Counter(pair for pairs in language_combinations for pair in pairs)

# One row per pair, most frequent combinations first.
combination_df = pd.DataFrame(
    list(combination_counts.items()), columns=['Combination', 'Count']
).sort_values(by='Count', ascending=False)

print("\nTop 10 Most Common Language Combinations:")
print(combination_df.head(10))

## 2.5 Posters Dataframe Analysis

In [None]:
# Display the posters dataframe as this cell's output.
posters_df

## 2.6 Releases Dataframe Analysis

In [None]:
# List the column labels of the releases dataframe (the 'id' index is not included).
releases_df.columns

 #### 2.6.1 - Analyze which countries have the most releases.

In [None]:
import matplotlib.pyplot as plt

# Keep only release rows that have a country.
filtered_releases = releases_df[releases_df['country'].notna()]

# Number of release rows per country, most frequent first.
country_counts = filtered_releases['country'].value_counts()
top_countries = country_counts.head(10)

# Bar chart of the ten countries with the most releases.
ax = top_countries.plot(kind='bar', figsize=(10, 6), color='skyblue', edgecolor='black', title='Top 10 Countries by Number of Releases')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Releases')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nTop 10 Countries by Number of Releases:")
print(top_countries)

  #### 2.6.2 - Analyze how the number of releases has changed over the years.

In [None]:
# Derive a 'year' column on releases_df from 'date'; unparseable values become NaN.
release_dates = pd.to_datetime(releases_df['date'], errors='coerce')
releases_df['year'] = release_dates.dt.year

# Keep only rows whose year could be derived.
filtered_releases = releases_df[releases_df['year'].notna()]

# Number of release rows per year.
releases_per_year = filtered_releases.groupby('year').size()

# Line chart of releases over time.
fig, ax = plt.subplots(figsize=(10, 6))
releases_per_year.plot(ax=ax, kind='line', marker='o', title='Number of Releases Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Releases')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# Only the last ten years of the series are printed.
print("\nReleases Per Year:")
print(releases_per_year.tail(10))

   #### 2.6.3 - Analyze the distribution of release types (e.g., theatrical, streaming, etc.).

In [None]:
# Keep only release rows that have a type.
filtered_releases = releases_df[releases_df['type'].notna()]

# Occurrences of each release type, most frequent first.
type_counts = filtered_releases['type'].value_counts()

# Bar chart with one bar per release type.
ax = type_counts.plot(kind='bar', figsize=(10, 6), color='green', edgecolor='black', title='Distribution of Release Types')
ax.set_xlabel('Release Type')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nDistribution of Release Types:")
print(type_counts)

   #### 2.6.4 - Compare the average rating of movies across different release types.

In [None]:
# Remove nulls in 'type' and 'rating' and create a copy to avoid warnings
filtered_releases = releases_df.dropna(subset=['type', 'rating']).copy()

# Convert 'rating' to numeric, coercing errors to NaN
filtered_releases['rating'] = pd.to_numeric(filtered_releases['rating'], errors='coerce')

# Remove rows with invalid ratings
filtered_releases = filtered_releases.dropna(subset=['rating'])

# Calculate the average rating for each release type
average_rating_by_type = filtered_releases.groupby('type')['rating'].mean().sort_values(ascending=False)

# Plot the results
plt.figure(figsize=(10, 6))
average_rating_by_type.plot(kind='bar', color='purple', edgecolor='black', title='Average Rating by Release Type')
plt.xlabel('Release Type')
plt.ylabel('Average Rating')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("\nAverage Rating by Release Type:")
print(average_rating_by_type)

   #### 2.6.5 - Analyze release trends for a specific country over time.

In [None]:
# Derive a 'year' column on releases_df from 'date'; unparseable values become NaN.
release_dates = pd.to_datetime(releases_df['date'], errors='coerce')
releases_df['year'] = release_dates.dt.year

# Restrict to rows whose country is exactly 'USA'.
filtered_country = releases_df[releases_df['country'].eq('USA')]

# Number of release rows per year for that country (groupby drops rows without a year).
releases_per_year_country = filtered_country.groupby('year').size()

# Line chart of releases over time for the selected country.
fig, ax = plt.subplots(figsize=(10, 6))
releases_per_year_country.plot(ax=ax, kind='line', marker='o', title='Number of Releases Over Time (USA)')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Releases')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# Only the last ten years of the series are printed.
print("\nReleases Per Year (USA):")
print(releases_per_year_country.tail(10))