Recommended Analysis
* What does the distribution of films look like by rating? By primary genre? (hint: use the first genre listed)
* What % of films received a Certified Fresh Tomatometer rating? What about Rotten?
* Explore new film releases over time. How has the volume of releases by month trended over time? What year/month were the newest films released?
* Compare average Tomatometer ratings by Studio. Which studios produce the highest-rated films, on average? The lowest?
* Compare the Tomatometer ratings against audience ratings. Which films showed the largest discrepancies between audiences and critics?
* Explore the critics' consensus text: what language is used most often?


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Rotten Tomatoes movies CSV (Kaggle input directory).
DATA_PATH = "/kaggle/input/movies-data/Rotten Tomatoes Movies 2.csv"

# Read the whole file into the notebook-wide DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by pandas' default describe().
df.describe()

In [None]:
# Column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# % of null values per column: the mean of the boolean "is missing" mask is the
# missing count divided by the number of rows.
df.isna().mean() * 100

In [None]:
# `critics_consensus` is deliberately kept in `df`.
# Dropping it here (in place) broke the notebook on a fresh Run All: the
# word-frequency and word-cloud cells for question 6 read df['critics_consensus']
# and raised KeyError. The in-place drop also failed whenever this cell was
# re-run, because the column was already gone.
# Instead, just report how much of the column is missing so the reader can judge it.
consensus_missing_pct = df["critics_consensus"].isnull().mean() * 100
print(f"critics_consensus missing values: {consensus_missing_pct:.1f}% (column kept for the consensus text analysis)")

In [None]:
# Re-check the (rows, columns) shape of df at this point in the notebook.
df.shape

In [None]:
# Visualise where values are missing: one heatmap cell per DataFrame cell,
# coloured by whether that value is null.
missing_mask = df.isnull()
sns.heatmap(missing_mask)
plt.show()

# 1. What does the distribution of films look like by rating? By primary genre? (hint: use the first genre listed)
# 

In [None]:
# Primary genre = first entry of the comma-separated `genre` string, with
# surrounding whitespace removed.
first_listed_genre = df['genre'].str.split(',').str.get(0)
df['primary_genre'] = first_listed_genre.str.strip()

# Count films for every (rating, primary genre) pair, then pivot the genres
# into columns; combinations that never occur are filled with 0.
films_per_rating_and_genre = df.groupby(['rating', 'primary_genre']).size()
rating_genre_distribution = films_per_rating_and_genre.unstack(fill_value=0)
rating_genre_distribution



In [None]:

# Stacked bar chart: one bar per rating, one coloured segment per primary genre.
ax = rating_genre_distribution.plot.bar(stacked=True, figsize=(12, 6))
ax.set_title('Distribution of Films by Rating and Primary Genre')
ax.set_xlabel('Rating')
ax.set_ylabel('Number of Films')
ax.legend(title='Primary Genre')
# Tilt the rating labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# 2. What % of films received a Certified Fresh Tomatometer rating? What about Rotten?
# 

In [None]:
# Share of all films (every row of df) whose Tomatometer status is
# "Certified Fresh" and, separately, "Rotten".
status = df['tomatometer_status']
total_films = df.shape[0]

# Boolean masks summed to counts; rows with a missing status match neither label.
certified_fresh_count = int(status.eq('Certified Fresh').sum())
rotten_count = int(status.eq('Rotten').sum())

# Convert the counts to percentages of the full film list.
certified_fresh_percentage = (certified_fresh_count / total_films) * 100
rotten_percentage = (rotten_count / total_films) * 100

print("Percentage of films with Certified Fresh rating:", certified_fresh_percentage)
print("Percentage of films with Rotten rating:", rotten_percentage)


In [None]:
# Pie chart of the Tomatometer status breakdown across ALL films.
#
# A pie chart rescales its wedges so they sum to 100%. Plotting only the
# "Certified Fresh" and "Rotten" percentages therefore showed each as a share
# of those two groups combined, not as a share of all films. Plotting the
# complete status breakdown makes every wedge label a true percentage of films.

# Total number of films (denominator for the percentages below).
total_films = len(df)

# Count films per status; rows without a status are shown as their own slice.
status_counts = df['tomatometer_status'].fillna('Unknown').value_counts()

# Counts and percentages for the two statuses the question asks about.
certified_fresh_count = int(status_counts.get('Certified Fresh', 0))
rotten_count = int(status_counts.get('Rotten', 0))
certified_fresh_percentage = (certified_fresh_count / total_films) * 100
rotten_percentage = (rotten_count / total_films) * 100

# One wedge per status. Colours are tied to the label so they stay stable
# regardless of slice order; any status without an explicit colour falls back to grey.
# NOTE(review): 'Fresh' is presumably the third status in this dataset - confirm.
status_colors = {
    'Certified Fresh': '#66c2a5',
    'Rotten': '#fc8d62',
    'Fresh': '#8da0cb',
}
labels = list(status_counts.index)
sizes = list(status_counts.values)
colors = [status_colors.get(label, '#b3b3b3') for label in labels]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Percentage of Films by Tomatometer Status')
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


# 3. Explore new film releases over time. How has the volume of releases by month trended over time? What year/month were the newest films released?
# 

In [None]:
# Monthly volume of theatrical releases over time, plus the newest release(s).

# Parse the theatrical release date; values that cannot be parsed become NaT.
df['in_theaters_date'] = pd.to_datetime(df['in_theaters_date'], errors='coerce')

# Year and month of release (NaN where the date is missing).
df['release_year'] = df['in_theaters_date'].dt.year
df['release_month'] = df['in_theaters_date'].dt.month

# Count releases per calendar month on a real monthly index. Grouping by the
# separate year/month columns produced (year, month) tuples that were plotted
# by position, so months with no releases were silently skipped and the
# x-axis was not a time axis.
release_dates = df['in_theaters_date'].dropna()
new_releases_by_month = release_dates.dt.to_period('M').value_counts().sort_index()

if new_releases_by_month.empty:
    print("No valid theatrical release dates to plot.")
else:
    # Insert the months that had no releases so the trend line drops to zero
    # there instead of bridging the gap.
    all_months = pd.period_range(
        new_releases_by_month.index.min(),
        new_releases_by_month.index.max(),
        freq='M',
    )
    new_releases_by_month = new_releases_by_month.reindex(all_months, fill_value=0)

    # Plotting the trend of new releases by month over time
    fig, ax = plt.subplots(figsize=(12, 6))
    new_releases_by_month.plot(ax=ax, marker='o', markersize=2, linewidth=1)
    ax.set_title('Trend of New Film Releases by Month Over Time')
    ax.set_xlabel('Year-Month')
    ax.set_ylabel('Number of New Releases')
    ax.grid(True)
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

# Finding the newest film release: every film sharing the latest release date.
newest_date = release_dates.max()
newest_release = df[df['in_theaters_date'] == newest_date]
newest_release_info = newest_release[['movie_title', 'in_theaters_date']]

print("Newest film release:")
print(newest_release_info)
if pd.notna(newest_date):
    # Direct answer to "what year/month were the newest films released?"
    print(f"\nNewest release year/month: {newest_date:%Y-%m}")


# 4. Compare average Tomatometer ratings by Studio. Which studios produce the highest-rated films, on average? The lowest?
# 

In [None]:
# Highest- and lowest-rated studios by average Tomatometer rating.
#
# Taking a single idxmax()/idxmin() reported one arbitrary studio among all
# those tied at the extreme, and the extremes are dominated by studios with
# very few rated films. Studios are therefore required to have a minimum
# number of rated films, and the top/bottom N are listed with their film counts.
TOP_N_STUDIOS = 10          # how many studios to list at each end
MIN_FILMS_PER_STUDIO = 10   # minimum rated films for a studio's average to be compared

# Per-studio mean rating and number of rated films, best average first.
studio_stats = (
    df.groupby('studio_name')['tomatometer_rating']
    .agg(average_rating='mean', rated_films='count')
    .sort_values('average_rating', ascending=False)
)

# Full (unfiltered) series of averages, kept under its original name.
average_tomatometer_ratings_by_studio = studio_stats['average_rating'].rename('tomatometer_rating')

# Only studios with enough rated films are ranked.
eligible_studios = studio_stats[studio_stats['rated_films'] >= MIN_FILMS_PER_STUDIO]

highest_rated_studio = None
lowest_rated_studio = None

if eligible_studios.empty:
    print(f"No studio has at least {MIN_FILMS_PER_STUDIO} rated films.")
else:
    # eligible_studios is sorted best-first, so the ends are the extremes.
    highest_rated_studio = eligible_studios.index[0]
    lowest_rated_studio = eligible_studios.index[-1]

    print(f"(Studios with at least {MIN_FILMS_PER_STUDIO} rated films)")
    print("Studios with the highest average Tomatometer ratings:")
    print(eligible_studios.head(TOP_N_STUDIOS))

    print("\nStudios with the lowest average Tomatometer ratings:")
    # Reverse so the lowest-rated studio is listed first.
    print(eligible_studios.tail(TOP_N_STUDIOS).iloc[::-1])


In [None]:
# Bar chart of average Tomatometer rating for the best and worst studios.
#
# Drawing one bar per studio gives an unreadable axis when there are many
# studios (NOTE(review): studio_name is presumably high-cardinality - confirm),
# and averages based on a handful of films are not comparable. Only studios
# with a minimum number of rated films are considered, and only the top and
# bottom N of those are drawn.
TOP_N_STUDIOS = 10          # bars shown at each end of the ranking
MIN_FILMS_PER_STUDIO = 10   # minimum rated films for a studio to be plotted

# Per-studio mean rating and number of rated films, best average first.
studio_stats = (
    df.groupby('studio_name')['tomatometer_rating']
    .agg(average_rating='mean', rated_films='count')
    .sort_values('average_rating', ascending=False)
)

# Full (unfiltered) series of averages, kept under its original name.
average_tomatometer_ratings_by_studio = studio_stats['average_rating'].rename('tomatometer_rating')

eligible_averages = studio_stats.loc[
    studio_stats['rated_films'] >= MIN_FILMS_PER_STUDIO, 'average_rating'
]

# Take both ends of the ranking; if there are few eligible studios show them all.
if len(eligible_averages) > 2 * TOP_N_STUDIOS:
    studios_to_plot = pd.concat(
        [eligible_averages.head(TOP_N_STUDIOS), eligible_averages.tail(TOP_N_STUDIOS)]
    )
else:
    studios_to_plot = eligible_averages

if studios_to_plot.empty:
    print(f"No studio has at least {MIN_FILMS_PER_STUDIO} rated films to plot.")
else:
    # Horizontal bars keep the studio names readable; ascending sort puts the
    # highest-rated studio at the top of the chart.
    fig, ax = plt.subplots(figsize=(12, 10))
    studios_to_plot.sort_values().plot.barh(ax=ax)
    ax.set_title(
        'Average Tomatometer Ratings by Studio\n'
        f'(highest and lowest {TOP_N_STUDIOS}, studios with at least {MIN_FILMS_PER_STUDIO} rated films)'
    )
    ax.set_xlabel('Average Tomatometer Rating')
    ax.set_ylabel('Studio')
    ax.grid(axis='x')
    plt.show()


# 5. Compare the Tomatometer ratings against audience ratings. Which films showed the largest discrepancies between audiences and critics?
# 

In [None]:
# Gap between critics and audiences: absolute difference of the two ratings.
critic_minus_audience = df['tomatometer_rating'] - df['audience_rating']
df['rating_discrepancy'] = critic_minus_audience.abs()

# Order every film from the largest gap to the smallest.
largest_discrepancies = df.sort_values('rating_discrepancy', ascending=False)

# Show the ten films where critics and audiences disagree the most.
report_columns = ['movie_title', 'tomatometer_rating', 'audience_rating', 'rating_discrepancy']
print("Top Films with the Largest Discrepancies between Audiences and Critics:")
print(largest_discrepancies.loc[:, report_columns].head(10))


In [None]:
# Critics (x) against audiences (y), one point per film, coloured by the
# size of the gap between the two ratings.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    df['tomatometer_rating'],
    df['audience_rating'],
    c=df['rating_discrepancy'],
    cmap='coolwarm',
    alpha=0.7,
)
ax.set_title('Tomatometer Ratings vs Audience Ratings')
ax.set_xlabel('Tomatometer Rating')
ax.set_ylabel('Audience Rating')
# The colour bar explains the point colours.
fig.colorbar(points, ax=ax, label='Rating Discrepancy')
ax.grid(True)
plt.show()


# 6. Explore the critics' consensus text: what language is used most often?


In [None]:
import re
from collections import Counter

from wordcloud import STOPWORDS

# Most frequent meaningful words in the critics' consensus text.
#
# Counting raw whitespace-separated tokens is case-sensitive, keeps punctuation
# attached ("film," vs "film") and is dominated by function words, so the
# "most common language" came out as a stop word. The text is lower-cased,
# tokenised with a regex and filtered against the word cloud package's English
# stop-word list before counting.

# Fail with a clear message if the column was removed earlier in the notebook.
if 'critics_consensus' not in df.columns:
    raise KeyError(
        "'critics_consensus' is missing from df; it must not be dropped before the consensus text analysis"
    )

# Drop rows with missing critic's consensus data
consensus_data = df['critics_consensus'].dropna().astype(str)

# Normalise case and curly apostrophes, then pull out alphabetic tokens
# (optionally with an internal apostrophe, e.g. "film's").
consensus_lower = ' '.join(consensus_data).lower().replace('\u2019', "'")
tokens = re.findall(r"[a-z]+(?:'[a-z]+)?", consensus_lower)

# Keep only informative words: no stop words, no single letters.
words = [token for token in tokens if len(token) > 1 and token not in STOPWORDS]

# Count the occurrence of each word
word_counts = Counter(words)

# Most frequent words, best first; empty when there is no consensus text at all.
top_words = word_counts.most_common(10)
most_common_language = top_words[0][0] if top_words else None

if most_common_language is None:
    print("No critic's consensus text available to analyse.")
else:
    print("The most common language used in the critic's consensus is:", most_common_language)
    print("\nTop 10 most frequent words:")
    for word, count in top_words:
        print(f"{word}: {count}")


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud of the critics' consensus text.

# Keep only the rows that actually have consensus text.
consensus_data = df['critics_consensus'].dropna()

# One long string containing every consensus, separated by spaces.
consensus_text = ' '.join(consensus_data)

# Build the cloud on a white 800x400 canvas.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(consensus_text)

# Render the cloud as an image without axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title("Most Common Language in Critic's Consensus")
ax.axis('off')
plt.show()
