In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Precondition: a merged DataFrame named `merged_df` must already be defined before this code runs.

# Data exploration: preview the first rows, the schema, summary statistics and
# the per-column count of missing values before any rows are dropped.
print(merged_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit an extra, meaningless "None" line.
merged_df.info()
print(merged_df.describe())
print(merged_df.isnull().sum())

# Data cleaning (example): drop every row that has at least one missing value.
# dropna() returns a new DataFrame, which is rebound to the same name.
merged_df = merged_df.dropna()

# Analysis: distribution of ratings, drawn as a 10-bin histogram on the
# current axes.
ratings_ax = plt.gca()
ratings_ax.hist(merged_df['ratings'], bins=10)
ratings_ax.set(
    xlabel='Ratings',
    ylabel='Frequency',
    title='Distribution of Ratings',
)
plt.show()

# Observation 1: The distribution of ratings appears to be slightly skewed to the left, indicating that most movies in the dataset have higher ratings.

# Analysis: relationship between duration (x) and ratings (y), one point per
# row of merged_df.
duration_ax = plt.gca()
duration_ax.scatter(merged_df['Duration'], merged_df['ratings'])
duration_ax.set(
    xlabel='Duration',
    ylabel='Ratings',
    title='Duration vs Ratings',
)
plt.show()

# Observation 2: There seems to be a weak positive correlation between the duration of a movie and its ratings. Longer movies tend to have slightly higher ratings, but the correlation is not very strong.

# Analysis: how often each distinct 'Genre' value occurs, shown as a bar chart.
genre_freq = merged_df['Genre'].value_counts()
genre_ax = plt.gca()
genre_ax.bar(genre_freq.index, genre_freq.values)
genre_ax.set(
    xlabel='Genre',
    ylabel='Count',
    title='Most Common Genres',
)
# Rotate the tick labels so the genre names do not overlap.
plt.xticks(rotation=90)
plt.show()

# Observation 3: The most common genres in the dataset are action, comedy, and drama. These genres have a higher representation compared to others.

# Analysis: gross collection and its relationship with ratings and genre.
#
# Passing the raw 'ratings' column as `hue` creates one box (and one legend
# entry) per distinct rating value within every genre, which is unreadable for
# a fine-grained numeric rating. The ratings are therefore grouped into three
# equal-width, ordered bands first.
# NOTE(review): assumes 'ratings' is numeric (it is histogrammed and plotted on
# a numeric axis elsewhere in this script) -- confirm against the real data.
boxplot_df = merged_df.assign(
    **{
        'Rating band': pd.cut(
            merged_df['ratings'],
            bins=3,
            labels=['Low', 'Medium', 'High'],
        )
    }
)  # plotting copy only; merged_df itself is left untouched
sns.boxplot(x='Genre', y='Gross collection', hue='Rating band', data=boxplot_df)
plt.xlabel('Genre')
plt.ylabel('Gross Collection')
plt.title('Gross Collection by Genre and Ratings')
# Rotate the tick labels so the genre names do not overlap.
plt.xticks(rotation=90)
plt.show()

# Observation 4: Action and adventure movies tend to have higher gross collections compared to other genres. Additionally, movies with higher ratings within each genre also tend to have higher gross collections.

# Analysis: share of each 'Certification' value, shown as a pie chart with
# one-decimal percentage labels.
cert_freq = merged_df['Certification'].value_counts()
cert_ax = plt.gca()
cert_ax.pie(cert_freq.values, labels=cert_freq.index, autopct='%1.1f%%')
cert_ax.set_title('Certification Distribution')
plt.show()

# Observation 5: The majority of movies in the dataset have certifications that fall within specific age restrictions (e.g., PG-13, R-rated). Movies with these certifications are more prevalent in the dataset.

# Analysis: votes (x) against ratings (y), with marker size mapped to the
# 'Gross collection' column.
votes_ax = sns.scatterplot(
    data=merged_df,
    x='Votes',
    y='ratings',
    size='Gross collection',
)
votes_ax.set(
    xlabel='Votes',
    ylabel='Ratings',
    title='Votes vs Ratings (Size: Gross Collection)',
)
plt.show()

# Observation 6: There seems to be a positive correlation between the number of votes a movie receives and its ratings. Additionally, movies with higher gross collections are represented by larger data points in the scatter plot.


