In [None]:
from pymongo import MongoClient

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Connection string for the MongoDB server this notebook reads from
# (a server on the local machine, port 27017).
MONGO_URI = 'mongodb://localhost:27017/'
client = MongoClient(MONGO_URI)

In [None]:
# Ask the server which databases exist; as the cell's last expression the
# returned list of names is displayed as the cell output.
client.list_database_names()

In [None]:
# Select the "Ma-Sa-La" database by name. Subscript access is required here
# because the hyphens make the name unusable as a Python attribute.
db = client["Ma-Sa-La"]

In [None]:
# Handle to the "a2021" collection.
# NOTE(review): this is the same collection that `a2021` is bound to in the
# following cell, and `collection` is not referenced by any cell visible in
# this notebook -- confirm nothing else needs it before removing.
collection = db["a2021"]

In [None]:
# Handle to the "a2021" collection; this is the name the later cells use for
# counting, iterating and loading the documents.
a2021 = db["a2021"]

In [None]:
# Number of documents in the collection: the empty filter {} matches
# every document.
a2021.count_documents({})

In [None]:
# Preview a handful of raw documents to see their structure.
# The cursor is capped with limit(5): printing every document in the
# collection floods the cell output (and bloats the saved notebook) without
# telling the reader anything the first few documents do not.
for st in a2021.find().limit(5):
    print(st)

In [None]:
# Pull every document of the collection into memory, then build the
# DataFrame that all later cells analyse (one row per document).
a2021_documents = list(a2021.find())
df = pd.DataFrame(a2021_documents)

In [None]:
# Quick look at the first five rows to check which columns were loaded.
df.head(n=5)

In [None]:
# Summary statistics for the three genre columns; include='all' makes
# describe() report on non-numeric columns as well.
genre_columns = ['genre', 'genre1', 'genre2']
df[genre_columns].describe(include='all')

In [None]:
# Director / cast combinations: how often does the exact same director work
# with the exact same four listed cast members?
pairing_columns = ['Director Name', 'Cast-1 Name', 'Cast-2 Name', 'Cast-3 Name', 'Cast-4 Name']
director_cast_df = df[pairing_columns]

# One output row per distinct combination, with the number of source rows that
# share it in 'Count'.
# NOTE: groupby() drops rows with a missing value in any key column by default
# (dropna=True), so such rows are not counted here.
director_cast_grouped = (
    director_cast_df
    .groupby(pairing_columns)
    .size()
    .reset_index(name='Count')
)

# Most frequent combinations first; head() shows the top five.
director_cast_grouped.sort_values(by='Count', ascending=False).head()

In [None]:
# Top five labels in each genre column, counted per column.
# Surrounding whitespace is stripped first so that padded variants of the
# same label are counted together.
top_five_by_column = {
    column: df[column].str.strip().value_counts().head(5)
    for column in ('genre', 'genre1', 'genre2')
}
genre_counts = top_five_by_column['genre']
genre1_counts = top_five_by_column['genre1']
genre2_counts = top_five_by_column['genre2']

# Print each ranking under its own heading.
for heading, counts in (
    ('Top 5 genres:', genre_counts),
    ('\nTop 5 genre1:', genre1_counts),
    ('\nTop 5 genre2:', genre2_counts),
):
    print(heading)
    print(counts)

In [None]:
# Most common director + lead-cast combinations.
# A new column joins the director with the first two cast members into one
# comma-separated label. This adds 'director_cast_pair' to df itself.
# NOTE(review): a missing value in any of the three columns presumably makes
# the whole label missing, in which case value_counts() leaves it out -- confirm.
separator = ', '
df['director_cast_pair'] = (
    df['Director Name'] + separator + df['Cast-1 Name'] + separator + df['Cast-2 Name']
)

# value_counts() is sorted from most to least frequent, so the first five
# entries are the five most common labels.
director_cast_pair_counts = df['director_cast_pair'].value_counts().iloc[:5]

print('Top 5 director and cast pairs:')
print(director_cast_pair_counts)

# Bar Graph

In [None]:
# Pool the three genre columns into one Series so that every row contributes
# one entry per genre column. Whitespace is stripped so padded variants of the
# same label are counted together.
# (matplotlib.pyplot is imported as `plt` in the import cell at the top of the
# notebook rather than here, so all dependencies are visible in one place.)
all_genres = pd.concat(
    [df[column].str.strip() for column in ('genre', 'genre1', 'genre2')]
)

# Occurrences of each genre label across all three columns, most frequent
# first; missing entries are left out by value_counts() by default.
genre_counts_combined = all_genres.value_counts()

# Bar chart of the ten most frequent genres, drawn on an explicit Axes rather
# than pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 8))
genre_counts_combined.head(10).plot(kind='bar', ax=ax)
ax.set_title('Top 10 Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', labelrotation=45)  # slant the labels so they stay readable
fig.tight_layout()
plt.show()

# Pie Chart

In [None]:
# Pie chart of the ten most frequent genres, reusing the combined counts
# computed for the bar chart. Each wedge is labelled with its share to one
# decimal place; shares are relative to the ten plotted genres only.
fig, ax = plt.subplots(figsize=(10, 8))
genre_counts_combined.head(10).plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Genre Distribution - Top 10')
ax.set_ylabel('')  # no y-axis label on a pie chart
plt.show()