## Import Libraries

In [None]:
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot defaults applied to every figure created below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Read the transformed shows table (relative path, resolved against the
# kernel's working directory) and preview the first rows.
csv_path = '../../data/shows_transformed.csv'
df = pd.read_csv(csv_path)
print(f"Dataset shape: {df.shape}")
df.head()

## Dataset Overview

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the main
# numeric columns. The frame is left as the cell's last expression so Jupyter
# renders it as a table instead of wrapped plain text from print().
print("Basic Statistics:")
df[['score', 'members', 'favorites', 'duration_minutes', 'completion_rate', 'drop_rate']].describe()

## Score Distribution

In [None]:
# Scores with missing values removed. Axes.boxplot does not skip NaN, so a
# single unscored show would otherwise leave the box plot empty.
scores = df['score'].dropna()
mean_score = scores.mean()  # same value as df['score'].mean(), which skips NaN

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Histogram of scores with the mean marked by a dashed line
ax1.hist(scores, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
ax1.set_xlabel('Score')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Anime Scores')
ax1.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.2f}')
ax1.legend()

# Box plot of the same scores; vertical is the default orientation, so the
# `vert` argument (being phased out in newer matplotlib) is not passed.
ax2.boxplot(scores)
ax2.set_ylabel('Score')
ax2.set_title('Box Plot of Anime Scores')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Top 20 Anime by Score

In [None]:
# The 20 rows with the lowest `rank` values, keeping only the plotted columns
top_20 = df.nsmallest(20, 'rank')[['title', 'score', 'rank']]
bar_positions = range(len(top_20))

# Horizontal bars, one per show, labelled with the show title
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_20['score'], color='coral')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['title'])
ax.set_xlabel('Score')
ax.set_title('Top 20 Anime by Score')
ax.invert_yaxis()  # first row of top_20 is drawn at the top
fig.tight_layout()
plt.show()

## Popularity Metrics

## Popularity Metrics

## Score vs Popularity Correlation

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Pearson correlation of score with each plotted metric
corr1 = df['score'].corr(df['members'])
corr2 = df['score'].corr(df['completion_rate'])

# One entry per panel: (axes, y column, y label, title, marker colour, correlation)
panels = [
    (ax1, 'members', 'Members', 'Score vs Members', 'blue', corr1),
    (ax2, 'completion_rate', 'Completion Rate (%)', 'Score vs Completion Rate', 'green', corr2),
]

for panel_ax, y_col, y_label, panel_title, colour, r_value in panels:
    panel_ax.scatter(df['score'], df[y_col], alpha=0.5, c=colour)
    panel_ax.set_xlabel('Score')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    # Correlation note pinned to the top-left corner in axes coordinates
    panel_ax.text(0.05, 0.95, f'Correlation: {r_value:.3f}',
                  transform=panel_ax.transAxes, verticalalignment='top',
                  bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Member counts are drawn on a logarithmic axis
ax1.set_yscale('log')

plt.tight_layout()
plt.show()

## Anime Types Distribution

In [None]:
# Number of shows per `type` value, most frequent first
type_counts = df['type'].value_counts()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left: absolute counts as bars
type_counts.plot.bar(ax=ax1, color='steelblue')
ax1.set(xlabel='Type', ylabel='Count', title='Distribution of Anime Types')
ax1.tick_params(axis='x', rotation=45)

# Right: the same counts as shares of the whole
ax2.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%', startangle=90)
ax2.set_title('Anime Types Proportion')

fig.tight_layout()
plt.show()

## Average Score by Type

In [None]:
# Mean score per type, highest first
avg_score_by_type = df.groupby('type')['score'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
avg_score_by_type.plot.bar(ax=ax, color='mediumseagreen')
ax.set_xlabel('Type')
ax.set_ylabel('Average Score')
ax.set_title('Average Score by Anime Type')
plt.setp(ax.get_xticklabels(), rotation=45)
# Dashed reference line at the mean score of the whole dataset
ax.axhline(y=df['score'].mean(), color='red', linestyle='--', label='Overall Average')
ax.legend()
fig.tight_layout()
plt.show()

## Status Distribution

In [None]:
# Number of shows per `status` value, most frequent first
status_counts = df['status'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
status_counts.plot.bar(ax=ax, color='orchid')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
ax.set_title('Distribution of Anime Status')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Duration Analysis

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
hist_ax, scatter_ax = axes

durations = df['duration_minutes']
median_duration = durations.median()

# Left: histogram of durations with the median marked
hist_ax.hist(durations, bins=50, color='lightcoral', edgecolor='black', alpha=0.7)
hist_ax.set(xlabel='Duration (minutes)', ylabel='Frequency',
            title='Distribution of Episode Duration')
hist_ax.axvline(median_duration, color='red', linestyle='--',
                label=f'Median: {median_duration:.0f} min')
hist_ax.legend()

# Right: duration against score, annotated with their Pearson correlation
scatter_ax.scatter(durations, df['score'], alpha=0.5, c='purple')
scatter_ax.set(xlabel='Duration (minutes)', ylabel='Score', title='Duration vs Score')

corr = durations.corr(df['score'])
scatter_ax.text(0.05, 0.95, f'Correlation: {corr:.3f}',
                transform=scatter_ax.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

fig.tight_layout()
plt.show()

## Top Genres Analysis

In [None]:
# Count how often each genre occurs across all shows.
# `genres_list` holds the string representation of a list, so every value is
# parsed back into a Python object with ast.literal_eval (literals only, no
# code execution). `ast` is imported in the notebook's import cell.
all_genres = []
unparsed_genre_rows = 0
for genres_str in df['genres_list'].dropna():
    try:
        genres = ast.literal_eval(genres_str)
        all_genres.extend(genres)
    except (ValueError, SyntaxError, TypeError):
        # Malformed or non-list entry: skip it (best effort), but keep a tally
        # so silent data loss is visible in the output below.
        unparsed_genre_rows += 1

genre_counts = Counter(all_genres)
top_genres = dict(genre_counts.most_common(15))

# Bar chart of the 15 most frequent genres
plt.figure(figsize=(12, 6))
plt.bar(top_genres.keys(), top_genres.values(), color='teal')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.title('Top 15 Anime Genres')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print(f"\nTotal unique genres: {len(genre_counts)}")
if unparsed_genre_rows:
    print(f"Skipped {unparsed_genre_rows} rows with unparsable genres_list values")

## Top Studios Analysis

In [None]:
# Count how often each studio occurs across all shows.
# `studios_list` is parsed the same way as a list literal stored as text, using
# ast.literal_eval (literals only, no code execution). `ast` is imported in
# the notebook's import cell.
all_studios = []
unparsed_studio_rows = 0
for studios_str in df['studios_list'].dropna():
    try:
        studios = ast.literal_eval(studios_str)
        all_studios.extend(studios)
    except (ValueError, SyntaxError, TypeError):
        # Malformed or non-list entry: skip it (best effort), but keep a tally
        # so silent data loss is visible in the output below.
        unparsed_studio_rows += 1

studio_counts = Counter(all_studios)
top_studios = dict(studio_counts.most_common(15))

# Bar chart of the 15 most frequent studios
plt.figure(figsize=(12, 6))
plt.bar(top_studios.keys(), top_studios.values(), color='darkorange')
plt.xlabel('Studio')
plt.ylabel('Count')
plt.title('Top 15 Anime Studios')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print(f"\nTotal unique studios: {len(studio_counts)}")
if unparsed_studio_rows:
    print(f"Skipped {unparsed_studio_rows} rows with unparsable studios_list values")

## Rating Distribution

In [None]:
# Number of shows per content rating, most frequent first
rating_counts = df['rating'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
rating_counts.plot.barh(ax=ax, color='mediumpurple')
ax.set_xlabel('Count')
ax.set_ylabel('Rating')
ax.set_title('Distribution of Content Ratings')
fig.tight_layout()
plt.show()

## Completion Rate vs Drop Rate

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Each point is one show, coloured by its score
scatter = ax.scatter(df['completion_rate'], df['drop_rate'],
                     c=df['score'], cmap='viridis', alpha=0.6, s=50)
ax.set_xlabel('Completion Rate (%)')
ax.set_ylabel('Drop Rate (%)')
ax.set_title('Completion Rate vs Drop Rate (colored by Score)')
fig.colorbar(scatter, ax=ax, label='Score')

# Pearson correlation between the two rates, pinned to the top-left corner
corr = df['completion_rate'].corr(df['drop_rate'])
ax.text(0.05, 0.95, f'Correlation: {corr:.3f}',
        transform=ax.transAxes, verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

fig.tight_layout()
plt.show()

## Favorites Rate Analysis

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
hist_ax, scatter_ax = axes

fav_rate = df['favorites_rate']

# Left: histogram of the favorites rate
hist_ax.hist(fav_rate, bins=50, color='hotpink', edgecolor='black', alpha=0.7)
hist_ax.set(xlabel='Favorites Rate (%)', ylabel='Frequency',
            title='Distribution of Favorites Rate')

# Right: score against favorites rate, annotated with their Pearson correlation
scatter_ax.scatter(df['score'], fav_rate, alpha=0.5, c='crimson')
scatter_ax.set(xlabel='Score', ylabel='Favorites Rate (%)', title='Score vs Favorites Rate')

corr = df['score'].corr(fav_rate)
scatter_ax.text(0.05, 0.95, f'Correlation: {corr:.3f}',
                transform=scatter_ax.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

fig.tight_layout()
plt.show()

## Currently Airing vs Finished

In [None]:
# Display name and colour for each flag value. Labels are looked up from the
# data instead of being matched by position: value_counts() orders by
# frequency while groupby() orders by key, so a fixed label list could be
# attached to the wrong slice/bar (or have the wrong length).
# NOTE(review): assumes `is_airing` holds booleans (or 0/1) — confirm against the CSV.
airing_names = {False: 'Finished', True: 'Currently Airing'}
airing_colors = {False: 'lightblue', True: 'lightcoral'}

# Sort by flag value so the order is stable (False before True)
airing_counts = df['is_airing'].value_counts().sort_index()
labels = [airing_names[bool(flag)] for flag in airing_counts.index]
colors = [airing_colors[bool(flag)] for flag in airing_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Pie chart: share of shows per airing state
axes[0].pie(airing_counts, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
axes[0].set_title('Currently Airing vs Finished Anime')

# Average score per airing state, aligned to the same order as the labels
avg_scores = df.groupby('is_airing')['score'].mean().reindex(airing_counts.index)
axes[1].bar(labels, avg_scores, color=colors)
axes[1].set_ylabel('Average Score')
axes[1].set_title('Average Score: Currently Airing vs Finished')
# Dashed reference line at the mean score of the whole dataset
axes[1].axhline(y=df['score'].mean(), color='red', linestyle='--', label='Overall Average')
axes[1].legend()

plt.tight_layout()
plt.show()

## Correlation Heatmap

In [None]:
# Columns included in the pairwise correlation matrix
numeric_cols = [
    'score', 'rank', 'popularity', 'members', 'favorites',
    'duration_minutes', 'completion_rate', 'drop_rate', 'favorites_rate',
]
correlation_matrix = df[numeric_cols].corr()

# Annotated heatmap with the diverging colour map centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, ax=ax)
ax.set_title('Correlation Heatmap of Numeric Features')
fig.tight_layout()
plt.show()

## Key Insights Summary

In [None]:
# Plain-text summary of the headline numbers computed from `df`
rule = "=" * 60
n_shows = len(df)
titles = df['title']
score_col = df['score']
members_col = df['members']
favorites_col = df['favorites']
duration_col = df['duration_minutes']

print(rule)
print("KEY INSIGHTS FROM THE DATASET")
print(rule)

print(f"\n1. Dataset Size: {n_shows} anime entries")

print("\n2. Score Statistics:")
print(f"   - Average Score: {score_col.mean():.2f}")
print(f"   - Highest Score: {score_col.max():.2f} ({titles.loc[score_col.idxmax()]})")
print(f"   - Lowest Score: {score_col.min():.2f}")

print("\n3. Popularity:")
print(f"   - Most Members: {members_col.max():,} ({titles.loc[members_col.idxmax()]})")
print(f"   - Most Favorites: {favorites_col.max():,} ({titles.loc[favorites_col.idxmax()]})")

print("\n4. Engagement Metrics:")
print(f"   - Average Completion Rate: {df['completion_rate'].mean():.2f}%")
print(f"   - Average Drop Rate: {df['drop_rate'].mean():.2f}%")
print(f"   - Average Favorites Rate: {df['favorites_rate'].mean():.2f}%")

print("\n5. Content Types:")
# One line per type with its count and share of all rows
for type_name, count in df['type'].value_counts().items():
    print(f"   - {type_name}: {count} ({count / n_shows * 100:.1f}%)")

print("\n6. Duration:")
print(f"   - Average Duration: {duration_col.mean():.0f} minutes")
print(f"   - Median Duration: {duration_col.median():.0f} minutes")
print(rule)