# IMDB Top 1000 Movies Analysis
## Top 10 Recommendations

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up visualization style
# IPython magic (only valid inside a notebook/IPython session, not in a plain
# .py script): render matplotlib figures inline in the cell output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn after this point.
sns.set(style="whitegrid")

In [None]:
# Load the dataset
# Only the read itself sits inside `try`; the success messages live in the
# `else` branch. When the CSV is missing, `df` is not assigned here and the
# download hint is printed instead.
try:
    df = pd.read_csv('imdb_top_1000.csv')
except FileNotFoundError:
    print("Please download the dataset from Kaggle and update the file path")
    print("Dataset link: https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")
else:
    print("Dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")

In [None]:
# Initial data exploration
# Shows the column summary, the first rows and the per-column count of
# missing values. Skipped entirely when `df` was never created.
if 'df' in locals():
    print("\nDataset Info:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() would render a stray "None".
    df.info()

    print("\nFirst 5 rows:")
    display(df.head())

    print("\nMissing values:")
    # Number of null entries in each column.
    display(df.isnull().sum())

In [None]:
# Check for TV series (though this dataset appears to be movies only)
# Prints the Certificate and Genre distributions for manual inspection, then
# lists every title that contains a TV-related keyword.
if 'df' in locals():
    print("\nChecking for TV series in the dataset:")

    # Check certificate and genre for TV indicators
    print("\nCertificate value counts:")
    display(df['Certificate'].value_counts())

    print("\nGenre value counts:")
    display(df['Genre'].value_counts().head(20))

    # Search for TV-related terms in title. This is a case-insensitive
    # substring match, so hits are only *potential* TV shows.
    tv_keywords = ['season', 'series', 'tv', 'episode']
    # na=False makes a missing title count as "no match". Without it the mask
    # contains NaN and boolean indexing raises ValueError.
    tv_shows = df[
        df['Series_Title'].str.contains(
            '|'.join(tv_keywords), case=False, regex=True, na=False
        )
    ]

    if not tv_shows.empty:
        print(f"\nFound {len(tv_shows)} potential TV shows:")
        display(tv_shows[['Series_Title', 'Released_Year', 'IMDB_Rating', 'Genre']])
    else:
        print("\nNo TV shows found in this dataset.")

In [None]:
# Top 10 Movies Analysis
# Ranks the movies by a blended rating and shows the ten best as a table and
# as a horizontal bar chart. Skipped entirely when `df` was never created.
if 'df' in locals():
    print("\nSince this dataset contains movies, here are the top 10 recommendations:")

    # Clean and prepare data: values that cannot be parsed become NaN
    # instead of raising.
    df['IMDB_Rating'] = pd.to_numeric(df['IMDB_Rating'], errors='coerce')
    df['Meta_score'] = pd.to_numeric(df['Meta_score'], errors='coerce')

    # Create a weighted score (70% IMDB + 30% Metascore).
    # Meta_score is divided by 10 first, presumably because it is on a 0-100
    # scale while IMDB_Rating is on 0-10 (confirm against the dataset).
    df['Weighted_Score'] = (0.7 * df['IMDB_Rating']) + (0.3 * (df['Meta_score']/10))

    # A movie missing either rating has a NaN score and cannot be ranked.
    # Drop such rows explicitly, and report how many, so they can never reach
    # the table or the chart when fewer than ten movies are scorable.
    scored = df.dropna(subset=['Weighted_Score'])
    unscored_count = len(df) - len(scored)
    if unscored_count:
        print(f"Excluded {unscored_count} movies without a usable IMDB rating or Metascore.")

    # Get top 10 movies. A stable sort gives tied scores a reproducible
    # order, so re-running the cell always selects the same ten rows.
    top_10 = scored.sort_values('Weighted_Score', ascending=False, kind='mergesort').head(10)

    # Display results
    display(top_10[['Series_Title', 'Released_Year', 'IMDB_Rating', 'Meta_score', 'Weighted_Score', 'Genre', 'Overview']])

    # Visualization: one horizontal bar per movie, best score at the top.
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue` - confirm against the seaborn version in use.
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Weighted_Score', y='Series_Title', data=top_10, palette='viridis')
    plt.title('Top 10 Movies by Weighted Score (IMDB + Metascore)')
    plt.xlabel('Weighted Score')
    plt.ylabel('Movie Title')
    plt.tight_layout()
    plt.show()

In [None]:
# Genre Analysis
# Counts how often each individual genre label occurs and plots the counts.
if 'df' in locals():
    print("\nGenre distribution among top movies:")

    # A movie can list several comma-separated genres: split each list, give
    # every genre its own row, then trim the whitespace around the names.
    genre_lists = df['Genre'].str.split(',')
    genres = genre_lists.explode().str.strip()
    genre_counts = genres.value_counts()

    # Bar chart of the per-genre counts, drawn on the figure created here.
    plt.figure(figsize=(12, 6))
    genre_axes = genre_counts.plot.bar(color='teal')
    genre_axes.set_title('Genre Distribution in Top 1000 IMDB Movies')
    genre_axes.set_xlabel('Genre')
    genre_axes.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Key Observations

1. **Dataset Composition**: The IMDB Top 1000 dataset appears to contain only movies, not TV series.

2. **Top Movies**: The highest-rated movies include classics like "The Shawshank Redemption", "The Godfather", and "The Dark Knight".

3. **Genre Trends**: Drama and Crime genres dominate the top rankings, followed by Action and Adventure.

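4. **Data Quality**: The `IMDB_Rating` and `Meta_score` columns are coerced to numeric before scoring; values that cannot be parsed become missing (NaN), and a movie with a missing value in either column gets no weighted score.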

5. **Recommendation Basis**: Recommendations are ranked by a weighted score, 0.7 × IMDB rating + 0.3 × (Metascore / 10), so that IMDB ratings (70%) and Metascores (30%) both contribute to a balanced ranking.

6. **For TV Series**: A different dataset would be needed for proper TV series analysis.

## Alternative TV Series Dataset

For analyzing TV series instead of movies, consider this dataset:

- [IMDB TV Shows Dataset](https://www.kaggle.com/datasets/ruchi798/imdb-tv-shows-dataset)

The analysis approach would be similar but would focus on:
- Number of seasons
- Episode ratings
- TV-specific genres
- Recency of shows