In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Read the training CSV into `df`; later cells in this notebook read from this frame.
df = pd.read_csv('train.csv/train.csv')

In [None]:
# Lift every pandas display limit so frames and long cell values print in full.
for _display_option in ('max_rows', 'max_columns', 'width', 'max_colwidth'):
    pd.set_option(f'display.{_display_option}', None)



In [None]:
# Report the number of null entries in each column of df.
print("Missing values per column:", df.isna().sum(), sep="\n")

In [None]:
# Read the test CSV, then report its shape and column names.
test_df = pd.read_csv('test.csv/test.csv')
print(
    f"Test data shape: {test_df.shape}",
    "\nTest data columns:",
    test_df.columns.to_list(),
    sep="\n",
)

In [None]:
# Keep only rows whose revenue is strictly positive; .copy() gives df_clean
# its own data so later edits to it do not touch df.
df_clean = df.loc[df['revenue'].gt(0)].copy()
print(f"Rows after removing zero/negative revenue: {df_clean.shape[0]}")

In [None]:
# Display the first row of the filtered frame.
df_clean.iloc[:1]

In [None]:
# Structural overview of df: dtypes / non-null counts, then the column names.
print("Dataset Info:")
df.info()
print("\n" + "="*50, "Dataset columns:", df.columns.to_list(), sep="\n")

In [None]:
# Look at the revenue column (target variable)
revenue = df['revenue']
print("Revenue statistics:")
print(revenue.describe())
print(f"\nZero revenues: {(revenue == 0).sum()}")
print(f"Negative revenues: {(revenue < 0).sum()}")

# Show revenue distribution.
# The figure is laid out as a 1x2 grid; previously only the left panel was
# drawn, leaving half of the 12x4 canvas blank. The right panel now shows the
# same data on a log1p scale, which spreads out the long right tail.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 4))

ax_raw.hist(revenue, bins=50, alpha=0.7)
ax_raw.set(title='Revenue Distribution', xlabel='Revenue ($)', ylabel='Frequency')

# log1p rather than log so that zero revenues (counted above) stay finite.
ax_log.hist(np.log1p(revenue), bins=50, alpha=0.7)
ax_log.set(title='Log Revenue Distribution', xlabel='log(1 + Revenue)', ylabel='Frequency')

fig.tight_layout()
plt.show()

In [44]:
# Examine key numeric features
numeric_cols = ['budget', 'popularity', 'runtime', 'revenue']

print("Numeric Features Summary:")
print(df[numeric_cols].describe())

# Correlate every listed column except the last one (the target) with revenue.
print("\nCorrelations with Revenue:")
revenue = df['revenue']
for col in numeric_cols[:-1]:
    corr = df[col].corr(revenue)
    print(f"{col}: {corr:.3f}")

Numeric Features Summary:
             budget   popularity      runtime       revenue
count  3.000000e+03  3000.000000  2998.000000  3.000000e+03
mean   2.253133e+07     8.463274   107.856571  6.672585e+07
std    3.702609e+07    12.104000    22.086434  1.375323e+08
min    0.000000e+00     0.000001     0.000000  1.000000e+00
25%    0.000000e+00     4.018053    94.000000  2.379808e+06
50%    8.000000e+06     7.374861   104.000000  1.680707e+07
75%    2.900000e+07    10.890983   118.000000  6.891920e+07
max    3.800000e+08   294.337037   338.000000  1.519558e+09

Correlations with Revenue:
budget: 0.753
popularity: 0.461
runtime: 0.216


In [43]:
# Deep exploration of text features - these are likely our most predictive features!
def preview_text_column(frame, column, label, n_samples, max_chars=None, show_type=False):
    """Print a short preview of one text column.

    Emits a banner, the first `n_samples` non-null values of `frame[column]`
    (numbered from 1), a blank line, and the column's null count.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the column.
    column : str
        Name of the column to preview.
    label : str
        Human-readable name used in the printed headings.
    n_samples : int
        How many non-null values to show.
    max_chars : int, optional
        When set, values longer than this are cut to `max_chars` characters and
        suffixed with "..."; values that already fit are printed unchanged.
    show_type : bool, default False
        Also print the Python type of the column's first value (nulls are not
        skipped for this check).
    """
    values = frame[column]
    print(f"=== {label.upper()} ANALYSIS ===")
    print(f"Sample {label} (first {n_samples}):")
    for rank, value in enumerate(values.dropna().head(n_samples), start=1):
        # Only mark a value with an ellipsis when something was really cut off.
        if max_chars is not None and len(value) > max_chars:
            value = f"{value[:max_chars]}..."
        print(f"{rank}. {value}")

    print()
    if show_type:
        print(f"{label.capitalize()} data type: {type(values.iloc[0])}")
    print(f"Missing {label}: {values.isnull().sum()}")


# Separator printed between the per-column sections.
SECTION_RULE = "\n" + "="*60

preview_text_column(df, 'genres', 'genres', 5, show_type=True)
print(SECTION_RULE)
preview_text_column(df, 'cast', 'cast', 3, max_chars=200)  # Show first 200 characters
print(SECTION_RULE)
preview_text_column(df, 'production_companies', 'production companies', 5)
print(SECTION_RULE)
preview_text_column(df, 'Keywords', 'keywords', 3, max_chars=200)  # Show first 200 characters

=== GENRES ANALYSIS ===
Sample genres (first 5):
1. [{'id': 35, 'name': 'Comedy'}]
2. [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]
3. [{'id': 18, 'name': 'Drama'}]
4. [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'name': 'Drama'}]
5. [{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]

Genres data type: <class 'str'>
Missing genres: 7

=== CAST ANALYSIS ===
Sample cast (first 3):
1. [{'cast_id': 4, 'character': 'Lou', 'credit_id': '52fe4ee7c3a36847f82afae7', 'gender': 2, 'id': 52997, 'name': 'Rob Corddry', 'order': 0, 'profile_path': '/k2zJL0V1nEZuFT08xUdOd3ucfXz.jpg'}, {'cast_id...
2. [{'cast_id': 1, 'character': 'Mia Thermopolis', 'credit_id': '52fe43fe9251416c7502561f', 'gender': 1, 'id': 1813, 'name': 'Anne Hathaway', 'order': 0, 'profile_path': '/jUMOKwSUBnTcMeN1HfhutiY49Ad.jpg...
3. [{'cast_id': 5, 'character': 'Andrew Neimann', 'credit_id': '52fe4ef7c3a36847f82b3fc3', 'gender': 2, 'id': 99

In [42]:
import json
import ast
from collections import Counter

# Function to safely parse JSON-like strings
def parse_json_column(text):
    """Parse one cell of a JSON-like column into Python objects.

    Parsing is attempted in order of fidelity:

    1. strict JSON on the text exactly as given;
    2. a Python literal via ``ast.literal_eval`` (single-quoted keys, ``None``);
    3. JSON after swapping every ``'`` for ``"`` -- a last resort for hybrids
       such as single-quoted keys combined with ``null``.

    The quote swap is tried last because it rewrites apostrophes inside values
    and can therefore change the data when applied to text that was already
    parseable.

    Parameters
    ----------
    text : object
        Cell value. Normally a string; may be NaN/None for missing data.

    Returns
    -------
    object
        The parsed value (a list of dicts for the columns used in this
        notebook). Returns ``[]`` for missing values, non-string scalars and
        text that none of the parsers accept. A value that is already a list
        is returned unchanged.
    """
    # Already parsed (e.g. the function is applied twice): nothing to do.
    if isinstance(text, list):
        return text
    # NaN, None and any other non-string value count as "no entries".
    if not isinstance(text, str):
        return []

    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    try:
        return json.loads(text.replace("'", '"'))
    except (ValueError, RecursionError):
        return []

print("=== PARSING GENRES ===")
# Turn each raw genres cell into a list of entries, then keep just the
# 'name' of every entry that is a dict carrying that key.
df['genres_parsed'] = df['genres'].apply(parse_json_column)
df['genre_names'] = df['genres_parsed'].apply(
    lambda entries: [
        entry['name']
        for entry in entries
        if isinstance(entry, dict) and 'name' in entry
    ]
)

# Flatten the per-movie name lists into one list and tally it.
all_genres = [name for names in df['genre_names'] for name in names]
genre_counts = Counter(all_genres)

print("Top 10 most common genres:")
for genre, count in genre_counts.most_common(10):
    print(f"{genre}: {count} movies")

print(f"\nTotal unique genres: {len(genre_counts)}")

=== PARSING GENRES ===
Top 10 most common genres:
Drama: 1531 movies
Comedy: 1028 movies
Thriller: 789 movies
Action: 741 movies
Romance: 571 movies
Crime: 469 movies
Adventure: 439 movies
Horror: 301 movies
Science Fiction: 290 movies
Family: 260 movies

Total unique genres: 20


In [45]:
# Genres that get their own 0/1 indicator column.
top_genres = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Crime', 'Adventure', 'Horror']

print("=== CREATING GENRE FEATURES ===")
for genre in top_genres:
    flag_col = f'is_{genre}'
    # 1 when the movie's genre list contains this genre, otherwise 0.
    df[flag_col] = df['genre_names'].apply(lambda names: int(genre in names))
    print(f"{flag_col}: {df[flag_col].sum()} movies")

print("\n=== GENRE CORRELATIONS WITH REVENUE ===")
genre_correlations = [
    (genre, df[f'is_{genre}'].corr(df['revenue']))
    for genre in top_genres
]
for genre, corr in genre_correlations:
    print(f"is_{genre}: {corr:.3f}")

# Rank by absolute correlation so strong negative links sort next to strong positive ones.
genre_correlations = sorted(genre_correlations, key=lambda pair: abs(pair[1]), reverse=True)
print("\nGenres ranked by correlation strength:")
for genre, corr in genre_correlations:
    print(f"{genre}: {corr:.3f}")

=== CREATING GENRE FEATURES ===
is_Drama: 1531 movies
is_Comedy: 1028 movies
is_Thriller: 789 movies
is_Action: 741 movies
is_Romance: 571 movies
is_Crime: 469 movies
is_Adventure: 439 movies
is_Horror: 301 movies

=== GENRE CORRELATIONS WITH REVENUE ===
is_Drama: -0.158
is_Comedy: -0.031
is_Thriller: 0.007
is_Action: 0.185
is_Romance: -0.049
is_Crime: -0.023
is_Adventure: 0.328
is_Horror: -0.034

Genres ranked by correlation strength:
Adventure: 0.328
Action: 0.185
Drama: -0.158
Romance: -0.049
Horror: -0.034
Comedy: -0.031
Crime: -0.023
Thriller: 0.007
