In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the movie dataset from CSV and preview its first rows
csv_path = 'imbd.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
from collections import Counter

# Tally how often each genre appears and chart the totals.
# Each 'Genre' entry is treated as a comma-separated list of genre names.

# Label missing entries explicitly; fillna is idempotent, so re-running this
# cell leaves the column unchanged.
df['Genre'] = df['Genre'].fillna('Unknown')

# Split every entry on commas and strip surrounding whitespace so that
# "A,B" and "A, B" are counted alike. astype(str) guards against stray
# non-string values, and empty fragments are skipped.
genre_counts = Counter(
    genre.strip()
    for entry in df['Genre'].astype(str)
    for genre in entry.split(',')
    if genre.strip()
)

# most_common() yields (genre, count) pairs ordered from most to least
# frequent; building the frame this way also copes with an empty tally.
df_genre_counts = pd.DataFrame(genre_counts.most_common(), columns=['Genre', 'Count'])

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_genre_counts['Genre'], df_genre_counts['Count'], color='skyblue')
ax.set_title('Most Common Genres in Top 250 Movies')
ax.set_xlabel('Count')
ax.set_ylabel('Genre')
ax.invert_yaxis()  # largest bar at the top
plt.show()

In [None]:
import requests
import os

def fetch_movie_data(title, timeout=10):
    """Look up a movie by title through the OMDb API.

    Args:
        title: Movie title, sent as the ``t`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the ``OMDB_API_KEY`` environment variable is unset
            or empty.
        requests.HTTPError: If the server answers with an error status.
    """
    # Retrieve API key from environment variables; fail early with a clear
    # message instead of sending a request without credentials.
    api_key = os.getenv('OMDB_API_KEY')
    if not api_key:
        raise RuntimeError('OMDB_API_KEY environment variable is not set')

    # Use HTTPS so the API key is not sent in clear text.
    base_url = 'https://www.omdbapi.com/'

    # Query parameters for the API request
    params = {
        'apikey': api_key,
        't': title,
    }

    # Without a timeout, requests would wait indefinitely on a stalled server.
    response = requests.get(base_url, params=params, timeout=timeout)

    # Surface HTTP-level failures instead of trying to decode an error page.
    response.raise_for_status()

    return response.json()

# Look up a single title as a quick check of the API helper
sample_title = 'The Shawshank Redemption'
movie_data = fetch_movie_data(sample_title)
movie_data

In [None]:
# Convert 'Runtime' to whole minutes and chart its distribution.
# Take the first run of digits from each entry (so "142 min" becomes 142);
# entries without digits, including missing values, fall back to 0.
# Going through astype(str) keeps this cell re-runnable after the column
# has already been converted to integers.
runtime_digits = df['Runtime'].astype(str).str.extract(r'(\d+)', expand=False)
df['Runtime'] = pd.to_numeric(runtime_digits, errors='coerce').fillna(0).astype(int)

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Runtime'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Runtimes in Top 250 Movies')
ax.set_xlabel('Runtime (minutes)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Tally how often each country appears and chart the totals.
# Each 'Country' entry is treated as a comma-separated list of country names.

# Missing entries are counted under 'Unknown' (the dataframe column itself is
# left untouched). Splitting on commas and stripping whitespace makes the
# tally independent of spacing; astype(str) guards against non-string values
# and empty fragments are skipped.
country_counts = Counter(
    country.strip()
    for entry in df['Country'].fillna('Unknown').astype(str)
    for country in entry.split(',')
    if country.strip()
)

# most_common() yields (country, count) pairs ordered from most to least
# frequent; building the frame this way also copes with an empty tally.
df_country_counts = pd.DataFrame(country_counts.most_common(), columns=['Country', 'Count'])

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_country_counts['Country'], df_country_counts['Count'], color='skyblue')
ax.set_title('Most Common Countries of Origin in Top 250 Movies')
ax.set_xlabel('Count')
ax.set_ylabel('Country')
ax.invert_yaxis()  # largest bar at the top
plt.show()

In [None]:
# List the distinct 'Awards' strings to see which formats need parsing
pd.unique(df['Awards'])

In [None]:
import re

# Function to extract number of Oscars won
def extract_oscars(s):
    """Return the Oscar count found in an awards summary string.

    Searches for the phrase "Won <n> Oscar" and returns <n> from its first
    occurrence as an int. Null input (as judged by ``pd.isnull``) and text
    that does not contain the phrase both yield 0.
    """
    if pd.isnull(s):
        return 0
    match = re.search(r'Won (\d+) Oscar', s)
    if match is None:
        return 0
    return int(match.group(1))

# Function to extract total number of wins
def extract_wins(s):
    """Return the win count found in an awards summary string.

    Takes the first number that is directly followed by " win" and returns
    it as an int. Null input (as judged by ``pd.isnull``) and text without
    such a number both yield 0.
    """
    if pd.isnull(s):
        return 0
    hit = re.search(r'(\d+) win', s)
    return int(hit.group(1)) if hit else 0

# Function to extract total number of nominations
def extract_nominations(s):
    """Return the nomination count found in an awards summary string.

    Takes the first number that is directly followed by " nomination" and
    returns it as an int. Null input (as judged by ``pd.isnull``) and text
    without such a number both yield 0.
    """
    if pd.isnull(s):
        return 0
    first_hit = next(re.finditer(r'(\d+) nomination', s), None)
    if first_hit is None:
        return 0
    return int(first_hit.group(1))

# Derive one numeric column per award metric from the free-text 'Awards' field
award_extractors = {
    'Oscars_Won': extract_oscars,
    'Total_Wins': extract_wins,
    'Total_Nominations': extract_nominations,
}
for column_name, extractor in award_extractors.items():
    df[column_name] = df['Awards'].apply(extractor)

# Preview the dataframe with the new columns
df.head()

In [None]:
import matplotlib.pyplot as plt

# Scatter plot: Oscars won (x) against IMDb rating (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Oscars_Won'], df['imdbRating'], alpha=0.5)
ax.set_title('Number of Oscars Won vs. Movie Ratings')
ax.set_xlabel('Number of Oscars Won')
ax.set_ylabel('Movie Rating')
plt.show()

In [None]:
# Scatter plot: total award wins (x) against IMDb rating (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Total_Wins'], df['imdbRating'], alpha=0.5)
ax.set(
    title='Total Number of Wins vs. Movie Ratings',
    xlabel='Total Number of Wins',
    ylabel='Movie Rating',
)
plt.show()