In [38]:
import pandas as pd
import numpy as np

In [None]:
# Load the IMDB top-1000 dataset from a CSV in the working directory.
df = pd.read_csv("imdb_top_1000.csv")

# Summarise the schema, then preview the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" line.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB
None
                                

In [None]:
# Missing values per column
null_counts = df.isna().sum()
print("Null values per column:\n", null_counts)

# Grand total of missing values: the per-column counts added together
total_nulls = null_counts.sum()
print("\nTotal null values in dataset:", total_nulls)


Null values per column:
 Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

Total null values in dataset: 427


In [41]:
# Missing certificates become an explicit 'Not Rated' category.
df['Certificate'] = df['Certificate'].fillna('Not Rated')

# Missing Meta_score values are imputed with the column median.
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].median())

# Convert 'Gross' from comma-separated text to float, then fill missing with 0.
# The separators are stripped only while the column still holds text, so
# re-running this cell after the conversion is a no-op instead of failing on
# the .str accessor of an already-numeric column.
# regex=False treats ',' as a literal on every pandas version.
if not pd.api.types.is_numeric_dtype(df['Gross']):
    df['Gross'] = df['Gross'].str.replace(',', '', regex=False).astype(float)
# NOTE: rows filled here count as real zero grosses in any later statistic
# computed on this column.
df['Gross'] = df['Gross'].fillna(0.0)

# Verify that no nulls remain in any column.
print(df.isnull().sum())


Poster_Link      0
Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64


In [None]:
# Keep only the first run of digits in 'Released_Year' and store it as float.
# - The raw string keeps \d a regex escape; in a plain string literal it is an
#   invalid escape sequence that Python warns about.
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame.
# - astype(str) first lets the cell be re-run once the column is already float
#   (e.g. 1994.0 -> '1994.0' -> '1994').
# Values that contain no digits become NaN, which the float dtype can hold.
df['Released_Year'] = (
    df['Released_Year']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

# Keep only the first genre when several are listed, comma-separated.
# The vectorised .str accessor passes missing values through instead of
# raising, unlike a Python-level apply with str.split.
df['Genre'] = df['Genre'].str.split(',').str[0].str.strip()

# Check first 5 rows after cleaning
print(df.head())


                                         Poster_Link  \
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   

               Series_Title  Released_Year Certificate  Runtime   Genre  \
0  The Shawshank Redemption         1994.0           A  142 min   Drama   
1             The Godfather         1972.0           A  175 min   Crime   
2           The Dark Knight         2008.0          UA  152 min  Action   
3    The Godfather: Part II         1974.0           A  202 min   Crime   
4              12 Angry Men         1957.0           U   96 min   Crime   

   IMDB_Rating                                           Overview  Meta_score  \
0          9.3  Two imprisoned men bond over a number of years...        80.0   
1          9.2  An organized crime dynasty's aging

  df['Released_Year'] = df['Released_Year'].str.extract('(\d+)')


In [None]:

# A. Mean IMDB rating for each genre, highest first
genre_ratings = (
    df.groupby('Genre')['IMDB_Rating']
    .mean()
    .sort_values(ascending=False)
)
print("Average Rating by Genre:\n", genre_ratings)

# B. The ten highest-rated movies, shown with a few descriptive columns
top_movies = (
    df.sort_values(by='IMDB_Rating', ascending=False)
    .head(10)
)
print(
    "\nTop 10 Movies:\n",
    top_movies[['Series_Title','IMDB_Rating','Genre','Director','Gross']],
)

# C. Pearson correlation of Meta_score with IMDB rating
#    (off-diagonal entry of the 2x2 correlation matrix)
corr_meta = np.corrcoef(
    df['Meta_score'],
    df['IMDB_Rating'],
)[0,1]
print("\nCorrelation Meta_score vs IMDB Rating:", round(corr_meta,3))

# D. Pearson correlation of Gross with IMDB rating
#    NOTE: missing Gross values were filled with 0 earlier in the notebook,
#    so they enter this correlation as zeros.
corr_gross = np.corrcoef(
    df['Gross'],
    df['IMDB_Rating'],
)[0,1]
print("Correlation Gross vs IMDB Rating:", round(corr_gross,3))

# E. Number of movies per release year, in chronological order
movies_per_year = (
    df['Released_Year']
    .value_counts()
    .sort_index()
)
print("\nMovies Per Year:\n", movies_per_year)


Average Rating by Genre:
 Genre
Western      8.350000
Crime        8.016822
Fantasy      8.000000
Mystery      7.975000
Film-Noir    7.966667
Drama        7.957439
Action       7.949419
Biography    7.938636
Adventure    7.937500
Animation    7.930488
Horror       7.909091
Comedy       7.901290
Family       7.800000
Thriller     7.800000
Name: IMDB_Rating, dtype: float64

Top 10 Movies:
                                          Series_Title  IMDB_Rating      Genre  \
0                            The Shawshank Redemption          9.3      Drama   
1                                       The Godfather          9.2      Crime   
2                                     The Dark Knight          9.0     Action   
3                              The Godfather: Part II          9.0      Crime   
4                                        12 Angry Men          9.0      Crime   
5       The Lord of the Rings: The Return of the King          8.9     Action   
6                                        P