# IMDB Basic Analysis Using Pandas
# 1. Business Understanding
# 2. Data Understanding
# 3. Data Preparation
# 4. Modeling
# 5. Evaluation
# 6. Deployment

In [4]:
# import necessary libraries
import pandas as pd
import numpy as np

# set display_options
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width",500)

In [None]:
# load dataset
file_path = r"D:\PandasForDataAnalysis\datasets\imdb_top_1000.csv"
try:
    df = pd.read_csv(file_path)
    print("Dataset loads succesfully")
    print(df.head(10))
except FileNotFoundError:
    raise FileNotFoundError("Sorry, Dataset File Path Not Found.")


    

Dataset loads succesfully
                                         Poster_Link                                   Series_Title Released_Year Certificate  Runtime                      Genre  IMDB_Rating                                           Overview  Meta_score              Director              Star1                 Star2              Star3             Star4  No_of_Votes        Gross
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...                       The Shawshank Redemption          1994           A  142 min                      Drama          9.3  Two imprisoned men bond over a number of years...        80.0        Frank Darabont        Tim Robbins        Morgan Freeman         Bob Gunton    William Sadler      2343110   28,341,469
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...                                  The Godfather          1972           A  175 min               Crime, Drama          9.2  An organized crime dynasty's aging patriarch t...       100.0  Francis

In [8]:
# Dataset General Info

# 1. First 5 rows
first_five_rows = df.head(5)
last_five_rows = df.tail(5)

print("************ First Five Rows ************\n")
print(first_five_rows,"\n")
print("************ Last Five Rows ************\n")
print(last_five_rows,"\n")
print("\n")

df_columns = df.columns
print("********* Columns *********")
print(df_columns)

************ First Five Rows ************

                                         Poster_Link              Series_Title Released_Year Certificate  Runtime                 Genre  IMDB_Rating                                           Overview  Meta_score              Director           Star1           Star2          Star3           Star4  No_of_Votes        Gross
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...  The Shawshank Redemption          1994           A  142 min                 Drama          9.3  Two imprisoned men bond over a number of years...        80.0        Frank Darabont     Tim Robbins  Morgan Freeman     Bob Gunton  William Sadler      2343110   28,341,469
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...             The Godfather          1972           A  175 min          Crime, Drama          9.2  An organized crime dynasty's aging patriarch t...       100.0  Francis Ford Coppola   Marlon Brando       Al Pacino     James Caan    Diane Keaton      1620367  

In [12]:
def dataset_overview(dataframe):
    print("******************** Shape ********************")
    print(dataframe.shape)
    print("******************** Info ********************")
    print(dataframe.info)
    print("******************** Columns ********************")
    print(dataframe.columns)
    print("******************** Describe ********************")
    print(dataframe.describe())
    print("******************** Number of Rows ********************")
    print(dataframe.shape[0])
    print("******************** Number of Columns ********************")
    print(dataframe.shape[1])

In [13]:
# call the function
dataset_overview(dataframe = df)

******************** Shape ********************
(1000, 16)
******************** Info ********************
<bound method DataFrame.info of                                            Poster_Link              Series_Title Released_Year Certificate  Runtime                     Genre  IMDB_Rating                                           Overview  Meta_score              Director              Star1              Star2           Star3           Star4  No_of_Votes        Gross
0    https://m.media-amazon.com/images/M/MV5BMDFkYT...  The Shawshank Redemption          1994           A  142 min                     Drama          9.3  Two imprisoned men bond over a number of years...        80.0        Frank Darabont        Tim Robbins     Morgan Freeman      Bob Gunton  William Sadler      2343110   28,341,469
1    https://m.media-amazon.com/images/M/MV5BM2MyNj...             The Godfather          1972           A  175 min              Crime, Drama          9.2  An organized crime dynasty's aging

In [20]:
# 'Poster_Link', 'Series_Title', 'Released_Year', 'Certificate', 'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross']
print("Distribution of Genre\n")
df["Genre"].value_counts()





Distribution of Genre



Genre
Drama                            85
Drama, Romance                   37
Comedy, Drama                    35
Comedy, Drama, Romance           31
Action, Crime, Drama             30
Biography, Drama, History        28
Crime, Drama, Thriller           28
Crime, Drama, Mystery            27
Crime, Drama                     26
Animation, Adventure, Comedy     24
Action, Adventure, Sci-Fi        21
Biography, Crime, Drama          16
Drama, War                       15
Comedy, Crime, Drama             15
Action, Adventure, Drama         14
Drama, Thriller                  14
Comedy                           13
Biography, Drama                 12
Animation, Action, Adventure     11
Action, Adventure, Comedy        10
Action, Crime, Thriller          10
Animation, Adventure, Family      9
Drama, Mystery, Thriller          9
Mystery, Thriller                 9
Biography, Drama, Sport           9
Adventure, Comedy, Drama          8
Action, Biography, Drama          7
Comedy, Drama, Family 

In [None]:
# Genre 
print("Distribution of Released Year")
df["Released_Year"].value_counts()

In [25]:
# data types of columns
for col in df.columns:
    print(f"Data types of {col} = {df[col].dtype}")

Data types of Poster_Link = object
Data types of Series_Title = object
Data types of Released_Year = object
Data types of Certificate = object
Data types of Runtime = object
Data types of Genre = object
Data types of IMDB_Rating = float64
Data types of Overview = object
Data types of Meta_score = float64
Data types of Director = object
Data types of Star1 = object
Data types of Star2 = object
Data types of Star3 = object
Data types of Star4 = object
Data types of No_of_Votes = int64
Data types of Gross = object
