## Import Required Packages

In [1]:
# cutecharts is a Python library for drawing 📉 hand-drawn style charts
import cutecharts.charts as ctc
import pandas as pd  # for data loading and data analysis

## Load the Dataset

In [2]:
# Read the TMDB movies CSV file into a dataframe
df = pd.read_csv(filepath_or_buffer="dataset/tmdb-movies.csv")

# Preview only the first record to check which columns are available
df.head(n=1)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0


## Data Cleaning

In [3]:
# Changing the 'release_date' column into the proper datetime format
release_date = pd.to_datetime(df["release_date"])

# The raw dates use two-digit years (e.g. "6/9/15"), which are ambiguous: an old
# release can be parsed into the following century. The 'release_year' column holds
# the full four-digit year, so wherever the parsed year is exactly 100 years ahead
# of it, shift the date back by one century. Rows that already agree are untouched.
century_ahead = (release_date.dt.year - df["release_year"]) == 100
df["release_date"] = release_date.mask(
    century_ahead, release_date - pd.DateOffset(years=100)
)
df["release_date"].dtype

dtype('<M8[ns]')

In [4]:
# Keep only two decimal places of the 'popularity' score
df["popularity"] = df["popularity"].round(decimals=2)
df["popularity"].iloc[0]

32.99

In [5]:
# Discard the columns that are not needed for this analysis
unused_columns = ["imdb_id", "homepage", "budget_adj", "revenue_adj"]
df.drop(columns=unused_columns, inplace=True)

# Discard the movies whose 'budget' AND 'revenue' are both 0
# (a row with only one of the two equal to 0 is kept)
no_financials = (df["budget"] == 0) & (df["revenue"] == 0)
df.drop(index=df.loc[no_financials].index, inplace=True)

In [6]:
# Replacing columns nan values with 'missing' category value.
# The result is assigned back to the dataframe instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series, which
# raises a warning on recent pandas and leaves `df` unchanged under Copy-on-Write.
text_columns = [
    "tagline",
    "keywords",
    "production_companies",
    "cast",
    "director",
    "genres",
    "overview",
]
df[text_columns] = df[text_columns].fillna("missing")

In [7]:
# function to count the genre of the movies
def count_genre(x, frame=None):
    """Count how often each "|"-separated value occurs in column ``x``.

    Parameters
    ----------
    x : str
        Name of a string column whose cells hold values joined by "|".
    frame : pandas.DataFrame, optional
        Dataframe to read the column from. When omitted, the notebook-level
        ``df`` is used, so existing ``count_genre("genres")`` calls keep working.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by value, sorted from most to least frequent.
    """
    source = df if frame is None else frame
    # Join every cell with the same "|" separator so that a single split
    # yields the individual values of all rows at once.
    joined = source[x].str.cat(sep="|")
    return pd.Series(joined.split("|")).value_counts(ascending=False)

# Count the movies per genre and shape the result as a two-column dataframe.
# The index and the values are named explicitly rather than renaming the default
# 'index'/0 labels, because those default labels differ between pandas versions.
df_genre_movies = (
    count_genre("genres")  # calls 'count_genre' on the 'genres' column
    .rename_axis("Drama")  # genre names become the 'Drama' column
    .reset_index(name="Count")  # occurrence counts become the 'Count' column
)

## Data Visualization

In [8]:
# Pie Chart
# Number of movies per release year, restricted to the 5 most recent years.
# The columns are named explicitly ('release_year', 'Count') instead of renaming
# the default labels of value_counts()/reset_index(), which changed across pandas
# versions and made the former sort on the 'index' column fail.
df_year = (
    df["release_year"]
    .value_counts()  # counts of unique values in 'release_year' column
    .rename_axis("release_year")  # the years become the 'release_year' column
    .reset_index(name="Count")  # the counts become the 'Count' column
    .sort_values(by="release_year", ascending=False)  # newest year first
    .head(5)  # keeps the 5 most recent years
)
# sets the plot's title as "Top 5 years" together with its width and height
chart = ctc.Pie("Top 5 years", width="600px", height="300px")
# sets the plot's labels; an inner radius of 0 draws a full pie
chart.set_options(labels=list(df_year["release_year"]), inner_radius=0)
chart.add_series(list(df_year["Count"]))  # adds the 'Count' column values to the plot
chart.render_notebook()  # displays the plot within the notebook

In [9]:
# Donut Chart
# Same data as the pie chart; a non-zero inner radius turns the pie into a donut
donut_labels = df_year["release_year"].tolist()
donut_values = df_year["Count"].tolist()

chart = ctc.Pie("Top 5 years", width="600px", height="300px")
chart.set_options(labels=donut_labels, inner_radius=0.6)
chart.add_series(donut_values)
chart.render_notebook()  # renders the chart inside the notebook

In [10]:
from cutecharts.faker import Faker

# Bar Chart
# Only the first 7 rows of the genre counts are plotted
top_genres = df_genre_movies.head(7)

chart = ctc.Bar("Top Movie Genres", width="600px", height="200px")
# axis titles, bar labels (values of the 'Drama' column) and the 'Faker.colors' palette
chart.set_options(
    labels=top_genres["Drama"].tolist(),
    x_label="Drama",
    y_label="Count",
    colors=Faker.colors,
)
# the 'Count' column supplies the bar heights of the "Genres" series
chart.add_series("Genres", top_genres["Count"].tolist())
chart.render_notebook()  # renders the chart inside the notebook

In [11]:
# Line Chart
# Number of movies ('id' count) per release year, limited to the last 16 years
data = df.groupby("release_year")["id"].count().reset_index().tail(16)

chart = ctc.Line(
    "Impact of Movie over the years of 20's", width="700px", height="200px"
)
# the release years label the x axis; both axes get a title
chart.set_options(
    labels=data["release_year"].tolist(),
    x_label="Years",
    y_label="Count",
)
# the per-year movie counts form the "Years" series
chart.add_series("Years", data["id"].tolist())
chart.render_notebook()  # renders the chart inside the notebook

In [12]:
# Scatter Plot
# One (popularity, budget) point per movie
points = list(zip(df["popularity"], df["budget"]))

chart = ctc.Scatter(
    "Helps to gain insights like if movies with higher budget have high popularity",
    width="700px",
    height="200px",
)
# x axis shows 'Popularity', y axis shows 'Budget'; small dots drawn in #47B39C
chart.set_options(
    x_label="Popularity", y_label="Budget", dot_size=1, colors=["#47B39C"]
)
# adds the points to the plot as the 'Popularity vs Budget' series
chart.add_series("Popularity vs Budget", points)
chart.render_notebook()  # renders the chart inside the notebook