# Visualizing IMDb Top 1000 Movies Dataset

##### Requirements step (uncomment the code below to install any missing library if you are not using a Jupyter or Anaconda installation)

In [None]:
# !pip install pandas numpy matplotlib

#### Import the libraries

In [None]:
import pandas as pd  # data preprocessing
import numpy as np   # mathematical computation
import matplotlib.pyplot as plt  # visualization

#### Read the dataset

In [None]:
# Load the IMDb CSV from the path below into a DataFrame, confirm the
# object type, and preview the first three rows.
df = pd.read_csv('../data/processed/imdb_top_1000.csv')
print(type(df))  # df is a dataframe
df.head(3)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape
# rows = 1000,columns = 16

In [None]:
# List the column labels of the dataset.
df.columns

#### Data Preprocessing

#### 1) Handle the Null Values

In [None]:
# Count the missing values in every column, then print the column labels
# followed by the matching counts.
a = df.isna().sum()
for part in (a.index, a.values):
    print(part)

In [None]:
def gen_null_count_per(data):
    """Summarise the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        ``Feature`` (column name), ``Count`` (number of nulls) and
        ``Percent_Null`` (nulls as a percentage of the rows in ``data``).
        The result is empty when no column has nulls.
    """
    null_counts = data.isnull().sum()
    # Keep only the columns that actually contain null values.
    null_counts = null_counts[null_counts > 0]
    return pd.DataFrame({
        'Feature': null_counts.index,
        'Count': null_counts.values,
        # Divide by the row count of the frame that was passed in, not by a
        # global frame, so the percentage is right for any input.
        'Percent_Null': (null_counts.values / data.shape[0]) * 100,
    })

In [None]:
# Null summary (feature, count, percentage) for the dataset as loaded.
gen_null_count_per(df)

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Frequency of each distinct Certificate value.
df['Certificate'].value_counts()

In [None]:
# Fill missing certificates with the most frequent certificate.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment, which raises a FutureWarning on pandas 2.x and leaves df
# unchanged once copy-on-write is enabled.
df['Certificate'] = df['Certificate'].fillna(df['Certificate'].mode()[0])
# df.head()

In [None]:
# Re-run the null summary after filling Certificate.
gen_null_count_per(df)

In [None]:
# Descriptive statistics for the dataset.
df.describe()

In [None]:
# Fill missing Meta_score values with the column median.
# Assign the result back rather than using fillna(inplace=True) on the
# column selection, which is chained assignment (FutureWarning on pandas
# 2.x, no effect on df under copy-on-write).
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].median())
gen_null_count_per(df)

In [None]:
# Check the column dtypes again before converting the text columns.
df.dtypes

In [None]:
# Columns that still need converting to a numeric dtype: Released_Year, Gross

In [None]:
# Distinct values of Released_Year, to spot entries that are not years.
df['Released_Year'].unique()

In [None]:
# Shape of the subset of rows whose Released_Year is the string 'PG'.
df[df['Released_Year'] == 'PG'].shape

In [None]:
# Drop the rows whose Released_Year is the string 'PG'.
# .copy() makes df an independent frame rather than a slice of the original,
# so later column assignments on df do not raise SettingWithCopyWarning.
df = df[df['Released_Year'] != 'PG'].copy()
df.shape

In [None]:
# Convert Released_Year to a numeric dtype, then re-check the column dtypes.
df['Released_Year'] = pd.to_numeric(df['Released_Year'])
df.dtypes

In [None]:
# Preview the first rows of Gross (double brackets select a one-column frame).
df[['Gross']].head()

In [None]:
# Strip the commas out of the Gross strings so they can be converted to
# numbers, then preview the result.
df['Gross'] = df['Gross'].str.replace(',', '')
df[['Gross']].head()

In [None]:
# Convert the cleaned Gross strings to numbers.
# Gross may still contain missing values at this point (they are filled with
# the median afterwards), and NaN cannot be cast to int. pd.to_numeric
# converts the digit strings and keeps missing entries as NaN.
df['Gross'] = pd.to_numeric(df['Gross'])
df.dtypes

In [None]:
# Fill missing Gross values with the column median.
# Assign the result back rather than using fillna(inplace=True) on the
# column selection, which is chained assignment (FutureWarning on pandas
# 2.x, no effect on df under copy-on-write).
df['Gross'] = df['Gross'].fillna(df['Gross'].median())
gen_null_count_per(df)

#### Separate the Continuous and Categorical Features

In [None]:
# Split the column labels by dtype: object columns are treated as
# categorical, every other column as numerical.
is_object = df.dtypes == 'object'
cat_cols = df.dtypes[is_object].index
num_cols = df.dtypes[~is_object].index
print(num_cols)
print(cat_cols)

#### Handle the duplicated records

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

### EDA

#### Plot Top N most frequent genres on a bar chart in IMDB top 1000 dataset

In [None]:
def topN_most_freq_genre(N, col):
    """Draw a horizontal bar chart of the N most frequent Genre values.

    Reads the module-level frame ``df``.

    Parameters
    ----------
    N : int
        Number of genres to show.
    col : str
        Bar fill colour passed to matplotlib.
    """
    genre_counts = df['Genre'].value_counts()
    top_genres = genre_counts.sort_values(ascending=False).head(N)
    plt.barh(y=top_genres.index, width=top_genres.values,
             color=col, edgecolor='black')
    plt.xlabel('Count')
    plt.ylabel('Genres')
    plt.title(f'Top {N} most frequent genres in IMDB dataset')
    plt.show()

In [None]:
# Plot the 7 most frequent genres with maroon bars.
topN_most_freq_genre(7, 'maroon')

#### 2) Plot Top 5 movies with most votes on a line chart

In [None]:
# Rank the titles by vote count (highest first) and keep the top five rows,
# restricted to the title and vote-count columns.
by_votes = df.sort_values(by='No_of_Votes', ascending=False)
q2 = by_votes[['Series_Title', 'No_of_Votes']].head(5)
q2

In [None]:
# fig, ax = plt.subplots()

# We need to draw the canvas, otherwise the labels won't be positioned and
# won't have values yet.
# fig.canvas.draw()

# labels = [item.get_text() for item in ax.get_xticklabels()]
# labels[1] = 'Testing'

# ax.set_xticklabels([100,200,300,400,450,500])

# plt.show()

In [None]:
# a = 2.3e6
# print(a)
# 2300k

In [None]:
# Line chart of the vote counts for the five titles held in q2.
from matplotlib.ticker import FuncFormatter

fig, ax = plt.subplots()

ax.plot(q2['Series_Title'], q2['No_of_Votes'], color='tan',
        marker='o', mfc='maroon', lw=3, markersize=7)
ax.set_title('Top 5 most frequent movies with highest number of votes')
ax.set_xlabel('Series Name')
ax.set_ylabel('Number of Votes')

# Abbreviated titles for the x axis. They are hand-written, so they assume
# q2 holds exactly these five films in this order - confirm if the data
# changes. The tick positions are pinned first so each label is attached to
# an explicit tick.
short_titles = ['Shawshank_Red.', 'Dark Knight',
                'Inception', 'Fight Club', 'Pulp Fiction']
ax.set_xticks(range(len(short_titles)))
ax.set_xticklabels(short_titles)

# Build the y labels from the real tick values (in thousands) instead of a
# fixed list of strings, so each label always matches the tick it sits on.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: f'{value / 1000:.0f}K'))

# plt.grid()
plt.show()

#### 3) Visualize the relationship between the number of votes and Gross revenue

In [None]:
# Scatter plot of vote count (x axis) against gross revenue (y axis).
votes = df['No_of_Votes']
gross = df['Gross']
plt.scatter(x=votes, y=gross)
plt.xlabel('No_of Votes')
plt.ylabel('Gross Revenue')
plt.title('Num_of_Votes vs Gross Revenue')
plt.show()

In [None]:
# Preview the Runtime column.
df[['Runtime']].head()

In [None]:
# Runtime1: Runtime with the ' min' suffix removed, converted to a number.
df['Runtime1'] = pd.to_numeric(df['Runtime'].str.replace(' min', ''))
df.dtypes

In [None]:
# Runtime(hr): Runtime1 divided by 60, rounded to two decimals.
df['Runtime(hr)'] = (df['Runtime1'] / 60).round(2)
df.head()

#### 5) For the top 3 Genres, Depict top N directors for each Genre with highest Gross Revenue on bar chart 

In [None]:
# Genre frequencies (not displayed: only a cell's last expression is shown).
df['Genre'].value_counts()
# Keep only the rows that belong to one of the three selected genres.
selected_genres = ['Drama', 'Drama, Romance', 'Comedy, Drama']
w = df.loc[df['Genre'].isin(selected_genres)]
w.shape

In [None]:
# Five highest-grossing 'Drama' rows, showing director and gross only.
x = w.loc[w['Genre'] == 'Drama']
x1 = x.sort_values(by='Gross', ascending=False)[['Director', 'Gross']].head(5)
x1

In [None]:
def topN_directors_with_highest_Gross(N):
    """Plot, for each of three genres, the N directors with the highest Gross.

    Reads the module-level frame ``w`` and draws one bar chart per genre
    ('Drama', 'Drama, Romance', 'Comedy, Drama'), stacked vertically in a
    single figure.

    A director is ranked by their highest-grossing film in the genre, so each
    director appears at most once per chart. Ranking the raw film rows could
    pick the same director twice, and repeated names land on the same
    categorical bar position and overlap.

    Parameters
    ----------
    N : int
        Number of directors to show per genre.
    """
    # Genre -> bar colour; dict order is the top-to-bottom order of the charts.
    genre_colors = {
        'Drama': 'orange',
        'Drama, Romance': 'lightgreen',
        'Comedy, Drama': 'blue',
    }
    fig, axes = plt.subplots(len(genre_colors), 1, figsize=(6, 15))

    for ax, (genre, color) in zip(axes, genre_colors.items()):
        in_genre = w[w['Genre'] == genre]
        # One value per director (their best film), then the N largest.
        top = in_genre.groupby('Director')['Gross'].max().nlargest(N)
        ax.bar(top.index, top.values, color=color, edgecolor='black')
        ax.set_title(
            f'Top {N} directors with highest Gross Rev in {genre} Genre')

    plt.show()

In [None]:
# Draw the per-genre director charts with N = 4.
topN_directors_with_highest_Gross(4)

In [None]:
# For each of the three genres, print the four highest-grossing rows
# (director and gross only).
for i in ('Drama', 'Drama, Romance', 'Comedy, Drama'):
    x = df.loc[df['Genre'] == i]
    ranked = x.sort_values(by='Gross', ascending=False)
    x1 = ranked[['Director', 'Gross']].head(4)
    print(x1)

In [None]:
# Show the labels of the numerical columns collected earlier.
num_cols

In [None]:
# Side-by-side boxplots (one row, three panels) for three numeric columns.
for position, column in enumerate(
        ['Released_Year', 'IMDB_Rating', 'Meta_score'], start=1):
    plt.subplot(1, 3, position)
    plt.boxplot(x=df[column])

plt.show()

#### Depict boxplot for each numerical feature

In [None]:
# One boxplot per numerical column, laid out three panels to a row.
plt.figure(figsize=(10, 8))
panels_per_row = 3
# Ceiling division: add rows as needed so that any number of numeric columns
# fits, instead of assuming at most six (a fixed 2x3 grid).
n_rows = -(-len(num_cols) // panels_per_row)
for position, column in enumerate(num_cols, start=1):
    plt.subplot(n_rows, panels_per_row, position)
    plt.boxplot(x=df[column])
    plt.title(f'Boxplot for {column}')
plt.show()