## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the Dataset

In [2]:
movies=pd.read_csv('HollywoodMovies.csv', header=0)

In [3]:
print('Order: ', movies.shape)
print('Attributes: ', movies.columns.tolist())
print('Indexes: ',min(movies.index.tolist()),' to ',max(movies.index.tolist()))
print(movies.head(10))

Order:  (970, 16)
Attributes:  ['Movie', 'LeadStudio', 'RottenTomatoes', 'AudienceScore', 'Story', 'Genre', 'TheatersOpenWeek', 'OpeningWeekend', 'BOAvgOpenWeekend', 'DomesticGross', 'ForeignGross', 'WorldGross', 'Budget', 'Profitability', 'OpenProfit', 'Year']
Indexes:  0  to  969
                                       Movie   LeadStudio  RottenTomatoes  \
0                               Spider-Man 3         Sony            61.0   
1                            Shrek the Third    Paramount            42.0   
2                               Transformers    Paramount            57.0   
3   Pirates of the Caribbean: At World's End       Disney            45.0   
4  Harry Potter and the Order of the Phoenix  Warner Bros            78.0   
5                                I Am Legend  Warner Bros            69.0   
6                       The Bourne Ultimatum    Universal            93.0   
7         National Treasure: Book of Secrets       Disney            31.0   
8                    Alv

In [5]:
#create columns
columns=movies.columns.tolist()

## Printing Summary

In [6]:
print(movies.describe())
# statistical summary

       RottenTomatoes  AudienceScore  TheatersOpenWeek  OpeningWeekend  \
count      913.000000     907.000000        949.000000      969.000000   
mean        51.707558      61.271224       2495.246575       20.620826   
std         26.821268      16.602457       1164.223356       25.338689   
min          0.000000      19.000000          1.000000        0.010000   
25%         28.000000      49.000000       2054.000000        5.300000   
50%         52.000000      61.000000       2798.000000       13.150000   
75%         75.000000      74.000000       3285.000000       26.200000   
max         99.000000      96.000000       4468.000000      207.440000   

       BOAvgOpenWeekend  DomesticGross  ForeignGross   WorldGross      Budget  \
count        945.000000     970.000000    876.000000   914.000000  897.000000   
mean        8562.593651      68.162544    101.237419   169.009945   56.117168   
std        10645.135565      80.409969    155.987696   227.744380   53.755914   
min      

In [None]:
print(movies.info())
# logical summary

## Check Missing Values

In [10]:
print(movies.isna().sum())

Movie               0
LeadStudio          0
RottenTomatoes      0
AudienceScore       0
Story               0
Genre               0
TheatersOpenWeek    0
OpeningWeekend      0
BOAvgOpenWeekend    0
DomesticGross       0
ForeignGross        0
WorldGross          0
Budget              0
Profitability       0
OpenProfit          0
Year                0
dtype: int64


## Cleaning Missing Values

In [8]:
for column in columns:
    if((movies[column].dtype=='float64') and (movies[column].isna().sum()>0)):
        movies[column].fillna(movies[column].mean(), inplace=True)
# filling missing floats with mean

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies[column].fillna(movies[column].mean(), inplace=True)


In [9]:
movies.dropna(inplace=True, axis=0)
# dropping string nas and year nas

In [11]:
print('Order: ', movies.shape)
# check order

Order:  (630, 16)


## Plot Analysis

#### Histogram

In [None]:
plt.hist(movies['RottenTomatoes'], bins=40, label='Rotten Tomatoes Score')
plt.title('Rotten Tomatoes Score Distribution')
plt.show()
# histogram showing distribution of rotten tomatoes scores

In [None]:
plt.hist(movies['Profitability'], bins=40, label='Profitability')
plt.title('Profitability Distribution')
plt.show()
# histogram showing distribution of profitabilty

#### Countplot

In [None]:
plt.figure(figsize=(15,15))
sns.countplot(data=movies, x='Genre')
plt.show()
# number of movies in each genre

#### Scatterplot

In [None]:
plt.figure(figsize=(15,15))
sns.scatterplot(x=movies['Budget'], y=movies['WorldGross'])
plt.xlabel('Budget')
plt.ylabel('Gross')
plt.title('Gross v/s Budget')
plt.show()
# plot for gross relation with budget

#### Hexbinplot

In [None]:
plt.figure(figsize=(10,10))
sns.jointplot(kind='hex', x=movies['Budget'], y=movies['WorldGross'], color='black')
plt.xlabel('Budget')
plt.ylabel('Gross')
plt.title('Gross v/s Budget')
plt.tight_layout()
plt.show()
# plot for gross relation with budget

## Group Analysis

In [None]:
genre_gross=movies.groupby('Genre')['WorldGross'].mean().tolist()
genres=movies['Genre'].drop_duplicates().sort_values().tolist()
plt.figure(figsize=(15,10))
plt.bar(x=genres, height=genre_gross)
plt.ylabel('Avg Gross')
plt.title('Avg Gross by Genre')
plt.show()
# genre mean grosses

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x='Genre', y='AudienceScore', data=movies)
plt.xticks(rotation=90)
plt.title('Audience Score by Genre')
plt.show()
# audience scores by genre

## Removal of Outliers

In [None]:
sns.boxplot(x=movies['Profitability'])
plt.show()
# check if outliers present

In [None]:
Q1,Med,Q3=np.percentile(movies['Profitability'],[25,50,75])
IQR=Q3-Q1
minimum=Q1-1.5*IQR
maximum=Q3+1.5*IQR
movies=movies[(movies['Profitability']>=minimum) & (movies['Profitability']<=maximum)]