# Movie Gross Revenue Analysis
### Data Cleaning

In [2]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load datasets
df1 = pd.read_csv('bom.movie_gross.csv')
df2 = pd.read_csv('rt.movie_info.tsv', sep='\t')

In [4]:
df1.head()


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [5]:
df2.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [6]:
print(df1.shape)
print(df2.shape)

(3387, 5)
(1560, 12)


#### The dataset has  3387 rows and 5 columns

In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [8]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
domestic_gross,3359.0,28745850.0,66982500.0,100.0,120000.0,1400000.0,27900000.0,936700000.0
year,3387.0,2013.958,2.478141,2010.0,2012.0,2014.0,2016.0,2018.0


### Dealing with df1

In [9]:
# Chwck missing values in df1
missing_values = df1.isnull().sum()
missing_values

title                0
studio               5
domestic_gross      28
foreign_gross     1350
year                 0
dtype: int64

In [10]:
# Identify the movies with missing studios
missing_studio_movies = df1[df1['studio'].isnull()][['title', 'year']]
print("Movies needing studio research")
display(missing_studio_movies)

Movies needing studio research


Unnamed: 0,title,year
210,Outside the Law (Hors-la-loi),2010
555,Fireflies in the Garden,2011
933,Keith Lemon: The Film,2012
1862,Plot for Peace,2014
2825,Secret Superstar,2017


In [11]:
# update missing studios with information from the internet
studio_updates = {
    'Outside the Law (Hors-la-loi)': 'StudioCanal',
    'Fireflies in the Garden': 'Senator Entertainment Inc',
    'Keith Lemon: The Film': 'Starz Entertainment',
    'Plot for Peace': 'Caramel Film',
    'Secret Superstar': 'Zee Studios'
}

# Update studio
for title, studio in studio_updates.items():
    df1.loc[df1['title'] == title, 'studio'] = studio

In [12]:
# Drop foreign_gross 
cleaned_df1 =  df1.drop(columns=["foreign_gross"])

In [13]:
# Drop misssing rows in domestic gross
cleaned_df1 = cleaned_df1.dropna(subset=['domestic_gross'])


In [14]:
null_values = cleaned_df1.isnull().sum()
null_values

title             0
studio            0
domestic_gross    0
year              0
dtype: int64

In [15]:
# Check for fully duplicated rows
df1.duplicated().sum()  

0

#### There are no duplicates in the dataset

### df2

In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [22]:
missing_values = df2.isnull().sum()
missing_values

id                 0
synopsis           0
rating             3
genre              8
director           0
writer             0
theater_date     359
dvd_date         359
currency        1220
box_office      1220
runtime           30
studio             0
dtype: int64

In [24]:
#Fill studio, director, writer, synopsis, theater_date,and dvd_date with unknown
df2['studio'] = df2['studio'].fillna('Unknown')


In [32]:
#fill in director and writer 
df2['director'].fillna('unknown', inplace=True)
df2['writer'].fillna('unknown', inplace=True)
df2['synopsis'].fillna('unknown', inplace=True)
df2['writer'].fillna('unknown', inplace=True)
df2['theater_date'].fillna('unknown', inplace=True)
df2['dvd_date'].fillna('unknown', inplace=True)
df2['studio'] = df2['studio'].fillna('Unknown')

missing_values

id                 0
synopsis           0
rating             3
genre              8
director           0
writer             0
theater_date     359
dvd_date         359
currency        1220
box_office      1220
runtime           30
studio             0
dtype: int64

In [35]:
# fill rating with mode
# Calculate the mode of the 'rating' column
rating_mode = df2['rating'].mode()[0]  # [0] takes the first mode if multiple exist


# Fill missing values with the mode
df2['rating'] = df2['rating'].fillna(rating_mode)
rating_mode

'R'

In [37]:
missing_values = df2.isnull().sum()
missing_values

id                 0
synopsis           0
rating             0
genre              8
director           0
writer             0
theater_date       0
dvd_date           0
currency        1220
box_office      1220
runtime           30
studio             0
dtype: int64

In [40]:
# Get rows where 'genre' is NaN/None
nan_genres = df2[df2['genre'].isna()]
print("Rows with NaN genres:")
print(nan_genres)

Rows with NaN genres:
        id                                           synopsis rating genre  \
10      17                                            unknown      R   NaN   
131    167                                            unknown      R   NaN   
222    289                                            unknown     NR   NaN   
250    327  When a new robot, Raymond, defeats the three h...     NR   NaN   
658    843  Miners want to drill for billions of dollars w...     NR   NaN   
1082  1393  Steven Seagal plays an expert sniper on a spec...      R   NaN   
1342  1736                                            unknown     NR   NaN   
1543  1982                                            unknown      R   NaN   

           director         writer theater_date dvd_date currency box_office  \
10          unknown        unknown      unknown  unknown      NaN        NaN   
131         unknown        unknown      unknown  unknown      NaN        NaN   
222         unknown        unknown 