In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from xgboost import XGBRegressor

# Load the box-office dataset from a CSV file (path is relative to the
# notebook's working directory) and preview its last five rows.
df = pd.read_csv('boxoffice.csv')
df.tail()


Unnamed: 0,title,domestic_revenue,world_revenue,distributor,opening_revenue,opening_theaters,budget,MPAA,genres,release_days
2689,X-Men,38153242,47601900,Disney,140965221,3532,106955513,G,Comedy,92
2690,The Matrix,63305093,252473529,Disney,190634982,3171,111238142,PG-13,Comedy,156
2691,Forrest Gump,271758510,524896297,Disney,112771730,2450,112364620,NC-17,Drama,146
2692,Jaws,268259149,791515453,Disney,194172443,661,172229703,G,Thriller,170
2693,Interstellar,158336835,77904664,Universal,132903825,3914,93959844,PG-13,Comedy,170


In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2694 entries, 0 to 2693
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             2694 non-null   object
 1   domestic_revenue  2694 non-null   int64 
 2   world_revenue     2694 non-null   int64 
 3   distributor       2694 non-null   object
 4   opening_revenue   2694 non-null   int64 
 5   opening_theaters  2694 non-null   int64 
 6   budget            2694 non-null   int64 
 7   MPAA              2694 non-null   object
 8   genres            2694 non-null   object
 9   release_days      2694 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 210.6+ KB


In [None]:
# (rows, columns) of the frame as loaded.
df.shape

(2694, 10)

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# column of the data becomes one row of the table.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
domestic_revenue,2694.0,150853900.0,85799970.0,827765.0,76732720.0,152572473.5,224731400.0,299810700.0
world_revenue,2694.0,771917700.0,428441400.0,3261301.0,399113500.0,766555142.5,1143920000.0,1499310000.0
opening_revenue,2694.0,99713210.0,57217620.0,138569.0,50338480.0,99814175.0,148980500.0,199933600.0
opening_theaters,2694.0,2263.039,1298.559,10.0,1161.0,2271.5,3392.25,4499.0
budget,2694.0,153799600.0,85576240.0,5197977.0,78618760.0,156256716.0,227483200.0,299859700.0
release_days,2694.0,90.89755,50.89404,1.0,47.0,91.0,135.0,179.0


In [None]:
# The same summary statistics in the default orientation (one statistic per row).
df.describe()

Unnamed: 0,domestic_revenue,world_revenue,opening_revenue,opening_theaters,budget,release_days
count,2694.0,2694.0,2694.0,2694.0,2694.0,2694.0
mean,150853900.0,771917700.0,99713210.0,2263.039347,153799600.0,90.89755
std,85799970.0,428441400.0,57217620.0,1298.55914,85576240.0,50.894041
min,827765.0,3261301.0,138569.0,10.0,5197977.0,1.0
25%,76732720.0,399113500.0,50338480.0,1161.0,78618760.0,47.0
50%,152572500.0,766555100.0,99814180.0,2271.5,156256700.0,91.0
75%,224731400.0,1143920000.0,148980500.0,3392.25,227483200.0,135.0
max,299810700.0,1499310000.0,199933600.0,4499.0,299859700.0,179.0


In [None]:
# Remove 'world_revenue' and 'opening_revenue' from the frame.
# NOTE(review): the reason for dropping them is not stated in the notebook --
# presumably they are too closely tied to the 'domestic_revenue' value being
# modelled; confirm and record the rationale here.
#
# The frame is reassigned (rather than mutated with inplace=True) and
# errors='ignore' is passed so that the cell is idempotent: re-running it after
# the columns are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['world_revenue', 'opening_revenue'], errors='ignore')
df.head()

Unnamed: 0,title,domestic_revenue,distributor,opening_theaters,budget,MPAA,genres,release_days
0,The Avengers,6026491,Warner Bros.,253,174687337,R,Animation,16
1,Titanic,169411543,Disney,122,103948486,G,Action,103
2,Jurassic Park,107836098,Sony,3826,122104991,NC-17,Horror,89
3,Avatar,51433697,Disney,3868,46431596,G,Horror,85
4,The Lion King,142791649,Warner Bros.,2934,203513696,R,Comedy,158


In [None]:
# (rows, columns) after the column drop in the previous cell.
df.shape

(2694, 8)

In [None]:
# Number of missing values in each column.
df.isna().sum()


title               0
domestic_revenue    0
distributor         0
opening_theaters    0
budget              0
MPAA                0
genres              0
release_days        0
dtype: int64

In [None]:
# Impute missing entries in the two categorical columns with each column's
# most frequent value, then show how many missing cells remain in the frame.
for cat_col in ('MPAA', 'genres'):
    most_frequent = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_frequent)

df.isna().sum().sum()


np.int64(0)

In [None]:
# Make sure the numeric columns really hold numbers.
#
# Values that arrive as formatted text (e.g. '$1,234,567') have their '$' and
# ',' characters removed before parsing; anything that still cannot be parsed
# becomes NaN (errors='coerce').
#
# The symbols are stripped by pattern, not by position: when a column is
# already numeric, slicing off "the first character" would delete the leading
# digit of every value. Pattern-based cleaning is also idempotent, so the cell
# can be re-run safely.
for col in ['domestic_revenue', 'opening_theaters', 'release_days']:
    as_text = df[col].astype(str).str.strip()
    as_text = as_text.str.replace(r'[$,]', '', regex=True)

    # Assign the whole column in one step; writing through df[mask][col]
    # would only modify a temporary copy and leave df unchanged.
    df[col] = pd.to_numeric(as_text, errors='coerce')