In [None]:
# Install requirements
!pip install pandas
!pip install seaborn
!pip install matplotlib
!pip install wordcloud
!pip install scikit-learn

In [None]:
# Load the dataset into the kernel and display its first 5 rows
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv("BoxOfficePrediction/train.csv")
df.head(n=5)

In [None]:
# Show the column labels of the data set (rendered as this cell's output)
df.columns

In [None]:
# List the columns whose dtype is neither float64 nor int64
non_numeric_frame = df.select_dtypes(exclude=['float64', 'int64'])
non_numeric_columns = non_numeric_frame.columns
print("Non-numeric columns:", non_numeric_columns)

In [None]:
# Scatterplot to look for a relationship between budget and revenue
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(1, 2, 1)  # first slot of a 1x2 grid on the 16x8 figure
ax.scatter(df['budget'], df['revenue'])
ax.set_title('Revenue vs budget fig(1)');

In [None]:
# Categorical scatterplot to check whether movies with an official website have higher revenue.
# has_homepage is 1 where `homepage` holds a value and 0 where it is missing.
df['has_homepage'] = df['homepage'].notna().astype('int64')
sns.catplot(data=df, x='has_homepage', y='revenue');
plt.title('Revenue for movie with and w/o homepage');

In [None]:
# Word cloud of the most frequent words in the movie titles
start = time.time()
plt.figure(figsize=(12, 12))
# Join every title into one space-separated string for WordCloud to process.
# Missing titles are dropped and the rest coerced to str first, because
# str.join raises TypeError on non-string items such as NaN.
token_title = ' '.join(df['original_title'].dropna().astype(str))
# Background set to white for good contrast; the default background is darker
wordcloud = WordCloud(max_font_size=None, background_color='white', width=1200, height=1000).generate(token_title)
plt.imshow(wordcloud)
plt.title('Top words from movie titles ')
plt.axis("off")  # axes are not needed for an image
plt.show()
# The elapsed time covers everything in this cell, including rendering the figure
print(" Time taken to complete this operation is", time.time() - start, 'seconds')

In [None]:
# Build the modelling frame from the numeric columns only
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Chained drop instead of drop(..., inplace=True): in-place mutation of a frame
# derived from `df` can emit SettingWithCopyWarning on some pandas versions.
df_train = (
    df.select_dtypes(include=numerics)
    .drop(columns=['id'])  # 'id' is removed so it is not used as a feature
)
# Fill missing values with the median of their own column.
# NOTE(review): the medians are computed over all rows and over every column,
# 'revenue' included — confirm this is intended before the train/test split.
df_train = df_train.fillna(df_train.median())

In [None]:
# Separate the target from the predictors, then hold out a test split
y = df_train['revenue']  # value the models are trained to predict
X = df_train.drop(columns=['revenue'])  # every remaining column is a feature
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# First model: linear regression fitted on the training split
lm = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
lm_preds = lm.predict(X_test)
# Score the held-out predictions against the true revenue values
print("R Square: ", r2_score(y_test, lm_preds))

In [None]:
# Let's also attempt with random forest
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
# random_state is fixed so the fitted forest is the same on every run
RF_model = RandomForestRegressor(random_state =0, n_estimators=500, max_depth=10)
RF_model.fit(X_train, y_train)
y_hat = RF_model.predict(X_test)
# r2_score takes (y_true, y_pred). The metric is not symmetric, so passing the
# predictions first reports a different (incorrect) score.
print ("R-Squared is:", metrics.r2_score(y_test, y_hat))

In [None]:
# Which features does the random forest weight most heavily?
import numpy as np
importances = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': np.round(RF_model.feature_importances_, 3),  # rounded to 3 decimals
    })
    .sort_values('importance', ascending=False)  # largest importance first
    .set_index('feature')
)
print(importances)
importances.plot.bar();

In [None]:
# Linear regression predictions for the test data.
# Despite the `gbr_` prefix, these values come from the linear regression model `lm`.
revenue_predictions = lm.predict(X_test)
gbr_predictions = pd.DataFrame({'predicted_revenue': revenue_predictions})
gbr_predictions.head()

In [None]:
# Random forest predictions for the test data.
# Despite the `gbr_` prefix, these values come from the random forest model `RF_model`.
revenue_predictions = RF_model.predict(X_test)
gbr_predictions = pd.DataFrame({'predicted_revenue': revenue_predictions})
gbr_predictions.head()