# Predicting a film’s gross revenue

## 1. Load the packages

In [None]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time

### 2.1 Read the data

In [None]:
# Load the feature table and the target into pandas DataFrames.
# Y1.csv has no header row, so the column name is supplied explicitly here.
# NOTE(review): the target column name 'revenue ' ends with a space, and the
# later cells index the column by that exact string.
X1 = pd.read_csv("X1.csv")
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue '])

In [None]:
X1.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
X1.columns

In [None]:
X1.head()

In [None]:
Y1.head()

### 2.2 Numerical and categorical features

In [None]:
X1.info()

In [None]:
# Split the column names by dtype: numeric columns vs. everything else.
numeric_features = X1.select_dtypes(include="number").columns.tolist()
non_numeric_features = X1.select_dtypes(exclude="number").columns.tolist()
# Columns set aside as their own group (image/text sources and their embeddings).
embedding_features = ['img_url', 'description', 'img_embeddings', 'text_embeddings']

In [None]:
# `is_adult` has a numeric dtype but is handled as a categorical feature.
# The guard keeps this cell re-runnable: list.remove raises ValueError once the
# name has already been taken out.
if 'is_adult' in numeric_features:
    numeric_features.remove('is_adult')

# Every non-numeric column except the embedding-related ones, plus `is_adult`.
# Built with a filter instead of calling list.remove inside a comprehension,
# which was executed purely for its side effect.
categorical_features = [
    col for col in non_numeric_features if col not in embedding_features
]
categorical_features.append('is_adult')

In [None]:
numeric_features, categorical_features, embedding_features

In [None]:
# Take independent copies of each feature group. Later cells add and overwrite
# columns on these frames; writing to a bare selection of X1 triggers pandas'
# SettingWithCopyWarning, and an explicit copy makes the intent unambiguous.
X1_num = X1.loc[:, numeric_features].copy()
X1_cat = X1.loc[:, categorical_features].copy()
X1_embed = X1.loc[:, embedding_features].copy()

In [None]:
X1_num.head()

In [None]:
X1_cat.head()

In [None]:
X1_embed.head()

### 2.3 Data visualization

In [None]:
sns.pairplot(X1, diag_kind="kde")

#### 2.3.1 Revenue histogram

In [None]:
sns.set_style("whitegrid")
sns.displot(data=Y1, x='revenue ', kind='kde')
plt.show()

In [None]:
# Log-transform the revenue with log1p(x) = ln(1 + x): a revenue of exactly 0
# maps to 0, while a missing (NaN) revenue stays NaN.
Y1['log_revenue'] = np.log1p(Y1['revenue '])

In [None]:
Y1.head()

In [None]:
# Side-by-side histograms (with KDE overlay) of the raw and log-transformed revenue.
# Both panels are created in a single plt.subplots call; the previous version
# first created one full-size axes and then added two plt.subplot panels on
# top of it, leaving an unused axes underneath.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

sns.histplot(Y1['revenue '], kde=True, ax=axes[0])
axes[0].set_title('Distribution of revenue')

sns.histplot(Y1['log_revenue'], kde=True, ax=axes[1])
axes[1].set_title('Distribution of log transformation of revenue')

plt.show()

In [None]:
# log1p maps a revenue of exactly 0 to 0, so a zero in log_revenue marks a zero revenue.
# NOTE(review): NaN never compares equal to 0, so truly missing values are not caught here.
zero_revenue_count = np.sum(Y1['log_revenue'] == 0)
if zero_revenue_count == 0:
    print("There isn\'t null value for the revenue")
else:
    print("There exists null value for the revenue")

#### 2.3.2 Revenue vs. ratings

In [None]:
# Ratings against revenue: raw revenue on the left, log1p(revenue) on the right.
ratings_fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(16, 8))

raw_ax.scatter(X1_num['ratings'], Y1['revenue '])
raw_ax.set_title('Revenue vs ratings fig(1)')
raw_ax.set_xlabel('Ratings')
raw_ax.set_ylabel('Revenue')

log_ax.scatter(X1_num['ratings'], Y1['log_revenue'])
log_ax.set_title('Log revenue vs ratings fig(2)')
log_ax.set_xlabel('Ratings')
log_ax.set_ylabel('$log(1 + Revenue)$')

In [None]:
# log1p(x) is 0 only when x is 0, so this counts the ratings that are exactly 0.
# NOTE(review): NaN ratings are not detected by this comparison.
zero_rating_count = np.sum(np.log1p(X1_num['ratings']) == 0)
if zero_rating_count == 0:
    print("There isn\'t null value for the ratings")
else:
    print("There exists null value for the ratings")

#### 2.3.3 Revenue vs. votes

In [None]:
# log1p(n_votes) = ln(1 + n_votes); stored as a new numeric feature and used
# for the right-hand panel below.
X1_num['log_votes'] = np.log1p(X1_num['n_votes'])

# Votes against revenue: raw values on the left, log1p of both on the right.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.scatter(X1_num['n_votes'], Y1['revenue '])
plt.title('Revenue vs votes fig(1)')
plt.xlabel('n_votes')
plt.ylabel('Revenue')
plt.subplot(1, 2, 2)
plt.scatter(X1_num['log_votes'], Y1['log_revenue'])
plt.title('Log revenue vs log votes fig(2)')
# Raw string: '\_' is a mathtext escape, not a Python one. In a normal string
# literal it is an invalid escape sequence, which newer Python versions warn about.
plt.xlabel(r'$log(1 + n\_votes)$')
plt.ylabel('$log(1 + Revenue)$')

In [None]:
# log_votes is log1p(n_votes), which is 0 only for films with exactly 0 votes.
# NOTE(review): NaN vote counts are not detected by this comparison.
zero_votes_count = np.sum(X1_num['log_votes'] == 0)
if zero_votes_count == 0:
    print("There isn\'t null value for the votes")
else:
    print("There exists null value for the votes")

#### 2.3.4 Revenue vs. production year

In [None]:
X1_num['production_year'].unique()

In [None]:
plt.figure(figsize=(16, 8))

# Revenue per production year, with the points coloured by year.
# x and y are passed by keyword: seaborn's scatterplot only accepts `data`
# positionally, so positional x/y are rejected by current releases.
sns.scatterplot(
    x=X1_num['production_year'],
    y=Y1['revenue '],
    c=X1_num['production_year'],
)
# The title previously read "Revenue vs votes", copied from the votes plot.
plt.title('Revenue vs production year')
plt.xlabel('production_year')
plt.ylabel('Revenue')

#### 2.3.5 Revenue vs. release year


In [None]:
X1_num['release_year'].unique()

In [None]:
plt.figure(figsize=(16, 8))

# Revenue per release year, with the points coloured by year.
# x and y are passed by keyword: seaborn's scatterplot only accepts `data`
# positionally, so positional x/y are rejected by current releases.
sns.scatterplot(
    x=X1_num['release_year'],
    y=Y1['revenue '],
    c=X1_num['release_year'],
)
# The title previously read "Revenue vs votes", copied from the votes plot.
plt.title('Revenue vs release year')
plt.xlabel('release_year')
plt.ylabel('Revenue')

#### 2.3.6 Revenue vs. title

In [None]:
# Print every title that is at most four characters long, to eyeball
# suspiciously short entries in the column.
short_titles = (title for title in X1_cat['title'] if len(title) <= 4)
for title in short_titles:
    print(title)

There are no null values in the `title` column.

## 3. Data Engineering

### 3.0 Missing Value

In [None]:
X1_cat['genres'].head()

In [None]:
np.sum(X1_cat['genres'] == "\\N")

In [None]:
# Row labels of the films whose genres field holds the "\N" placeholder.
# The drop below is left disabled: the placeholder is overwritten with
# "Others" in the next cell instead of removing the rows.
remove_id = X1_cat[X1_cat['genres'] == "\\N"].index
# X1_cat.drop(remove_id, axis=0, inplace=True)

In [None]:
X1_cat.loc[X1_cat['genres'] == "\\N", "genres"] = "Others"

In [None]:
X1_cat

### 3.2 Categorical data processing

`title`, `runtime`, `genres`, `studio` and `is_adult` columns

In [None]:
X1_cat.head()

### 3.2.1 Run Time processing

Replace the "\\N" placeholders in the `runtime` column with the median runtime, then move the column to the numerical data.

In [None]:
X1_cat['runtime'].describe()

In [None]:
np.sum(X1_cat['runtime'] == '\\N')

In [None]:
X1_cat.head()

In [None]:
# Median runtime over the rows that actually report one
# ("\N" is the placeholder for a missing runtime).
known_runtimes = X1_cat.loc[X1_cat['runtime'] != '\\N', 'runtime'].astype(np.int64)
median_runtime = np.median(known_runtimes)

In [None]:
# Substitute the median for every "\N" placeholder, then cast the whole column to int64.
# NOTE(review): median_runtime comes from np.median and may be fractional; the
# integer cast would then drop the fractional part — confirm this is intended.
X1_cat['runtime'] = np.where(X1_cat['runtime'] == '\\N', median_runtime, X1_cat['runtime']).astype(np.int64)

In [None]:
X1_num['runtime'] = X1_cat['runtime']

In [None]:
X1_cat.drop(['runtime'], axis=1, inplace=True)

In [None]:
X1_num.head()

In [None]:
X1_cat.head()

### 3.2.2 Studio column processing

The `studio` column is transformed into the relative frequency with which each studio appears in this dataset, and is then moved to the numerical dataset.

There are no NaN values in the `studio` column.

In [None]:
X1_cat['studio'].value_counts()

In [None]:
def catToFrequency(dataset, column_name, inplace=False):
    """Encode a categorical column as the relative frequency of each category.

    Every value in ``dataset[column_name]`` is replaced by the share of rows
    in which that value occurs (its count divided by the column length).

    Args:
        dataset: DataFrame holding the column to encode.
        column_name: Name of the column to encode.
        inplace: When true, overwrite the column in ``dataset`` and return
            ``None``; otherwise leave ``dataset`` untouched and return the
            encoded column.

    Returns:
        The encoded Series, or ``None`` when ``inplace`` is true.
    """
    values = dataset[column_name]
    n_rows = len(values)

    # Map each distinct category to count / number of rows.
    share_by_category = {
        label: occurrences / n_rows
        for label, occurrences in Counter(values).items()
    }

    encoded = values.apply(lambda label: share_by_category[label])

    if not inplace:
        return encoded

    dataset[column_name] = encoded
    return None

In [None]:
X1_cat['studio'] = catToFrequency(X1_cat, 'studio')

In [None]:
X1_num['studio_freq'] = X1_cat['studio']

In [None]:
X1_num.head()

In [None]:
X1_cat.drop(['studio'], axis=1, inplace=True)

In [None]:
X1_cat.head()

### 3.2.3 Genres processing

Four rows have no genres. They are not removed: their "\\N" placeholder was replaced with "Others" in section 3.0 (Missing Value).

In [None]:
X1_cat['genres'].head()

In [None]:
np.sum(X1_cat['genres'] == "\\N")

In [None]:
# Row labels of any film still carrying the "\N" placeholder in genres.
# This only displays the index; no rows are removed here.
X1_cat[X1_cat['genres'] == "\\N"].index

In [None]:
# Flatten the comma-separated genre strings into one list that holds every
# genre occurrence across all rows.
generes_list = [
    genre
    for genres_field in X1_cat['genres']
    for genre in genres_field.split(',')
]

In [None]:
set(generes_list)

In [None]:
X1_cat['genres_split'] = X1_cat['genres'].apply(lambda x: x.split(","))

In [None]:
# One-hot encode the genre lists: one 0/1 column per distinct genre.
mlb = MultiLabelBinarizer()
# Reuse X1_cat's index so that the later column-wise concat with X1_cat lines
# up row by row even when X1_cat does not carry a default 0..n-1 index.
# Without it the encoder frame gets a fresh RangeIndex and rows can misalign.
genere_encoder = pd.DataFrame(
    mlb.fit_transform(X1_cat['genres_split']),
    columns=mlb.classes_.tolist(),
    index=X1_cat.index,
)

In [None]:
genere_encoder

In [None]:
X1_cat

In [None]:
# Append the one-hot genre columns to X1_cat and drop the raw genre columns.
# concat with axis=1 aligns the two frames on their index labels.
X1_cat = pd.concat([X1_cat, genere_encoder], axis=1).drop(['genres', 'genres_split'], axis=1)

In [None]:
X1_cat