# Predicting a film’s gross revenue

## 1. Load the packages

In [1]:
import pandas as pd
from collections import Counter

## 2. Data at first glance

### 2.1 Read the data

In [2]:
# Load both CSV files into pandas DataFrames.
# X1.csv is read with pandas' default header handling.
X1 = pd.read_csv("X1.csv")
# Y1.csv doesn't have a header row, so header detection is switched off
# and the column name is supplied explicitly.
# NOTE(review): the column name 'revenue ' ends with a space -- confirm
# this is intentional before relying on Y1['revenue'] elsewhere.
Y1 = pd.read_csv(
    "Y1.csv",
    names=['revenue '],
    header=None,
)

In [3]:
X1.head()

Unnamed: 0.1,Unnamed: 0,title,img_url,description,ratings,n_votes,is_adult,production_year,runtime,genres,release_year,studio,img_embeddings,text_embeddings
0,2502,Letters to Juliet,https://m.media-amazon.com/images/M/MV5BMjg0OT...,Letters to Juliet: Directed by Gary Winick. Wi...,6.5,92937.0,0,2010,105,"Adventure,Comedy,Drama",2010.0,Sum.,"[0.25030804, 2.4058464, 1.0431569, 0.030648155...","[-0.6795498, 0.35658365, 0.9994932, -0.9793934..."
1,6238,Veil of Tears,https://m.media-amazon.com/images/M/MV5BZjMxOD...,Veil of Tears: Directed by William Gereghty. W...,7.9,11.0,0,1996,\N,"Action,Crime,Drama",2014.0,WF,"[0.51250213, 2.8152602, 0.46308166, 0.29031387...","[-0.6202415, 0.31657028, 0.9992422, -0.9703722..."
2,1800,International Velvet,https://m.media-amazon.com/images/M/MV5BOGVkYj...,International Velvet: Directed by Bryan Forbes...,5.9,1345.0,0,1978,127,"Drama,Family,Sport",1978.0,MGM,"[0.18073043, 0.24735461, 0.63652813, 0.2496522...","[-0.709996, 0.4233521, 0.99980927, -0.98892415..."
3,2675,8 Seconds,https://m.media-amazon.com/images/M/MV5BYjY4Nz...,8 Seconds: Directed by John G. Avildsen. With ...,6.6,4851.0,0,1994,105,"Biography,Drama,Sport",1994.0,NL,"[0.025015268, 0.9105338, 0.3878257, 0.3421247,...","[-0.7416838, 0.38435012, 0.9998453, -0.9874693..."
4,3674,Penitentiary II,https://m.media-amazon.com/images/M/MV5BNjQyZW...,Penitentiary II: Directed by Jamaa Fanaka. Wit...,4.1,549.0,0,1982,108,"Crime,Drama,Sport",1982.0,MGM,"[0.19079691, 1.9068279, 0.29114372, 0.19527505...","[-0.65501904, 0.3845747, 0.9996712, -0.9766391..."


### 2.2 Data visualization

## 3. Data Engineering

### 3.1. Numerical and categorical features

In [4]:
# Partition the column names of X1 by dtype: numeric columns on one side,
# every other column on the other.
# NOTE: `runtime` holds the placeholder "\N" in some rows, so it is not
# numeric and ends up among the categorical features; the two "Unnamed"
# index-like columns are numeric and end up among the numeric features.
numeric_features = list(X1.select_dtypes(include="number").columns)
categorical_features = list(X1.select_dtypes(exclude="number").columns)

In [5]:
# Split X1 column-wise into its numeric part and its non-numeric part.
X1_num = X1.loc[:, numeric_features]
X1_cat = X1.loc[:, categorical_features]

In [7]:
X1_num.head()

Unnamed: 0.1,Unnamed: 0,ratings,n_votes,is_adult,production_year,release_year
0,2502,6.5,92937.0,0,2010,2010.0
1,6238,7.9,11.0,0,1996,2014.0
2,1800,5.9,1345.0,0,1978,1978.0
3,2675,6.6,4851.0,0,1994,1994.0
4,3674,4.1,549.0,0,1982,1982.0


In [8]:
X1_cat.head()

Unnamed: 0,title,img_url,description,runtime,genres,studio,img_embeddings,text_embeddings
0,Letters to Juliet,https://m.media-amazon.com/images/M/MV5BMjg0OT...,Letters to Juliet: Directed by Gary Winick. Wi...,105,"Adventure,Comedy,Drama",Sum.,"[0.25030804, 2.4058464, 1.0431569, 0.030648155...","[-0.6795498, 0.35658365, 0.9994932, -0.9793934..."
1,Veil of Tears,https://m.media-amazon.com/images/M/MV5BZjMxOD...,Veil of Tears: Directed by William Gereghty. W...,\N,"Action,Crime,Drama",WF,"[0.51250213, 2.8152602, 0.46308166, 0.29031387...","[-0.6202415, 0.31657028, 0.9992422, -0.9703722..."
2,International Velvet,https://m.media-amazon.com/images/M/MV5BOGVkYj...,International Velvet: Directed by Bryan Forbes...,127,"Drama,Family,Sport",MGM,"[0.18073043, 0.24735461, 0.63652813, 0.2496522...","[-0.709996, 0.4233521, 0.99980927, -0.98892415..."
3,8 Seconds,https://m.media-amazon.com/images/M/MV5BYjY4Nz...,8 Seconds: Directed by John G. Avildsen. With ...,105,"Biography,Drama,Sport",NL,"[0.025015268, 0.9105338, 0.3878257, 0.3421247,...","[-0.7416838, 0.38435012, 0.9998453, -0.9874693..."
4,Penitentiary II,https://m.media-amazon.com/images/M/MV5BNjQyZW...,Penitentiary II: Directed by Jamaa Fanaka. Wit...,108,"Crime,Drama,Sport",MGM,"[0.19079691, 1.9068279, 0.29114372, 0.19527505...","[-0.65501904, 0.3845747, 0.9996712, -0.9766391..."


## Categorical data processing

This section processes the `genres` and `studio` columns.

### studio column processing

The `studio` column is transformed into the relative frequency with which each studio appears in this dataset.

In [None]:
# Rebind X1_cat to just the two columns handled in this section, taken
# straight from X1.
X1_cat = X1.loc[:, ["genres", "studio"]]

In [None]:
X1_cat

In [None]:
X1_cat['studio'].value_counts()

In [None]:
X1_cat.describe()

In [None]:
# X1_cat['studio'] = reduce_cardinality(dataset=X1, column_name='studio', threshold=0.72, inplace=False)

In [None]:
# for i in range(len(X1['runtime'])):
#     try:
#         int(X1['runtime'].iloc[i].strip())
#     except:
#         X1['runtime'].iloc[i] = 0

In [None]:
# Flatten the comma-separated `genres` strings of X1 into one flat list of
# genre labels (duplicates are kept, row order is preserved).
# Missing entries are dropped first: NaN is a float and has no .split(),
# so it would otherwise raise AttributeError. The comprehension also avoids
# leaking loop variables into the notebook namespace.
s = [
    genre
    for genres in X1['genres'].dropna()
    for genre in genres.split(',')
]

In [None]:
set(s)

In [None]:
def catToFrequency(dataset, column_name, inplace=False):
    """Encode a column as the relative frequency of each of its values.

    Every value is replaced by ``occurrences of that value / number of rows``.
    Missing values count as a category of their own, so they receive a
    frequency as well instead of breaking the lookup.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to encode.
    column_name : str
        Name of the column to encode.
    inplace : bool, default False
        If True, overwrite ``dataset[column_name]`` with the encoded values
        and return ``None``. If False, leave ``dataset`` untouched and return
        the encoded column.

    Returns
    -------
    pandas.Series or None
        The frequency-encoded column (same index and name as the input
        column), or ``None`` when ``inplace`` is True.
    """
    column = dataset[column_name]

    # dropna=False keeps NaN in the lookup table; dividing by len(column)
    # (rather than normalize=True) keeps the denominator equal to the total
    # number of rows.
    frequencies = column.value_counts(dropna=False) / len(column)

    # Vectorized lookup of each row's value in the frequency table.
    new_column = column.map(frequencies)

    if inplace:
        dataset[column_name] = new_column
        return None
    return new_column

In [None]:
# Encode `studio` from the raw X1 column rather than from X1_cat itself, so
# re-running this cell always yields the same frequencies instead of
# re-encoding values that have already been turned into frequencies.
X1_cat['studio'] = catToFrequency(X1, 'studio')

In [None]:
X1_cat

In [None]:
def reduce_cardinality(dataset, column_name, threshold, inplace=False):
    """Collapse the rarest categories of a column into a single 'Other' level.

    Categories are kept from most to least frequent until their cumulative
    count reaches ``int(threshold * len(column))``; every value outside the
    kept categories is replaced by the string 'Other'. The most frequent
    category is always kept when the column is non-empty.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to process.
    column_name : str
        Name of the column whose cardinality is reduced.
    threshold : float
        Share of rows (e.g. 0.72) that the kept categories must cover.
    inplace : bool, default False
        If True, overwrite ``dataset[column_name]`` and return ``None``.
        If False, leave ``dataset`` untouched and return the new column.

    Returns
    -------
    pandas.Series or None
        The column with rare categories replaced by 'Other', or ``None``
        when ``inplace`` is True.
    """
    column = dataset[column_name]
    threshold_freq = int(threshold * len(column))

    covered_rows = 0
    # A set gives constant-time membership tests in the per-row lookup below;
    # 'Other' is included so existing 'Other' values pass through unchanged.
    categories_kept = {'Other'}

    # most_common() already yields each level with its count, so the count is
    # used directly instead of being looked up again.
    for level, freq in Counter(column).most_common():
        covered_rows += freq
        categories_kept.add(level)

        if covered_rows >= threshold_freq:
            break

    new_column = column.apply(lambda x: x if x in categories_kept else 'Other')

    if inplace:
        dataset[column_name] = new_column
        return None
    return new_column