In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction import DictVectorizer

def convert_list_of_dict_str_to_list_of_dict(x):
    """Parse a stringified list of dicts into a real Python list.

    Args:
        x: A cell value. A ``str`` is parsed as a Python literal; a ``list``
            (an already-parsed value) is returned unchanged; anything else
            (e.g. NaN for a missing value) yields an empty list.

    Returns:
        list: The parsed list, or ``[]`` when there is nothing to parse.

    Raises:
        ValueError, SyntaxError: If a string is not a valid Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    if isinstance(x, list):
        # Already parsed (e.g. the cell was re-run): keep the data instead of wiping it.
        return x
    if isinstance(x, str):
        # literal_eval only accepts literals, so text read from the CSV can never execute code.
        return ast.literal_eval(x)
    return []

# Sentence-embedding checkpoint used for the text columns.
# Alternative checkpoint kept for reference: 'paraphrase-xlm-r-multilingual-v1'
model_name = 'paraphrase-multilingual-mpnet-base-v2'
multilingual_model = SentenceTransformer(model_name)

# Training data, path relative to the notebook's working directory.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)

In [2]:
def convert(column):
    """Overwrite ``df[column]`` with its parsed list-of-dict values.

    Args:
        column: Name of the column of the global ``df`` whose values are
            passed through ``convert_list_of_dict_str_to_list_of_dict``.
    """
    parsed_values = df[column].apply(convert_list_of_dict_str_to_list_of_dict)
    df[column] = parsed_values

In [3]:
from collections import Counter
from sklearn.preprocessing import OneHotEncoder


In [4]:
from sklearn.feature_extraction import FeatureHasher

In [5]:
import holidays
from datetime import timedelta
# One holiday calendar per country code, built on first use and reused afterwards.
_HOLIDAY_CALENDARS = {}


def check_if_is_holiday(dt, country_code):
    """Return whether ``dt`` falls on a holiday of ``country_code``.

    Args:
        dt: Datetime-like object exposing ``.date()``.
        country_code: Country code passed to ``holidays.CountryHoliday``.

    Returns:
        bool: True when ``dt.date()`` is contained in that country's calendar.
    """
    calendar = _HOLIDAY_CALENDARS.get(country_code)
    if calendar is None:
        # The original built a fresh calendar on every call (once per row and
        # country); caching it keeps the same membership test but builds it once.
        calendar = holidays.CountryHoliday(country_code)
        _HOLIDAY_CALENDARS[country_code] = calendar
    return dt.date() in calendar

In [6]:
from functools import partial


In [7]:
# Countries whose holiday calendars are checked against each release date.
country_codes = [
    'CN', 'US', 'KR', 'IN', 'GB', 'FR', 'DE',
    'ES', 'RU', 'AU', 'CA', 'MX', 'BR',
]


In [8]:
# Parse the collection column, then embed each movie's collection name
# ('' when the movie has no collection entry).
parsed_collections = df['belongs_to_collection'].apply(convert_list_of_dict_str_to_list_of_dict)
df['belongs_to_collection'] = parsed_collections

collections = []
for entries in df['belongs_to_collection'].tolist():
    collections.append(entries[0]['name'] if entries else '')

embeddings = multilingual_model.encode(collections)


In [9]:
# Start from an empty (n_rows, 0) array; feature columns are appended cell by cell.
n_rows = df.shape[0]
features = np.empty(shape=(n_rows, 0))

In [10]:
features.shape

(3000, 0)

In [11]:
# Append the current embeddings as new feature columns.
features = np.hstack((features, embeddings))
features.shape

(3000, 768)

In [12]:
df['budget'].values.shape

(3000,)

In [13]:
# Append the budget as a single feature column.
budget_column = df['budget'].values
features = np.column_stack((features, budget_column))
features.shape

(3000, 769)

In [14]:
# Parse the genres column and count genre ids per movie.
df['genres'] = df['genres'].apply(convert_list_of_dict_str_to_list_of_dict)
genres = []
for entries in df['genres']:
    genres.append(Counter(genre['id'] for genre in entries))

# Fit the vectorizer and build the dense genre-count matrix in one pass;
# the fitted vectorizer is kept so it can be reused on other data.
genres_vectorizer = DictVectorizer(dtype=int)
genres_vectors = genres_vectorizer.fit_transform(genres).toarray()

In [15]:
genres_vectors.shape

(3000, 20)

In [16]:
# Append the genre-count columns.
features = np.hstack((features, genres_vectors))
features.shape

(3000, 789)

In [17]:
# The encoder expects a 2-D input: one row per movie, a single column.
original_language = df['original_language'].values.reshape(-1, 1)

# Categories below the min_frequency threshold are grouped as infrequent.
original_language_ohe = OneHotEncoder(
    min_frequency=.005,
    sparse_output=False,
    handle_unknown='infrequent_if_exist',
)
original_language_ohe.fit(original_language)
original_language_ohe.transform(original_language)

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [18]:
# Append the one-hot encoded original language.
original_language_features = original_language_ohe.transform(original_language)
features = np.hstack((features, original_language_features))
features.shape

(3000, 802)

In [19]:
# Embed the original (untranslated) titles.
original_titles = df['original_title'].values
embeddings = multilingual_model.encode(original_titles)
embeddings.shape

(3000, 768)

In [20]:
# Append the current embeddings as new feature columns.
features = np.hstack((features, embeddings))
features.shape

(3000, 1570)

In [21]:
# Missing overviews become empty strings so they can be embedded.
overview_missing = df['overview'].isna()
df['overview'] = df['overview'].mask(overview_missing, '')

In [22]:
# Embed the overview texts.
overview_texts = df['overview'].values
embeddings = multilingual_model.encode(overview_texts)
embeddings.shape

(3000, 768)

In [23]:
# Append the current embeddings as new feature columns.
features = np.hstack((features, embeddings))
features.shape

(3000, 2338)

In [24]:
# Append the popularity score as a single feature column.
popularity_column = df['popularity'].values
features = np.column_stack((features, popularity_column))
features.shape

(3000, 2339)

In [25]:
# Hash the production-company names of each movie into 80 columns.
convert('production_companies')
production_companies = []
for entries in df['production_companies'].tolist():
    production_companies.append([str(company['name']) for company in entries])

feature_hasher = FeatureHasher(n_features=80, input_type="string")
# Original author's note: "can be id".
production_companies_features = feature_hasher.transform(production_companies)
production_companies_features.shape

(3000, 80)

In [26]:
# Densify the hashed sparse matrix and append it.
hashed_dense = production_companies_features.toarray()
features = np.hstack((features, hashed_dense))
features.shape

(3000, 2419)

In [27]:
# Hash the ISO 3166-1 codes of each movie's production countries into 80 columns.
convert('production_countries')
production_country_codes = [
    [str(country['iso_3166_1']) for country in entries]
    for entries in df['production_countries'].tolist()
]
feature_hasher = FeatureHasher(n_features=80, input_type="string")
# The result keeps the name `production_companies_features` because the
# following cell appends that variable to `features`.
production_companies_features = feature_hasher.transform(production_country_codes)
production_companies_features.shape

(3000, 80)

In [28]:
# Densify the hashed sparse matrix and append it.
hashed_dense = production_companies_features.toarray()
features = np.hstack((features, hashed_dense))
features.shape

(3000, 2499)

In [29]:
from dateutil.relativedelta import relativedelta

In [30]:
# Parse 'm/d/yy' release dates and derive day, month, year and weekday columns.
def _shift_back_a_century(ts):
    # Two-digit years that parse as later than 2019 are moved back 100 years.
    if ts.year > 2019:
        return ts - relativedelta(years=100)
    return ts

df['release_date'] = pd.to_datetime(df['release_date'], format='%m/%d/%y')
df["release_date"] = df["release_date"].apply(_shift_back_a_century)

release_dt = df['release_date'].dt
df['release_day'] = release_dt.day
df['release_month'] = release_dt.month
df['release_year'] = release_dt.year
df['release_weekday'] = release_dt.weekday



In [31]:
# Collect the derived calendar columns into one array.
release_columns = ['release_day', 'release_month', 'release_year', 'release_weekday']
release_day_features = df[release_columns].values

In [32]:
release_day_features.shape

(3000, 4)

In [33]:
# Add one is-holiday column per country in `country_codes`.
for country_code in country_codes:
    check_if_is_holiday_in_country = partial(check_if_is_holiday, country_code=country_code)
    holiday_flags = df['release_date'].apply(check_if_is_holiday_in_country).values
    release_day_features = np.column_stack((release_day_features, holiday_flags))



In [34]:
release_day_features.shape

(3000, 17)

In [35]:
# Append the calendar and holiday columns.
features = np.hstack((features, release_day_features))
features.shape

(3000, 2516)

In [36]:
# Impute missing runtimes with the column mean.
# Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
# selection of `df` and does not update `df` under pandas Copy-on-Write.
runtime_mean = df['runtime'].mean()
df['runtime'] = df['runtime'].fillna(runtime_mean)

In [37]:
# Append the runtime as a single feature column.
runtime_column = df['runtime'].values
features = np.column_stack((features, runtime_column))
features.shape

(3000, 2517)

In [38]:
# Hash the ISO 639-1 codes of each movie's spoken languages into 50 columns.
column = 'spoken_languages'

convert(column)
spoken_language_codes = [
    [str(language['iso_639_1']) for language in entries]
    for entries in df[column].tolist()
]
feature_hasher = FeatureHasher(n_features=50, input_type="string")
# The result keeps the name `production_companies_features` because the
# following cell appends that variable to `features`.
production_companies_features = feature_hasher.transform(spoken_language_codes)
production_companies_features.shape

(3000, 50)

In [39]:
# Densify the hashed sparse matrix and append it.
hashed_dense = production_companies_features.toarray()
features = np.hstack((features, hashed_dense))
features.shape

(3000, 2567)

In [40]:
# The encoder expects a 2-D input: one row per movie, a single column.
status = df['status'].values.reshape(-1, 1)

# Categories below the min_frequency threshold are grouped as infrequent.
status_ohe = OneHotEncoder(
    min_frequency=.005,
    sparse_output=False,
    handle_unknown='infrequent_if_exist',
)
status_ohe.fit(status)
status_ohe.transform(status)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [41]:
# Append the one-hot encoded release status.
status_features = status_ohe.transform(status)
features = np.hstack((features, status_features))
features.shape

(3000, 2569)

In [42]:
# Missing taglines become empty strings so they can be embedded.
# Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
# selection of `df` and does not update `df` under pandas Copy-on-Write.
df['tagline'] = df['tagline'].fillna('')

In [43]:
# Embed the taglines and append them.
tagline_embeddings = multilingual_model.encode(df['tagline'].values)
features = np.hstack((features, tagline_embeddings))
features.shape

(3000, 3337)

In [44]:
# Count missing titles before embedding them.
df['title'].isnull().sum()

0

In [45]:
# Embed the titles and append them.
title_embeddings = multilingual_model.encode(df['title'].values)
features = np.hstack((features, title_embeddings))
features.shape

(3000, 4105)

In [46]:
# Embed every keyword of a movie and sum the keyword vectors, one result per movie.
column = 'Keywords'
convert(column)
keyword_names = [
    [str(keyword['name']) for keyword in entries]
    for entries in df[column].tolist()
]
embeddings = [multilingual_model.encode(names).sum(axis=0) for names in keyword_names]

In [47]:
embeddings[0].shape

(768,)

In [48]:
# Vector length used for zero-filled placeholders; equals the (768,) shape of
# embeddings[0] displayed in the previous cell.
l = 768

In [49]:
# Swap every 0-d entry (presumably a movie without keywords, where the sum
# collapsed to a scalar) for a zero vector of length `l`.
embeddings = [np.zeros(l) if len(e.shape) == 0 else e for e in embeddings]

In [50]:
# Stack the per-movie vectors row-wise into a single 2-D array.
embeddings = np.vstack(embeddings)
embeddings.shape

(3000, 768)

In [51]:
# Append the current embeddings as new feature columns.
features = np.hstack((features, embeddings))
features.shape

(3000, 4873)

In [52]:
# Hash the cast member names of each movie into 120 columns.
column = 'cast'

convert(column)
cast_names = [
    [str(member['name']) for member in entries]
    for entries in df[column].tolist()
]
feature_hasher = FeatureHasher(n_features=120, input_type="string")
# The result keeps the name `production_companies_features` because the
# following cell appends that variable to `features`.
production_companies_features = feature_hasher.transform(cast_names)
production_companies_features.shape


(3000, 120)

In [53]:
# Densify the hashed sparse matrix and append it.
hashed_dense = production_companies_features.toarray()
features = np.hstack((features, hashed_dense))
features.shape

(3000, 4993)

In [54]:
# Hash one "name|job|department" token per crew member into 120 columns.
column = 'crew'

convert(column)
crew_tokens = [
    ['|'.join((str(member['name']), str(member['job']), str(member['department']))) for member in entries]
    for entries in df[column].tolist()
]
feature_hasher = FeatureHasher(n_features=120, input_type="string")
# The result keeps the name `production_companies_features` because the
# following cell appends that variable to `features`.
production_companies_features = feature_hasher.transform(crew_tokens)
production_companies_features.shape

(3000, 120)

In [55]:
# Densify the hashed sparse matrix and append it.
hashed_dense = production_companies_features.toarray()
features = np.hstack((features, hashed_dense))
features.shape

(3000, 5113)

In [56]:
# Persist the assembled feature matrix to disk.
np.save(file='features_short_new.npy', arr=features)

In [56]:
# NOTE(review): this always raises ZeroDivisionError. It looks like a deliberate
# stop so that "Run All" does not continue into the cells below - confirm.
0/0

ZeroDivisionError: division by zero

In [None]:
# import mean_squared_log_error
from sklearn.metrics import mean_squared_log_error

In [None]:
df['revenue'].isna().sum()

0

In [None]:
# Count NaN entries in the feature matrix.
nan_mask = np.isnan(features)
nan_mask.sum()

0

In [None]:
# List the (row, column) index pairs of NaN entries.
nan_mask = np.isnan(features)
np.argwhere(nan_mask)

array([], shape=(0, 2), dtype=int64)

In [None]:
# use features for revenue prediction with xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for evaluation, with a fixed seed.
revenue = df['revenue'].values
X_train, X_test, y_train, y_test = train_test_split(
    features, revenue, test_size=0.2, random_state=42
)

# Gradient-boosted regressor on the raw revenue target.
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=1000,
)
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)

# Root mean squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))


: 

: 

In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
# use random forest for revenue prediction
from sklearn.ensemble import RandomForestRegressor

# Shallow random-forest baseline, scored with RMSLE on the held-out rows.
forest_params = dict(max_depth=2, random_state=0, n_estimators=100)
regr = RandomForestRegressor(**forest_params)
regr.fit(X_train, y_train)

preds = regr.predict(X_test)
rmsle(y_test, preds)

3.0697147322334315

Vectorizers

In [None]:
status_ohe

In [None]:
original_language_ohe

NameError: name 'ohe' is not defined

In [None]:
genres_vectorizer

In [58]:
import pickle

In [59]:
# Pickle each fitted vectorizer to its own file.
fitted_vectorizers = {
    'status_ohe_short.pkl': status_ohe,
    'original_language_ohe_short.pkl': original_language_ohe,
    'genres_vectorizer_short.pkl': genres_vectorizer,
}
for pickle_path, vectorizer in fitted_vectorizers.items():
    with open(pickle_path, 'wb') as f:
        pickle.dump(vectorizer, f)


In [None]:
df

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,...,4/22/94,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was supposed to be a routine prisoner trans...,Chasers,"[{'id': 378, 'name': 'prison'}, {'id': 572, 'n...","[{'cast_id': 2, 'character': 'Rock Reilly', 'c...","[{'credit_id': '52fe4494c3a368484e02ac7d', 'de...",1596687
2996,2997,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,...,3/28/13,102.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,We Are the Best!,"[{'id': 1192, 'name': 'sweden'}, {'id': 4470, ...","[{'cast_id': 5, 'character': 'Bobo', 'credit_i...","[{'credit_id': '5716b72ac3a3686678012c84', 'de...",180590
2997,2998,,65000000,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,...,10/11/96,120.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What's forgotten is not always gone.,The Long Kiss Goodnight,"[{'id': 441, 'name': 'assassination'}, {'id': ...","[{'cast_id': 10, 'character': 'Samantha Caine ...","[{'credit_id': '52fe443a9251416c7502d579', 'de...",89456761
2998,2999,,42000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",http://www.alongcamepolly.com/,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,...,1/16/04,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the most cautious man on Earth, life is ab...",Along Came Polly,"[{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...","[{'cast_id': 8, 'character': 'Reuben Feffer', ...","[{'credit_id': '556f817b9251410866000a63', 'de...",171963386
