In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction import DictVectorizer

def convert_list_of_dict_str_to_list_of_dict(x):
    """Parse a stringified Python literal (a list of dicts) into a real object.

    Parameters
    ----------
    x : object
        A single cell value. Strings are parsed as Python literals, lists are
        returned unchanged, and anything else (e.g. NaN for a missing value)
        is treated as "no entries".

    Returns
    -------
    list
        The parsed literal, the input list itself, or ``[]``.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a string that is not a valid Python literal.
    """
    # Local import keeps the function usable without touching the import cell.
    import ast

    if isinstance(x, list):
        # Already parsed: pass it through so that re-running a conversion
        # cell does not replace the whole column with empty lists.
        return x
    if isinstance(x, str):
        # literal_eval accepts only literals, so text read from the CSV can
        # never execute code the way eval() would.
        return ast.literal_eval(x)
    return []

# Alternative checkpoint, left disabled:
# multilingual_model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
# Sentence-embedding model used to encode every free-text column below.
multilingual_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Data to featurize; later cells parse and impute its columns in place.
df = pd.read_csv('data/test.csv')

In [2]:
import pickle

In [3]:
# Load the pre-fitted encoders/vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code; only open files produced by a
# trusted run.
# Encoder applied to df['status'] further down.
with open('status_ohe_short.pkl', 'rb') as f:
    status_ohe = pickle.load(f)

# Encoder applied to df['original_language'] further down.
with open('original_language_ohe_short.pkl', 'rb') as f:
    original_language_ohe = pickle.load(f)

# Vectorizer applied to the per-film genre-id counts further down.
with open('genres_vectorizer_short.pkl', 'rb') as f:
    genres_vectorizer = pickle.load(f)

In [4]:
def convert(column):
    """Replace ``df[column]`` with its parsed form, in place.

    Every value of the column is passed through
    ``convert_list_of_dict_str_to_list_of_dict``. The notebook-global ``df``
    is mutated and nothing is returned.
    """
    parsed_values = df[column].apply(convert_list_of_dict_str_to_list_of_dict)
    df[column] = parsed_values

In [5]:
from collections import Counter
from sklearn.preprocessing import OneHotEncoder


In [6]:
from sklearn.feature_extraction import FeatureHasher

In [7]:
import holidays
from datetime import timedelta
# One calendar object per country code, created on first use and then reused.
_HOLIDAY_CALENDARS = {}


def check_if_is_holiday(dt, country_code):
    """Return whether ``dt`` falls on a public holiday in ``country_code``.

    Parameters
    ----------
    dt : datetime-like
        Any object exposing ``.date()`` (e.g. a pandas ``Timestamp``).
    country_code : str
        Country code accepted by ``holidays.CountryHoliday``.

    Returns
    -------
    bool
        ``True`` when ``dt.date()`` is contained in the country's calendar.
    """
    # A disabled workaround used to return False for 'IN' when dt.year <= 2001.
    calendar = _HOLIDAY_CALENDARS.get(country_code)
    if calendar is None:
        # Constructing a calendar is the expensive step; this function is
        # applied once per row, so build each country's calendar only once.
        calendar = holidays.CountryHoliday(country_code)
        _HOLIDAY_CALENDARS[country_code] = calendar
    return dt.date() in calendar

In [8]:
from functools import partial


In [9]:
# Countries whose holiday calendars are checked against each release date;
# one indicator column is appended per code further down.
country_codes = ['CN', 'US', 'KR', 'IN', 'GB', 'FR', 'DE', 'ES', 'RU', 'AU', 'CA', 'MX', 'BR']


In [10]:
# Parse the stringified collection info into lists of dicts.
df['belongs_to_collection'] = df['belongs_to_collection'].apply(convert_list_of_dict_str_to_list_of_dict)
# Name of the first collection entry, or '' when the list is empty.
collections = [d[0]['name'] if d else '' for d in df['belongs_to_collection'].tolist()]
# One embedding row per film.
embeddings = multilingual_model.encode(collections)


In [11]:
# Create an empty feature matrix: one row per film and zero columns.
# Each feature group below is appended along axis 1.
features = np.empty((len(df), 0))

In [12]:
# Sanity check: show the (rows, columns) of the still-empty feature matrix.
features.shape

(4398, 0)

In [13]:
# Append the collection-name embeddings as new columns.
features = np.concatenate((features, embeddings), axis=1)
features.shape

(4398, 768)

In [14]:
# Budget is a 1-D array here, so it is reshaped to a column before appending.
df['budget'].values.shape

(4398,)

In [15]:
# Append the budget values as a single column (no scaling or imputation here).
features = np.concatenate((features, df['budget'].values.reshape(-1, 1)), axis=1)
features.shape

(4398, 769)

In [16]:
# Parse the stringified genre lists, then count genre ids per film.
df['genres'] = df['genres'].apply(convert_list_of_dict_str_to_list_of_dict)
genres = [Counter([o['id'] for o in d])  for d in df['genres']]

# Vectorize the id -> count mappings with the pre-fitted vectorizer and densify.
genres_vectors = genres_vectorizer.transform(genres).toarray()

In [17]:
# Sanity check: one row per film, one column per genre known to the vectorizer.
genres_vectors.shape

(4398, 20)

In [18]:
# Append the genre count vectors.
features = np.concatenate((features, genres_vectors), axis=1)
features.shape

(4398, 789)

In [19]:
# Override the unknown-category policy on the already-fitted encoder.
# NOTE(review): presumably needed because this data contains languages not
# seen when the encoder was fitted — confirm that changing the attribute after
# fitting is supported by the installed scikit-learn version.
original_language_ohe.handle_unknown = 'infrequent_if_exist'

In [20]:
# The encoder is given a 2-D (n_samples, 1) array, hence the reshape.
original_language = df['original_language'].values
original_language = original_language.reshape(-1, 1)

# Preview only: the result is not stored and is recomputed when appended.
original_language_ohe.transform(original_language)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# Append the encoded original-language columns.
features = np.concatenate((features, original_language_ohe.transform(original_language)), axis=1)
features.shape

(4398, 802)

In [22]:
# Embed the original titles; `embeddings` is overwritten for each text column.
# NOTE(review): no missing-value handling here — assumes 'original_title' has
# no NaN; confirm.
embeddings = multilingual_model.encode(df['original_title'].values)
embeddings.shape

(4398, 768)

In [23]:
# Append the original-title embeddings.
features = np.concatenate((features, embeddings), axis=1)
features.shape

(4398, 1570)

In [24]:
# Missing overviews become empty strings so the encoder only receives text.
df['overview'] = df['overview'].fillna('')

In [25]:
# Embed the overview text (one row per film).
embeddings = multilingual_model.encode(df['overview'].values)
embeddings.shape

(4398, 768)

In [26]:
# Append the overview embeddings.
features = np.concatenate((features, embeddings), axis=1)
features.shape

(4398, 2338)

In [27]:
# Append the popularity values as a single column.
features = np.concatenate((features, df['popularity'].values.reshape(-1, 1)), axis=1)
features.shape

(4398, 2339)

In [28]:
# Parse the column, then collect each film's production-company names.
convert('production_companies')
production_companies = [[str(o['name']) for o in d]  for d in df['production_companies'].tolist()]
# Hash the name tokens into 80 columns; only transform() is called, so no
# fitted artifact has to be loaded for this feature group.
feature_hasher = FeatureHasher(n_features=80, input_type="string")
production_companies_features = feature_hasher.transform(production_companies)  # can be id
production_companies_features.shape

(4398, 80)

In [29]:
# Densify and append the hashed production-company columns.
features = np.concatenate((features, production_companies_features.toarray()), axis=1)
features.shape

(4398, 2419)

In [30]:
# Parse the column, then collect each film's production-country ISO codes.
# NOTE: the `production_companies*` names are reused here even though they now
# hold country data; the append cell that follows reads them under these names.
convert('production_countries')
production_companies = [[str(o['iso_3166_1']) for o in d]  for d in df['production_countries'].tolist()]
# Hash the country codes into 80 columns.
feature_hasher = FeatureHasher(n_features=80, input_type="string")
production_companies_features = feature_hasher.transform(production_companies)  # can be id
production_companies_features.shape

(4398, 80)

In [31]:
# Densify and append the hashed production-country columns.
features = np.concatenate((features, production_companies_features.toarray()), axis=1)
features.shape

(4398, 2499)

In [32]:
# Parse release dates. The source format has a two-digit year ('%m/%d/%y'),
# so some dates are parsed into the wrong century (corrected below).
df['release_date'] = pd.to_datetime(df['release_date'], format='%m/%d/%y')
# Impute missing dates with the mean date.
# NOTE(review): the mean is taken before the century correction below, so
# mis-parsed future dates pull it later — confirm this is intended.
df['release_date'] = df['release_date'].fillna(df['release_date'].mean())

# Dates parsed into a year after 2019 are moved back exactly one century.
# pd.DateOffset(years=100) keeps the day and month intact; subtracting
# 365*100 days is short by the leap days in between and therefore shifted the
# day, month and weekday of every corrected date.
# NOTE(review): if a model was trained on features built with the old
# 36500-day shift, regenerate those features the same way.
df["release_date"] = df["release_date"].apply(
    lambda x: x - pd.DateOffset(years=100) if x.year > 2019 else x
)

# Split the date into calendar components (weekday: Monday=0 ... Sunday=6).
df['release_day'] = df['release_date'].dt.day
df['release_month'] = df['release_date'].dt.month
df['release_year'] = df['release_date'].dt.year
df['release_weekday'] = df['release_date'].dt.weekday



In [33]:
# Start the date feature block with the four calendar components.
release_day_features = df[['release_day', 'release_month', 'release_year', 'release_weekday']].values

In [34]:
# Sanity check: four date columns before the holiday indicators are added.
release_day_features.shape

(4398, 4)

In [35]:
# Build one holiday-indicator column per country code, then append them all
# at once in the same order as `country_codes`.
holiday_columns = []
for country_code in country_codes:
    # Bind the country so the callable takes only the release date.
    check_if_is_holiday_in_country = partial(check_if_is_holiday, country_code=country_code)
    flags = df['release_date'].apply(check_if_is_holiday_in_country)
    holiday_columns.append(flags.values.reshape(-1, 1))
release_day_features = np.concatenate((release_day_features, *holiday_columns), axis=1)



In [36]:
# Sanity check: four date columns plus one holiday column per country code.
release_day_features.shape

(4398, 17)

In [37]:
# Append the date components and holiday indicators.
features = np.concatenate((features, release_day_features), axis=1)
features.shape

(4398, 2516)

In [38]:
# Impute missing runtimes with the column mean. The result is assigned back
# instead of calling fillna(..., inplace=True) on the selected column: the
# in-place form acts on the object returned by df['runtime'], which is not
# guaranteed to write through to df (pandas warns about this chained pattern).
# Assigning back also matches how 'overview' and 'release_date' are filled.
df['runtime'] = df['runtime'].fillna(df['runtime'].mean())

In [39]:
# Append the (imputed) runtime as a single column.
features = np.concatenate((features, df['runtime'].values.reshape(-1, 1)), axis=1)
features.shape

(4398, 2517)

In [40]:
# Hash each film's spoken-language codes into 50 columns.
column = 'spoken_languages'

convert(column)
# NOTE: the `production_companies*` names are reused for language data; the
# append cell that follows reads them under these names.
production_companies = [[str(o['iso_639_1']) for o in d]  for d in df[column].tolist()]
feature_hasher = FeatureHasher(n_features=50, input_type="string")
production_companies_features = feature_hasher.transform(production_companies)  # can be id
production_companies_features.shape

(4398, 50)

In [41]:
# Densify and append the hashed spoken-language columns.
features = np.concatenate((features, production_companies_features.toarray()), axis=1)
features.shape

(4398, 2567)

In [42]:
# The encoder is given a 2-D (n_samples, 1) array, hence the reshape.
status = df['status'].values
status = status.reshape(-1, 1)
# Override the unknown-category policy on the already-fitted encoder.
# NOTE(review): confirm that changing this attribute after fitting is
# supported by the installed scikit-learn version.
status_ohe.handle_unknown = 'infrequent_if_exist'
# Preview only: the result is not stored and is recomputed when appended.
status_ohe.transform(status)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [43]:
# Append the encoded status columns.
features = np.concatenate((features, status_ohe.transform(status)), axis=1)
features.shape

(4398, 2569)

In [44]:
# Missing taglines become empty strings so the encoder only receives text.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column, which is not guaranteed to write through to df; this
# also matches how 'overview' is filled.
df['tagline'] = df['tagline'].fillna('')

In [45]:
# Embed the taglines and append the embeddings in one step.
features = np.concatenate((features, multilingual_model.encode(df['tagline'].values)), axis=1)
features.shape

(4398, 3337)

In [46]:
# Count missing titles before they are embedded.
df['title'].isna().sum()

3

In [47]:
# Some titles are missing, so fill them with '' before encoding — the same
# treatment 'overview' and 'tagline' receive. Otherwise float NaN values are
# handed to the text encoder instead of strings. df itself is left unchanged.
titles = df['title'].fillna('').values
features = np.concatenate((features, multilingual_model.encode(titles)), axis=1)
features.shape

(4398, 4105)

In [48]:
# Embed each film's keyword names and sum the keyword embeddings into one
# vector per film.
column = 'Keywords'
convert(column)
embeddings = []
# Generator of per-film keyword-name lists (consumed once by the loop below).
keywords = ([str(o['name']) for o in d]  for d in df[column].tolist())
for keyword in keywords:
    # `keyword` is the list of names for one film; the model is called once
    # per film. NOTE(review): films with no keywords pass an empty list —
    # the result for those is patched in a later cell; confirm its shape.
    embeddings.append(multilingual_model.encode(keyword).sum(axis=0))

In [49]:
# Inspect the length of one summed keyword vector.
embeddings[0].shape

(768,)

In [50]:
# Length of the zero vector substituted for films without a usable keyword
# embedding; matches the summed-vector length shown in the previous cell.
l = 768

In [51]:
# Replace 0-dimensional results (no vector was produced for that film) with a
# zero vector so every entry has the same shape and can be stacked.
for i, e in enumerate(embeddings):
    if len(e.shape) == 0:
        embeddings[i] = np.zeros(l)

In [52]:
# Stack the per-film vectors into one 2-D array (one row per film).
embeddings = np.stack(embeddings)
embeddings.shape

(4398, 768)

In [53]:
# Append the summed keyword embeddings.
features = np.concatenate((features, embeddings), axis=1)
features.shape

(4398, 4873)

In [54]:
# Hash each film's cast member names into 120 columns.
column = 'cast'

convert(column)
# NOTE: the `production_companies*` names are reused for cast data; the
# append cell that follows reads them under these names.
production_companies = [[str(o['name']) for o in d]  for d in df[column].tolist()]
feature_hasher = FeatureHasher(n_features=120, input_type="string")
production_companies_features = feature_hasher.transform(production_companies)
production_companies_features.shape


(4398, 120)

In [55]:
# Densify and append the hashed cast columns.
features = np.concatenate((features, production_companies_features.toarray()), axis=1)
features.shape

(4398, 4993)

In [56]:
# Hash each film's crew entries into 120 columns.
column = 'crew'

convert(column)
# Each token is "name|job|department", so the same person in different roles
# produces different tokens. The `production_companies*` names are reused for
# crew data; the append cell that follows reads them under these names.
production_companies = [[str(o['name'])+'|'+str(o['job'])+'|'+str(o['department']) for o in d]  for d in df[column].tolist()]
feature_hasher = FeatureHasher(n_features=120, input_type="string")
production_companies_features = feature_hasher.transform(production_companies)
production_companies_features.shape

(4398, 120)

In [57]:
# Densify and append the hashed crew columns (last feature group).
features = np.concatenate((features, production_companies_features.toarray()), axis=1)
features.shape

(4398, 5113)

In [58]:
# Save the assembled feature matrix to disk in NumPy's .npy format.
np.save('evaluation_features_short.npy', features)

In [59]:
# Always raises ZeroDivisionError. NOTE(review): presumably a deliberate stop
# so that "Run All" halts before the cells that follow — confirm, and consider
# removing it from the final notebook.
0/0

ZeroDivisionError: division by zero

In [None]:
# Load the trained model.
# NOTE: pickle.load can execute arbitrary code; only load a model file you trust.
# The context manager closes the file handle once the model is loaded, in line
# with how the other pickle files in this notebook are opened.
with open('finalized_model.sav', 'rb') as f:
    regr = pickle.load(f)

In [None]:
# Display the loaded model object.
regr

In [None]:
# import mean_squared_log_error
from sklearn.metrics import mean_squared_log_error

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computed as the square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
# Predict with the loaded model on the assembled feature matrix.
# NOTE(review): no RMSLE is computed in this cell, only predictions.
y_pred = regr.predict(features)

NameError: name 'y' is not defined

In [None]:
# Two-column array: film id next to its predicted value.
result = np.concatenate( (df[['id']].values, y_pred.reshape(-1, 1)), axis=1)

In [None]:
# Display the (id, prediction) array.
result

array([[3.00100000e+03, 1.99192289e+07],
       [3.00200000e+03, 8.19730074e+06],
       [3.00300000e+03, 1.41664128e+07],
       ...,
       [7.39600000e+03, 4.66179077e+07],
       [7.39700000e+03, 2.81350491e+07],
       [7.39800000e+03, 1.53086016e+07]])

In [None]:
# Save the result to CSV with the header "id,revenue".
# fmt='%d' writes both columns as integers; comments='' keeps the header line
# free of the default '# ' prefix.
np.savetxt('result.csv', result, delimiter=',', header='id,revenue', comments='', fmt='%d')

: 

In [None]:
# Always raises ZeroDivisionError. NOTE(review): presumably a deliberate stop
# so that "Run All" halts before the cells that follow — confirm, and consider
# removing it from the final notebook.
0/0

ZeroDivisionError: division by zero

In [None]:
# Save the feature matrix under a second file name.
# NOTE(review): the same array was already written to
# 'evaluation_features_short.npy' earlier — confirm both files are needed.
np.save('features.npy', features)

In [None]:
# import mean_squared_log_error
from sklearn.metrics import mean_squared_log_error

In [None]:
# Count missing target values.
# NOTE(review): requires a 'revenue' column in df — confirm the CSV loaded at
# the top of this notebook has one.
df['revenue'].isna().sum()

In [None]:
# Total number of NaN entries in the feature matrix.
np.isnan(features).sum()

In [None]:
# (row, column) index pairs of every NaN entry in the feature matrix.
np.argwhere(np.isnan(features))

In [None]:
# use features for revenue prediction with xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
# NOTE(review): requires a 'revenue' column in df — confirm it is present.
X_train, X_test, y_train, y_test = train_test_split(features, df['revenue'].values, test_size=0.2, random_state=42)

# Gradient-boosted regressor trained with a squared-error objective.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 1000)

xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)

# Root mean squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("RMSE: %f" % (rmse))


In [None]:
# get the root mean squared logarithmic error
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computed as the square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
# use random forest for revenue prediction
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest (depth 2, 100 trees) with a fixed seed.
# NOTE: this rebinds `regr`, the name also used for the model loaded from
# 'finalized_model.sav' earlier in the notebook.
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)

# Uses the train/test split created in the XGBoost cell.
regr.fit(X_train, y_train)

preds = regr.predict(X_test)

# RMSLE on the held-out rows, displayed as the cell output.
rmsle(y_test, preds)

Vectorizers

In [None]:
# Display the status encoder.
status_ohe

In [None]:
# Display the original-language encoder.
original_language_ohe

In [None]:
# Display the genres vectorizer.
genres_vectorizer

In [None]:
import pickle

In [None]:
# Save the encoders/vectorizer to disk.
# NOTE: these file names lack the `_short` suffix used by the load cell near
# the top, and both encoders had `handle_unknown` reassigned earlier in this
# notebook, so the saved objects carry that change.
with open('status_ohe.pkl', 'wb') as f:
    pickle.dump(status_ohe, f)

with open('original_language_ohe.pkl', 'wb') as f:
    pickle.dump(original_language_ohe, f)

with open('genres_vectorizer.pkl', 'wb') as f:
    pickle.dump(genres_vectorizer, f)


In [None]:
# Display df as it stands after the in-place transformations above.
df