# Import necessary libraries

In [1]:
import json
import math
from itertools import groupby

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from py_files.load_original_data import load_original_data
from py_files.writer_director_to_one_hot import writer_director_to_one_hot
from py_files.add_merge_begin_end_year import merge_start_end_year
from py_files.load_box_office_data import load_and_aggregate_box_office
from py_files.add_remake_feature import create_remake_column
from py_files.add_langoriginaltitle_feature import add_language_of_original_title
from py_files.add_ENvsNonEN_feature import add_english_title_or_not
from py_files.add_movie_genre_feature import add_movie_genre
from py_files.df_processor_enrichment import df_processor_enrichment
from py_files.df_model_prep import df_model_prep
from py_files.d2v_embed import d2v_embed

# Loading the data

In [2]:
# Load the raw training data through the project helper (imported with the
# other py_files helpers in the import cell); it prints the CSV files it found.
df_original = load_original_data()

Found files: train-1.csv, train-2.csv, train-3.csv, train-4.csv, train-5.csv, train-6.csv, train-7.csv, train-8.csv, train_df_with_features_fully_processed_read_for_model.csv


# Preprocessing of original columns

In [3]:
# Turn the literal "\N" placeholder into NaN. `replace` returns a new frame,
# so `df_original` stays untouched without a separate deep copy.
df_preprocessed = df_original.replace("\\N", np.nan)

# Build a normalised title key used for joining external data:
# lower-case, strip accents (NFKD + ASCII), spaces -> "_", then drop every
# remaining non-word character. The pattern is a raw string so that `\W`
# is not parsed as an (invalid) string escape sequence.
df_preprocessed["primaryTitleFormatted"] = (
    df_preprocessed["primaryTitle"]
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(" ", "_", regex=True)
    .str.replace(r"\W", "", regex=True)
)

# merge endYear into beginYear when beginYear is not available --> rename Year
df_preprocessed = merge_start_end_year(df_preprocessed)

# set the datatypes of the dataframe correctly
df_preprocessed['Year'] = df_preprocessed['Year'].astype(int)
df_preprocessed['runtimeMinutes'] = df_preprocessed['runtimeMinutes'].astype(float)

df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15918 entries, 0 to 15917
Columns: 18587 entries, tconst to primaryTitleFormatted
dtypes: bool(1), float64(18577), int32(1), object(8)
memory usage: 2.2+ GB


## Preprocessing of exogenous data

### Oscar data

In [None]:
oscars = pd.read_csv("additional_data/oscars.csv")

# Normalise the film title exactly like `primaryTitleFormatted` so the two can
# be joined. The regex is a raw string so `\W` is not an invalid string escape.
oscars["film"] = (
    oscars["film"]
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(" ", "_", regex=True)
    .str.replace(r"\W", "", regex=True)
)

# Counting oscar nominations and wins per movie.
# The join is computed once and reused for both aggregates.
# NOTE(review): the join key is the normalised title only (no year), so
# different films sharing a title would share Oscar rows — confirm this is acceptable.
winner_by_movie = pd.merge(
    df_preprocessed,
    oscars,
    left_on="primaryTitleFormatted",
    right_on="film",
).groupby("tconst")["winner"]

oscar_noms = winner_by_movie.count()  # non-null `winner` entries per movie
oscar_wins = winner_by_movie.sum()    # sum of the `winner` column per movie

### Writer and Director data

In [None]:
# Build the one-hot frames for writers and directors, add them together and
# keep only the rows for the movies in `df_preprocessed` (in that order).
writers_one_hot = writer_director_to_one_hot("writers")
directors_one_hot = writer_director_to_one_hot("directors")

# NOTE(review): `+` aligns on index and columns; any cell that exists in only
# one of the two frames becomes NaN and is then filled with 0. Confirm both
# helpers return identically labelled frames, otherwise writer-only or
# director-only credits are zeroed here.
combined_one_hot = (writers_one_hot + directors_one_hot).fillna(0).astype(int)
written_and_directed = combined_one_hot.loc[df_preprocessed['tconst']]

### TMDB data

### Box Office data

In [None]:
df_box_office_mojo = load_and_aggregate_box_office()

# process the 'release group' (read movie title) in the same way as the formatted title;
# the regex is a raw string so `\W` is not an invalid string escape sequence
df_box_office_mojo["Release Group"] = (
    df_box_office_mojo["Release Group"]
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(" ", "_", regex=True)
    .str.replace(r"\W", "", regex=True)
)

# the two percentage columns are not used as features
df_box_office_mojo = df_box_office_mojo.drop(columns=['%', '%.1'])

# Adding of exogenous columns

In [None]:
# Start the enriched frame from the preprocessed one, indexed by movie id.
# `rename` and `set_index` each return a new frame, so `df_preprocessed` is
# left untouched without first deep-copying the (multi-GB) frame.
df_incl_exog = df_preprocessed.rename(columns={"tconst": "id"}).set_index("id")
df_incl_exog.info()

## add oscar data

In [None]:
# Attach the per-movie Oscar aggregates. Column assignment aligns on the
# index, so movies without a matching Oscar row get NaN.
for column_name, per_movie_counts in (("oscar_noms", oscar_noms), ("oscar_wins", oscar_wins)):
    df_incl_exog[column_name] = per_movie_counts

## add mojo box office

In [None]:
def _money_to_float(amount):
    """Convert a string such as '$1,234' to a float by stripping '$' and ','."""
    return float(amount.replace('$', '').replace(',', ''))


# Left-join the box office figures on (normalised title, year); the index is
# reset for the merge and restored afterwards, then the join keys are dropped.
merged_with_box_office = df_incl_exog.reset_index().merge(
    df_box_office_mojo,
    how="left",
    left_on=['primaryTitleFormatted', 'Year'],
    right_on=['Release Group', 'year'],
)
df_incl_exog = merged_with_box_office.set_index('id').drop(columns=['Release Group', 'year'])

# For each revenue column: treat '-' as missing, then parse the remaining
# strings into floats.
for money_col in ('Worldwide', 'Domestic', 'Foreign'):
    df_incl_exog.loc[df_incl_exog[money_col] == '-', money_col] = np.nan
    has_amount = df_incl_exog[money_col].notnull()
    df_incl_exog.loc[has_amount, money_col] = df_incl_exog.loc[has_amount, money_col].apply(_money_to_float)

## add remake column

In [None]:
# Pass the working frame through the project helper and keep its return value.
# NOTE(review): presumably adds a remake indicator column — confirm in
# py_files/add_remake_feature.py.
df_incl_exog = create_remake_column(df_incl_exog)

## add title language

In [None]:
# Computing the title language with the helper below takes ~15 minutes, so it
# stays commented out and the precomputed result is loaded from disk instead.
# df_incl_exog = add_language_of_original_title(df_incl_exog)

# Load the precomputed languages, index them by movie id and left-join the
# `title_language` column onto the working frame.
df_added_lang = (
    pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
    .rename(columns={"tconst": "id"})
    .set_index("id")
)
title_language_by_id = df_added_lang['title_language']
df_incl_exog = df_incl_exog.join(title_language_by_id, how='left')

## add whether title is English or not

In [None]:
# Pass the working frame through the project helper and keep its return value.
# NOTE(review): presumably derives an English / non-English title flag —
# confirm in py_files/add_ENvsNonEN_feature.py.
df_incl_exog = add_english_title_or_not(df_incl_exog)

## add movie genres

In [None]:
# Pass the working frame through the project helper and keep its return value.
# NOTE(review): presumably adds a `genres` column (df_model_prep later embeds
# one) — confirm in py_files/add_movie_genre_feature.py.
df_incl_exog = add_movie_genre(df_incl_exog)

## add writers and directors

In [None]:
# Append the writer/director one-hot columns next to the existing features.
# Concatenating along axis=1 aligns rows on the index directly and avoids
# transposing the very wide mixed-dtype frame twice, which would upcast every
# column to object dtype and is slow and memory hungry.
df_incl_exog = pd.concat([df_incl_exog, written_and_directed], axis=1)

## add TMDB data

## save dataframe with features

In [None]:
# Persist the enriched frame (index included); the "Preparing data for
# classifier" section reads this file back instead of using the in-memory frame.
df_incl_exog.to_csv('df_with_features.csv')

# Preparing data for classifier

Convert non-numeric columns to numeric.
We use Doc2Vec to embed each free-text column (the title variants and `genres`) into an n-by-128 array; `title_language` is integer-encoded instead.

In [5]:
# Reload the enriched training features from disk, using the first CSV column
# (the movie id written as the index) as the index again.
train_df = pd.read_csv('df_with_features.csv', index_col=0)

In [6]:
# Turn the enriched frame into a purely numeric model input. Per the printed
# message (and the reference copy of df_model_prep further down), the helper
# first looks for a cached "train_df_with_features_fully_processed_read_for_model.csv"
# and only recomputes the embeddings when that file is missing.
train_df_prepped = df_model_prep(train_df,'train')
train_df_prepped.head()

Looking for pre made file...


Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label,Year,oscar_noms,oscar_wins,Rank,Worldwide,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0010600,1919.0,1919.0,66.0,1898.0,True,1919,0.0,0.0,,,...,0.097997,-0.010395,0.080123,0.055377,-0.034714,0.169179,0.108478,-0.043804,0.185851,-0.032848
tt0011841,1920.0,1920.0,145.0,5376.0,True,1920,0.0,0.0,,,...,0.010011,-0.131253,-0.061501,0.061952,-0.074915,0.158705,0.120298,0.032335,-0.04131,0.004742
tt0012494,1921.0,1921.0,97.0,5842.0,True,1921,0.0,0.0,,,...,-0.000573,-0.002037,0.001061,0.002554,-0.000353,-0.001539,0.00095,0.001109,0.002473,0.002893
tt0015163,1924.0,1924.0,59.0,9652.0,True,1924,0.0,0.0,,,...,-0.002003,-0.00359,0.002809,-0.001703,-0.001013,-0.000995,0.000765,0.001049,0.003256,0.002087
tt0016220,1925.0,1925.0,93.0,17887.0,True,1925,3.0,0.0,,,...,-0.001075,-0.002395,0.002726,0.003161,-0.002507,-0.002018,-0.002822,0.000494,-0.001881,0.001507


In [None]:
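# Source of `df_model_prep` (imported from py_files.df_model_prep), reproduced as comments for demonstration purposes only; this cell executes nothing.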
#
# from py_files.d2v_embed import d2v_embed
# import pandas as pd
# import math

# def df_model_prep(df, filename):
    
#     try:
#         print("Looking for pre made file...")
#         return pd.read_csv(f"{filename}_df_with_features_fully_processed_read_for_model.csv", index_col = 0)
#     except:
#         print("No file found, creating a new one")
    
#     prim_title_df = d2v_embed(df['primaryTitle'])
#     orig_title_df = d2v_embed(df['originalTitle'])
#     prim_title_formatted_df = d2v_embed(df['primaryTitleFormatted'])
#     title_formatted_df = d2v_embed(df['titleFormatted'])
#     genres_df = d2v_embed(df['genres'])

#     # just encode languages into ints for this column
#     df['title_language'] = pd.factorize(df['title_language'])[0]

#     df.drop(columns = df.select_dtypes(include='object').columns, inplace=True)

#     # dealing with (some) nan values
#     for index, row in df.iterrows():
#         # For missing startYear or endYear entries, insert the other, if it exists.
#         if math.isnan(row['startYear']):
#             if not math.isnan(row['endYear']):
#                 df.at[index,'startYear']=df.at[index,'endYear']
#         if math.isnan(row['endYear']):
#             if not math.isnan(row['startYear']):
#                 df.at[index,'endYear']=df.at[index,'startYear']

#         # For missing oscar_noms and oscar_wins, insert 0
#         if math.isnan(row['oscar_noms']):
#             df.at[index,'oscar_noms'] = 0
#         if math.isnan(row['oscar_wins']):
#             df.at[index,'oscar_wins'] = 0

#     df['numVotes'] = df['numVotes'].fillna(df['numVotes'].mean(skipna=True))
#     df['runtimeMinutes'] = df['runtimeMinutes'].fillna(df['runtimeMinutes'].mean(skipna=True))
    
#     df['title_language'] = pd.factorize(df['title_language'])[0]
    
#     df = df.join(prim_title_df)
#     df = df.join(orig_title_df)
#     df = df.join(prim_title_formatted_df)
#     df = df.join(title_formatted_df)
#     df = df.join(genres_df)
    
#     df.to_csv(f"{filename}_df_with_features_fully_processed_read_for_model.csv")
    
#     return df

In [None]:
# Source of `d2v_embed` (imported from py_files.d2v_embed), reproduced as comments for demonstration purposes only; this cell executes nothing.
# 
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from nltk.tokenize import word_tokenize
# import multiprocessing as mp
# from tqdm import tqdm
# import pandas as pd
# import math

# def d2v_embed(df_col, max_epochs = 100, vec_size = 128, alpha = 0.025):
    
#     df_col = df_col.fillna(" ")
#     df_col = df_col.str.lower()\
#                    .str.normalize('NFKD')\
#                    .str.encode('ascii', errors='ignore')\
#                    .str.decode('utf-8')\
#                    .str.replace("\W", " ", regex=True)
    
#     tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(df_col)]

#     model = Doc2Vec(vector_size=vec_size,
#                     alpha=alpha, 
#                     min_alpha=0.00025,
#                     min_count=1,
#                     dm =1,
#                     workers = mp.cpu_count())
  
#     model.build_vocab(tagged_data)

#     for epoch in tqdm(range(max_epochs)):
#     #     print('iteration {0}'.format(epoch))
#         model.train(tagged_data,
#                     total_examples=model.corpus_count,
#                     epochs=model.epochs)
#         # decrease the learning rate
#         model.alpha -= 0.0002
#         # fix the learning rate, no decay
#         model.min_alpha = model.alpha
    
#     # save model
#     model.save(f"doc2vec_model_{df_col.name}.model")
    
#     #return df with doc embeddings
#     return pd.DataFrame([model.docvecs[i] for i in range(len(df_col))], 
#                         index = df_col.index,
#                         columns = [f"{df_col.name}_{i}" for i in range(vec_size)])

In [None]:
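# Source of `df_processor_enrichment` (imported from py_files.df_processor_enrichment), reproduced as comments for demonstration purposes only; this cell executes nothing.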
# 
# import json
# import numpy as np
# import pandas as pd
# from itertools import groupby

# from py_files.writer_director_to_one_hot import writer_director_to_one_hot
# from py_files.add_merge_begin_end_year import merge_start_end_year
# from py_files.load_box_office_data import load_and_aggregate_box_office
# from py_files.add_remake_feature import create_remake_column
# from py_files.add_langoriginaltitle_feature import add_language_of_original_title
# from py_files.add_ENvsNonEN_feature import add_english_title_or_not
# from py_files.add_movie_genre_feature import add_movie_genre

# from py_files.d2v_embed import d2v_embed
# from sklearn.model_selection import train_test_split
# import lightgbm as lgb
# from sklearn.metrics import accuracy_score
# import math

# def df_processor_enrichment(filename):
    
#     try:
#         print("Looking for pre made file...")
#         return pd.read_csv(f"{filename}_df_with_features.csv", index_col = 0)
#     except:
#         print("File not found, creating a new one..")
              
#     df_original = pd.read_csv(filename, index_col=0)
#     # df_original.head()

#     # start the preprocessing
#     df_preprocessed = df_original.replace("\\N", np.nan)
#     df_preprocessed["primaryTitleFormatted"] = df_preprocessed["primaryTitle"].str.lower()\
#                                                                               .str.normalize('NFKD')\
#                                                                               .str.encode('ascii', errors='ignore')\
#                                                                               .str.decode('utf-8')\
#                                                                               .str.replace(" ", "_", regex=True)\
#                                                                               .str.replace("\W", "", regex=True)

#     # merge endYear into beginYear when beginYear is not available --> rename Year
#     df_preprocessed = merge_start_end_year(df_preprocessed)

#     # set the datatypes of the dataframe correctly
#     df_preprocessed['Year'] = df_preprocessed['Year'].astype(int)
#     df_preprocessed['runtimeMinutes'] = df_preprocessed['runtimeMinutes'].astype(float)

#     # df_preprocessed.info()


#     oscars = pd.read_csv("additional_data/oscars.csv")

#     oscars["film"] = oscars["film"].str.lower()\
#                                    .str.normalize('NFKD')\
#                                    .str.encode('ascii', errors='ignore')\
#                                    .str.decode('utf-8')\
#                                    .str.replace(" ", "_", regex=True)\
#                                    .str.replace("\W", "", regex=True)

#     # Counting oscar nominations and wins per movie
#     oscar_noms = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].count()
#     oscar_wins = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].sum()


#     # Find writers and directors per movie and combine the two
#     written_and_directed = (writer_director_to_one_hot("writers") + writer_director_to_one_hot("directors")).fillna(0).astype(int).loc[df_preprocessed['tconst']]


#     df_box_office_mojo = load_and_aggregate_box_office()

#     # process the 'release group' (read movie title) in the same way as the formatted title
#     df_box_office_mojo["Release Group"] = df_box_office_mojo["Release Group"].str.lower()\
#                                            .str.normalize('NFKD')\
#                                            .str.encode('ascii', errors='ignore')\
#                                            .str.decode('utf-8')\
#                                            .str.replace(" ", "_", regex=True)\
#                                            .str.replace("\W", "", regex=True)
#     df_box_office_mojo.drop(['%', '%.1'], axis=1, inplace=True)


#     df_incl_exog = df_preprocessed.copy(deep=True)
#     df_incl_exog = df_incl_exog.rename({"tconst" : "id"}, axis = 1).set_index("id")
#     # df_incl_exog.info()


#     df_incl_exog["oscar_noms"] = oscar_noms
#     df_incl_exog["oscar_wins"] = oscar_wins

#     df_incl_exog = df_incl_exog.reset_index().merge(df_box_office_mojo, left_on=['primaryTitleFormatted', 'Year'], right_on=['Release Group', 'year'], how="left").set_index('id')
#     df_incl_exog.drop(['Release Group', 'year'], axis=1, inplace=True)

#     df_incl_exog.loc[df_incl_exog['Worldwide'] == '-', 'Worldwide'] = np.nan
#     df_incl_exog.loc[df_incl_exog['Domestic'] == '-', 'Domestic'] = np.nan
#     df_incl_exog.loc[df_incl_exog['Foreign'] == '-', 'Foreign'] = np.nan
#     df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'] = df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
#     df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'] = df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
#     df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'] = df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'].apply(lambda x: float(x.replace('$', '').replace(',', '')))


#     df_incl_exog = create_remake_column(df_incl_exog)

#     # # add the language of the original title, currently commented for training data usage and not wait 15 min every time
#     # df_incl_exog = add_language_of_original_title(df_incl_exog)

#     df_added_lang = pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
#     df_added_lang = df_added_lang.rename({"tconst" : "id"}, axis = 1).set_index("id")
#     df_incl_exog = df_incl_exog.join(df_added_lang['title_language'], how='left')

#     df_incl_exog = add_english_title_or_not(df_incl_exog)
#     df_incl_exog = add_movie_genre(df_incl_exog)
#     df_incl_exog = pd.concat([df_incl_exog.T, written_and_directed.T]).T
#     df_incl_exog.to_csv(f"{filename}_df_with_features.csv")
    
#     return df_incl_exog

# Training the classifier

In [8]:
# Hyper-parameters of the gradient-boosted binary classifier.
lgbm_params = dict(
    objective='binary',
    learning_rate=0.01,
    num_iterations=1000,
    feature_fraction=0.8,
    verbosity=1,
    random_state=17,
)
model_lgbm = lgb.LGBMClassifier(**lgbm_params)

# Every column except `label` is a feature; `label` is the target.
is_feature_column = train_df_prepped.columns != 'label'
train_features = train_df_prepped.loc[:, is_feature_column]
train_labels = train_df_prepped['label']

# NOTE(review): `eval_metric` is passed without an `eval_set`, so presumably
# nothing is evaluated during fitting — confirm whether a hold-out set was intended.
model_lgbm.fit(train_features, train_labels, eval_metric='logloss')



[LightGBM] [Info] Number of positive: 3990, number of negative: 3969
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165089
[LightGBM] [Info] Number of data points in the train set: 7959, number of used features: 674
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501319 -> initscore=0.005277
[LightGBM] [Info] Start training from score 0.005277


LGBMClassifier(feature_fraction=0.8, learning_rate=0.01, num_iterations=1000,
               objective='binary', random_state=17, verbosity=1)

# Predicting

## Add and process validation and test data

In [9]:
# Enrich the hidden validation split with the project helper; per its printed
# message it first looks for a pre-made feature file before recomputing.
valid_df = df_processor_enrichment('validation_hidden.csv')
valid_df.head()

Looking for pre made file...


Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,primaryTitleFormatted,Year,oscar_noms,oscar_wins,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0003740,Cabiria,,1914.0,,148.0,3452.0,cabiria,1914,,,...,0,0,0,0,0,0,0,0,0,0
tt0008663,A Man There Was,Terje Vigen,1917.0,,65.0,1882.0,a_man_there_was,1917,,,...,0,0,0,0,0,0,0,0,0,0
tt0010307,J'accuse!,,1919.0,,166.0,1692.0,jaccuse,1919,,,...,0,0,0,0,0,0,0,0,0,0
tt0014429,Safety Last!,Safety Last!,1923.0,,74.0,19898.0,safety_last,1923,,,...,0,0,0,0,0,0,0,0,0,0
tt0015175,Die Nibelungen: Siegfried,,1924.0,,143.0,5676.0,die_nibelungen_siegfried,1924,,,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Convert the enriched validation frame into numeric model input, cached under
# the 'valid' prefix.
# NOTE(review): per the reference copy of d2v_embed in this notebook, a fresh
# Doc2Vec model is trained on every call, so these embedding columns may not
# share a vector space with the training embeddings — confirm.
valid_df_prepped = df_model_prep(valid_df, 'valid')
valid_df_prepped.head()

Looking for pre made file...


Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,Year,oscar_noms,oscar_wins,Rank,Worldwide,Domestic,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0003740,1914.0,1914.0,148.0,3452.0,1914,0.0,0.0,,,,...,0.094835,-0.324148,0.14286,0.071241,0.081461,0.219238,-0.273879,0.077414,0.011232,0.020276
tt0008663,1917.0,1917.0,65.0,1882.0,1917,0.0,0.0,,,,...,-0.00199,0.000379,-0.001504,-0.001444,-0.00088,-0.000938,-0.002971,0.002663,-0.002288,-0.000535
tt0010307,1919.0,1919.0,166.0,1692.0,1919,0.0,0.0,,,,...,-0.000573,-0.002037,0.001061,0.002554,-0.000353,-0.001539,0.00095,0.001109,0.002473,0.002893
tt0014429,1923.0,1923.0,74.0,19898.0,1923,0.0,0.0,,,,...,-0.033117,-0.087439,0.155141,-0.094323,0.032634,0.25148,0.002437,0.122426,0.144368,0.044142
tt0015175,1924.0,1924.0,143.0,5676.0,1924,0.0,0.0,,,,...,-0.001075,-0.002395,0.002726,0.003161,-0.002507,-0.002018,-0.002822,0.000494,-0.001881,0.001507


In [13]:
# Enrich the hidden test split with the project helper; per its printed
# message it first looks for a pre-made feature file before recomputing.
test_df = df_processor_enrichment('test_hidden.csv')
test_df.head()

Looking for pre made file...


Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,primaryTitleFormatted,Year,oscar_noms,oscar_wins,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0014972,He Who Gets Slapped,He Who Gets Slapped,1924.0,,95.0,3654.0,he_who_gets_slapped,1924,,,...,0,0,0,0,0,0,0,0,0,0
tt0015016,The Iron Horse,,1924.0,,150.0,2136.0,the_iron_horse,1924,,,...,0,0,0,0,0,0,0,0,0,0
tt0015174,Die Nibelungen: Kriemhild's Revenge,,1924.0,,129.0,4341.0,die_nibelungen_kriemhilds_revenge,1924,,,...,0,0,0,0,0,0,0,0,0,0
tt0015214,At 3:25,,,1925.0,59.0,1724.0,at_325,1925,,,...,0,0,0,0,0,0,0,0,0,0
tt0015863,Go West,,1925.0,,69.0,4188.0,go_west,1925,,,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Convert the enriched test frame into numeric model input, cached under the
# 'test' prefix.
# NOTE(review): per the reference copy of d2v_embed in this notebook, a fresh
# Doc2Vec model is trained on every call, so these embedding columns may not
# share a vector space with the training embeddings — confirm.
test_df_prepped = df_model_prep(test_df, 'test')
test_df_prepped.head()

Looking for pre made file...


Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,Year,oscar_noms,oscar_wins,Rank,Worldwide,Domestic,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0014972,1924.0,1924.0,95.0,3654.0,1924,0.0,0.0,,,,...,-0.059321,-0.193406,-0.012012,0.124108,-0.060414,-0.028046,0.014387,0.118568,-0.020169,-0.049739
tt0015016,1924.0,1924.0,150.0,2136.0,1924,0.0,0.0,,,,...,-0.00199,0.000379,-0.001504,-0.001444,-0.00088,-0.000938,-0.002971,0.002663,-0.002288,-0.000535
tt0015174,1924.0,1924.0,129.0,4341.0,1924,0.0,0.0,,,,...,-0.000573,-0.002037,0.001061,0.002554,-0.000353,-0.001539,0.00095,0.001109,0.002473,0.002893
tt0015214,1925.0,1925.0,59.0,1724.0,1925,0.0,0.0,,,,...,-0.002003,-0.00359,0.002809,-0.001703,-0.001013,-0.000995,0.000765,0.001049,0.003256,0.002087
tt0015863,1925.0,1925.0,69.0,4188.0,1925,0.0,0.0,,,,...,-0.028519,-0.35388,0.054671,0.065618,-0.093752,0.105984,-0.013684,0.151898,0.009653,-0.141698


In [15]:
# Predict class labels for the validation and test frames, in that order.
# NOTE(review): the frames are passed as-is; presumably their columns match
# the training features in name and order — confirm.
val_preds_lgbm, test_preds_lgbm = (
    model_lgbm.predict(features)
    for features in (valid_df_prepped, test_df_prepped)
)

In [17]:
# Write the validation predictions to a text file, one prediction per line.
with open('val_preds_lgbm.txt', 'w+') as predictions_file:
    predictions_file.writelines(f"{str(prediction)}\n" for prediction in val_preds_lgbm)

In [18]:
# Write the test predictions to a text file, one prediction per line.
with open('test_preds_lgbm.txt', 'w+') as predictions_file:
    predictions_file.writelines(f"{str(prediction)}\n" for prediction in test_preds_lgbm)