In [None]:
import numpy as np 
import pandas as pd 
from sklearn import preprocessing

import lightgbm as lgb

from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import mean_squared_error
np.random.seed(42)

from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None


from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


from sklearn.model_selection import KFold

import tensorflow as tf

tf.random.set_seed(1)

from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras.layers import Concatenate,Flatten,Reshape, Bidirectional
from tensorflow.keras.models import Model



In [None]:
# Competition input directory. Train and test are read without their `id`
# column; the submission template is read unchanged.
PATH = '/kaggle/input/predictingbookratingsint20h/'
data, test = (
    pd.read_csv(PATH + csv_name).drop(columns=['id'])
    for csv_name in ('train.csv', 'test.csv')
)
submission = pd.read_csv(PATH + 'submission.csv')

In [None]:
# Display the first training row to see the raw value of every column.
data.iloc[0]

In [None]:
# Bare tuple of column names: it is only displayed, never assigned or used.
# NOTE(review): presumably a scratch list of the candidate feature columns — confirm.
"book_genre", "book_authors", "book_format", "book_pages", "book_review_count", "book_rating_count"

In [None]:
def split_sep(data_):
    """Split the '|'-separated author and genre strings into lists.

    Missing values are replaced by '' first, so they become ``['']``.
    The frame is modified in place and also returned.
    """
    for column in ('book_authors', 'book_genre'):
        filled = data_[column].fillna('')
        data_[column] = filled.apply(lambda joined: joined.split('|'))
    return data_

# split_sep edits each frame in place and returns it; train first, then test.
data, test = split_sep(data), split_sep(test)

def top5(data_):
    """Keep at most the first five genres and only the first author per row.

    Expects both columns to hold sequences. The frame is modified in place
    and also returned.
    """
    genres = data_['book_genre'].map(lambda names: names[:5])
    data_['book_genre'] = genres

    first_author = data_['book_authors'].map(lambda names: names[0])
    data_['book_authors'] = first_author

    return data_

# top5 edits each frame in place and returns it; train first, then test.
data, test = top5(data), top5(test)

def pp_format_pages(data_):
    """Clean ``book_format`` and turn ``book_pages`` into an int32 page count.

    * Missing formats become the literal ``"no_format"``.
    * Page strings are parsed by dropping their last five characters and
      calling ``int`` on the rest (assumes values such as ``"123 pages"``).
    * Missing page counts are filled with the mean page count of the same
      format, then with 0 when that format has no known page count at all.

    The frame is modified in place and also returned.
    """
    data_["book_format"] = data_["book_format"].fillna("no_format")

    # Parse the whole column in one assignment. The previous
    # `data_["book_pages"].loc[mask] = ...` was a chained assignment, which
    # writes to a temporary (and is silently lost) under pandas Copy-on-Write.
    data_["book_pages"] = (
        data_["book_pages"]
        .map(lambda text: int(text[:-5]), na_action="ignore")
        .astype(np.float32)
    )

    # Fill NaN with the mean over the format, then 0 for formats with no data.
    format_mean = data_.groupby("book_format")["book_pages"].transform("mean")
    data_["book_pages"] = (
        data_["book_pages"].fillna(format_mean).fillna(0).astype(np.int32)
    )

    return data_

# pp_format_pages edits each frame in place and returns it; train first, then test.
data, test = pp_format_pages(data), pp_format_pages(test)

In [None]:
# Keep only training rows whose rating count is above the 5th percentile of
# the test set's rating counts, and report the share of rows this removes.
rating_count_floor = test.book_rating_count.quantile(0.05)
below_floor = data[data.book_rating_count <= rating_count_floor]

print('test book_rating_count 5% - ', rating_count_floor)
print('Will lose', below_floor.shape[0]/data.shape[0])

data = data[data.book_rating_count > rating_count_floor].reset_index(drop=True)

In [None]:
# Author overlap between the two sets, printed as fractions (not percentages):
# first the share of training rows whose author also occurs in test, then the
# share of test rows whose author also occurs in train.
test_authors = test.book_authors.unique()
train_authors = data.book_authors.unique()

train_rows_with_test_author = int(data.book_authors.isin(test_authors).sum())
test_rows_with_train_author = int(test.book_authors.isin(train_authors).sum())

print('% of common Authors in test', train_rows_with_test_author/data.shape[0])

print('% of common Authors in train', test_rows_with_train_author/test.shape[0])

In [None]:
import json

In [None]:
# Persist the three name -> id mappings as JSON files in the working directory.
# NOTE(review): Genres, Authors and Format are only defined in a later cell,
# so on a fresh kernel (Restart & Run All) this cell raises NameError. It
# should run after the cell that builds the mappings.
for json_name, mapping in (('Genres.json', Genres),
                           ('Authors.json', Authors),
                           ('Format.json', Format)):
    with open(json_name, 'w') as fp:
        json.dump(mapping, fp)

In [None]:
def flat(lst):
    """Lazily yield the leaves of an arbitrarily nested list, left to right.

    Only ``list`` instances are descended into; any other value (including a
    non-list argument) is yielded as-is.
    """
    if not isinstance(lst, list):
        yield lst
        return

    # Stack of partially consumed iterators, innermost list on top.
    pending = [iter(lst)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                pending.append(iter(item))
                break
            yield item
        else:
            # Innermost iterator exhausted: resume its parent.
            pending.pop()

# Build name -> integer id mappings from the values that occur in the TEST
# set, plus one extra 'not_in_test' id for everything else.
#
# The distinct values are sorted before numbering: iterating a raw `set` of
# strings gives a different order in every interpreter run (hash
# randomisation), which made the ids, and therefore the saved JSON mappings
# and trained models, impossible to reproduce across sessions.

# Genre ids start at 1, leaving 0 unused by any genre.
Genres = {
    name: idx
    for idx, name in enumerate(sorted(set(flat(list(test['book_genre'])))), start=1)
}
Genres['not_in_test'] = max(Genres.values()) + 1

# Author and format ids start at 0.
Authors = {
    name: idx
    for idx, name in enumerate(sorted(set(flat(list(test['book_authors'])))))
}
Authors['not_in_test'] = max(Authors.values()) + 1

Format = {
    name: idx
    for idx, name in enumerate(sorted(set(flat(list(test['book_format'])))))
}
Format['not_in_test'] = max(Format.values()) + 1


def replace_smth(data_, Dict):
    """Replace values that are not keys of ``Dict`` with ``'not_in_test'``.

    A ``list`` argument is processed element by element and a new list is
    returned; any other argument is treated as a single value.
    """
    unknown = 'not_in_test'

    if type(data_) != list:
        return data_ if data_ in Dict.keys() else unknown

    return [item if item in Dict.keys() else unknown for item in data_]
    

def replace_smth_data(data__, max_genres=5):
    """Encode genre, author and format names as integer ids (in place).

    Names missing from the ``Genres`` / ``Authors`` / ``Format`` mappings are
    encoded with that mapping's ``'not_in_test'`` id.

    Every genre list is right-padded with 0 (an id no genre uses) and cut to
    exactly ``max_genres`` entries. The padding used to be commented out,
    which left ragged lists; stacking them into a 2-D array or reading a
    fixed position such as ``x[4]`` then fails for books with fewer genres.

    Parameters
    ----------
    data__ : DataFrame with ``book_genre`` (list of names), ``book_authors``
        and ``book_format`` (single names) columns.
    max_genres : int, length of every encoded genre list (default 5).

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    unknown_genre = Genres['not_in_test']
    unknown_author = Authors['not_in_test']
    unknown_format = Format['not_in_test']

    def encode_genres(names):
        ids = [Genres.get(name, unknown_genre) for name in names]
        # Post-pad with 0, then truncate to the fixed length.
        return (ids + [0] * max_genres)[:max_genres]

    data__['book_genre'] = data__['book_genre'].apply(encode_genres)
    data__['book_authors'] = data__['book_authors'].apply(
        lambda name: Authors.get(name, unknown_author))
    data__['book_format'] = data__['book_format'].apply(
        lambda name: Format.get(name, unknown_format))

    return data__

# replace_smth_data edits each frame in place and returns it; train first, then test.
data, test = replace_smth_data(data), replace_smth_data(test)

In [None]:
# Preview the encoded training frame; a few rows are enough to check it.
data.head()

In [None]:
# Sanity check: how many genre ids each training row carries.
# (The previous expression iterated over the built-in `range` type itself,
# which raises TypeError and stops a Run All.)
data.book_genre.apply(len).value_counts().sort_index()

In [None]:
#data = data[data.book_authors != 'not_in_test']

In [None]:
!pip install langdetect
from langdetect import detect

from tqdm import tqdm

from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised unless a seed is set; fix it so that the set of
# rows kept by the language filter is the same on every run.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return langdetect's code for the first 50 characters of ``text``.

    Returns '' when ``text`` is not a string (e.g. a missing description) or
    when langdetect cannot decide; such rows are kept by the filter below.
    """
    if not isinstance(text, str):
        return ''
    try:
        return detect(text[:50])
    except LangDetectException:
        return ''


# Drop rows whose description starts with 'http://'. Missing descriptions
# are kept instead of crashing on a non-string value.
data = data[~data.book_desc.str.startswith('http://', na=False)].reset_index(drop=True)

# Detect the language row by row and assign the whole column at once.
# The previous version wrote through `data['language'].iloc[...]` (chained
# assignment, lost under pandas Copy-on-Write) inside a bare `except: pass`,
# so a single failing row silently left its whole batch of 100 undetected.
data['language'] = [_detect_language(text) for text in tqdm(data.book_desc)]

# Keep English descriptions and the ones whose language could not be detected.
data = data[(data['language'] == 'en') | (data['language'] == '')].reset_index(drop=True)

In [None]:
# Min-max scale the target and the three numeric features. Every scaler is
# fitted on the training frame only and then applied to the test frame.
Scaler = preprocessing.MinMaxScaler()
data['book_rating'] = Scaler.fit_transform(data['book_rating'].to_numpy().reshape(-1, 1))
#test.book_rating = Scaler.transform(test.book_rating.values.reshape(-1,1))

Scaler_pag = preprocessing.MinMaxScaler()
data['book_pages'] = Scaler_pag.fit_transform(data['book_pages'].to_numpy().reshape(-1, 1))
test['book_pages'] = Scaler_pag.transform(test['book_pages'].to_numpy().reshape(-1, 1))

Scaler_rev = preprocessing.MinMaxScaler()
data['book_review_count'] = Scaler_rev.fit_transform(data['book_review_count'].to_numpy().reshape(-1, 1))
test['book_review_count'] = Scaler_rev.transform(test['book_review_count'].to_numpy().reshape(-1, 1))

Scaler_rat = preprocessing.MinMaxScaler()
data['book_rating_count'] = Scaler_rat.fit_transform(data['book_rating_count'].to_numpy().reshape(-1, 1))
test['book_rating_count'] = Scaler_rat.transform(test['book_rating_count'].to_numpy().reshape(-1, 1))

In [None]:
def SaveModel(filename, model, PATH=''):
    """Dump ``model`` with joblib to ``PATH + filename + ".pkl"``.

    ``PATH`` is a plain string prefix, so it needs its own trailing
    separator when it names a directory.
    """
    target = "".join((PATH, filename, ".pkl"))
    joblib.dump(model, target)
    
def LoadModel (filename, PATH =''):
    """Load an object with joblib from ``PATH + filename``.

    ``SaveModel`` always appends ".pkl" to the name it is given, so when
    ``PATH + filename`` does not exist but ``PATH + filename + ".pkl"`` does,
    the latter is loaded. ``LoadModel('Scaler')`` therefore reads back what
    ``SaveModel('Scaler', ...)`` wrote, and an explicit full file name works
    as before.

    NOTE: joblib.load unpickles the file and can execute arbitrary code;
    only load files you created yourself.
    """
    import os

    path = PATH + filename
    if not os.path.exists(path) and os.path.exists(path + ".pkl"):
        path += ".pkl"
    return joblib.load(path)

In [None]:
import joblib

In [None]:
#SaveModel('Scaler', Scaler)
#SaveModel('Scaler_pag', Scaler_pag)
#SaveModel('Scaler_rev', Scaler_rev)
#SaveModel('Scaler_rat', Scaler_rat)

In [None]:
# Pull the model inputs out of the frames as NumPy arrays.
# The genre lists are stacked into a single array; every other input is the
# column's underlying array.
X_genres = np.stack(data['book_genre'].to_numpy())
X_authors = data['book_authors'].to_numpy()
X_book_format = data['book_format'].to_numpy()
X_book_pages = data['book_pages'].to_numpy()
X_book_review_count = data['book_review_count'].to_numpy()
X_book_rating_count = data['book_rating_count'].to_numpy()

# Target (already min-max scaled by `Scaler`).
y = data['book_rating'].to_numpy()

X_test_genres = np.stack(test['book_genre'].to_numpy())
X_test_authors = test['book_authors'].to_numpy()
X_test_book_format = test['book_format'].to_numpy()
X_test_book_pages = test['book_pages'].to_numpy()
X_test_book_review_count = test['book_review_count'].to_numpy()
X_test_book_rating_count = test['book_rating_count'].to_numpy()

In [None]:
# Embedding configuration for the Keras model.
# Each max_features_* is the largest id in the corresponding mapping plus one
# (the number of rows the Embedding layer needs); each embed_size_* is the
# embedding vector length.
max_features_authors = max(Authors.values())+1
embed_size_authors = 100

max_features_genres = max(Genres.values())+1
embed_size_genres = 50

max_features_format = max(Format.values())+1
embed_size_format = 5

# Show the three vocabulary sizes.
print(max_features_authors, max_features_genres, max_features_format)

def lstm128():
    """Build and compile the Keras rating model.

    Inputs, in order: genres (5 ids), authors (1 id), format (1 id), pages,
    review, rating (one value each). The three id inputs are embedded and
    each passed through its own LSTM; the LSTM outputs are concatenated with
    the three numeric inputs and fed to two Dense(128, relu) + Dropout(0.1)
    stages and a single-unit relu output.

    Returns the model compiled with MSE loss and Adam (learning rate 1e-3).
    """
    
    inp_gen = Input(shape=(5,), name='genres')
    inp_aut = Input(shape=(1,), name='authors')
    inp_for = Input(shape=(1,), name='format')
    inp_pag = Input(shape=(1,), name='pages')
    inp_rev = Input(shape=(1,), name='review')
    inp_rat = Input(shape=(1,), name='rating')
    
    # Input order here is the order the arrays must be passed to fit/predict.
    inp = [inp_gen, inp_aut, inp_for, inp_pag, inp_rev, inp_rat]
        
    # Vocabulary and embedding sizes come from the module-level constants.
    seq_gen   = Embedding(max_features_genres, embed_size_genres, name='emb_gen')(inp_gen)
    seq_aut   = Embedding(max_features_authors, embed_size_authors, name='emb_aut')(inp_aut)
    seq_for   = Embedding(max_features_format, embed_size_format, name='emb_for')(inp_for)
    
    # Authors and format have input shape (1,), so their LSTMs see a
    # sequence of length one; genres is a sequence of five.
    lstm_gen  = LSTM(128, return_sequences=False )(seq_gen  )
    lstm_aut  = LSTM(128, return_sequences=False )(seq_aut )
    lstm_for  = LSTM(16, return_sequences=False )(seq_for )
            
    x = Concatenate(name="x1")([lstm_gen, lstm_aut,lstm_for, inp_pag, inp_rev, inp_rat])

    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    # NOTE(review): a relu output unit returns exactly 0 with zero gradient
    # when its pre-activation is negative — confirm this is intended for a
    # regression target.
    outp = Dense(1, activation="relu")(x)
    
    model = Model(inputs=inp, outputs=outp)
    
    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                 )
    return model

In [None]:
def early_stopping_lgb(train_ , valid_, params_, params_stopping_, columns_):
    """Train a LightGBM booster with early stopping on a validation frame.

    Parameters
    ----------
    train_, valid_ : DataFrames containing ``columns_`` and ``book_rating``.
    params_ : dict of LightGBM training parameters.
    params_stopping_ : dict with keys ``'num_boost_round'``,
        ``'early_stopping_rounds'`` and ``'verbose_eval'`` (logging period).
    columns_ : list of feature column names.

    Returns
    -------
    The booster returned by ``lgb.train``.

    ``early_stopping_rounds`` and ``verbose_eval`` are no longer accepted by
    ``lgb.train`` in LightGBM 4, so the equivalent callbacks are used when
    the installed version provides them; older versions keep the legacy
    keyword arguments.
    """
    train_dataset = lgb.Dataset(train_[columns_], label=train_.book_rating, free_raw_data=False)
    valid_dataset = lgb.Dataset(valid_[columns_], label=valid_.book_rating, free_raw_data=False)

    common = dict(
        num_boost_round=params_stopping_['num_boost_round'],
        valid_sets=[train_dataset, valid_dataset],
    )

    if hasattr(lgb, 'log_evaluation'):
        callbacks = [
            lgb.early_stopping(stopping_rounds=params_stopping_['early_stopping_rounds']),
            lgb.log_evaluation(period=int(params_stopping_['verbose_eval'])),
        ]
        return lgb.train(params_, train_dataset, callbacks=callbacks, **common)

    # Legacy API (LightGBM releases without the log_evaluation callback).
    return lgb.train(
        params_,
        train_dataset,
        early_stopping_rounds=params_stopping_['early_stopping_rounds'],
        verbose_eval=params_stopping_['verbose_eval'],
        **common
    )

In [None]:
# Spread the five genre ids over separate scalar columns gen0..gen4.
for gen in range(5):
    gen_column = 'gen' + str(gen)
    data[gen_column] = data.book_genre.apply(lambda ids, pos=gen: ids[pos])
    test[gen_column] = test.book_genre.apply(lambda ids, pos=gen: ids[pos])

In [None]:
# 5-fold LightGBM: out-of-fold predictions go to data['pred_lgbm'] and one
# test prediction column per fold to test['Fold_lgbm_<k>'], both mapped back
# to the original rating scale with `Scaler`.
columns_lgb = ['book_authors', 'book_format', 'book_pages',
               'book_review_count', 'book_rating_count', 'gen0', 'gen1', 'gen2', 'gen3', 'gen4']

params = {'metrics':'l2',
          'objective':'regression',
          'num_leaves':256
         }

params_stopping = {
    'num_boost_round':10000,
    'early_stopping_rounds':100,
    'verbose_eval':1000}

# Float column, so the predictions are stored without a dtype change.
data['pred_lgbm'] = 0.0
pred_lgbm_pos = data.columns.get_loc('pred_lgbm')

for Fold, (train_index, valid_index) in enumerate(KFold(n_splits=5, random_state=42, shuffle=True).split(data)):

    train, valid = data.iloc[train_index], data.iloc[valid_index]

    print(Fold, train.shape[0], valid.shape[0])

    # Hold out 10% of the training fold for early stopping only.
    lgbm_train, lgbm_valid = train_test_split(train, random_state=42,shuffle=True, test_size=0.1)

    model = early_stopping_lgb(lgbm_train, lgbm_valid, params, params_stopping, columns_lgb)
    SaveModel('lgbm'+str(Fold), model)

    pred_valid = model.predict(valid[columns_lgb])

    # Write through the frame with the positional fold indices. The previous
    # `data['pred_lgbm'].iloc[valid.index] = ...` was a chained assignment
    # (lost under pandas Copy-on-Write) and used index labels as positions.
    data.iloc[valid_index, pred_lgbm_pos] = Scaler.inverse_transform(pred_valid.reshape(-1,1)).flatten()

    pred = model.predict(test[columns_lgb])

    test['Fold_lgbm_'+str(Fold)] = Scaler.inverse_transform(pred.reshape(-1,1)).flatten()

In [None]:
# 5-fold Keras model: out-of-fold predictions go to data['pred'] and one test
# prediction column per fold to test['Fold_<k>'], both mapped back to the
# original rating scale with `Scaler`.

# Input arrays in the order expected by lstm128().
train_inputs = [X_genres, X_authors, X_book_format,
                X_book_pages, X_book_review_count, X_book_rating_count]
test_inputs = [X_test_genres, X_test_authors, X_test_book_format, X_test_book_pages,
               X_test_book_review_count, X_test_book_rating_count]

# Float column, so the predictions are stored without a dtype change.
data['pred'] = 0.0
pred_pos = data.columns.get_loc('pred')

for Fold, (train_index, valid_index) in enumerate(KFold(n_splits=5, random_state=42, shuffle=True).split(data)):

    train, valid = data.iloc[train_index], data.iloc[valid_index]

    print(Fold, train.shape[0], valid.shape[0])

    model = lstm128()

    # NOTE(review): both callbacks use patience=10, so early stopping
    # presumably fires at the same epoch as the first learning-rate
    # reduction — confirm whether ReduceLROnPlateau should be more eager.
    es = EarlyStopping( monitor='val_loss',
                       patience=10)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5,
                                  patience=10,
                                  min_lr=0.000001)

    # The arrays are indexed with the positional fold indices; the previous
    # `train.index` / `valid.index` only worked while the frame index
    # happened to equal the row positions.
    model.fit([x[train_index] for x in train_inputs],
              y[train_index], batch_size=3000, epochs=100, verbose=True,
              validation_split=0.1, shuffle=True,
              callbacks=[es,reduce_lr])

    model.save('keras'+str(Fold))

    pred_valid = model.predict([x[valid_index] for x in train_inputs])

    # Write through the frame: `data['pred'].iloc[...] = ...` was a chained
    # assignment, which is lost under pandas Copy-on-Write.
    data.iloc[valid_index, pred_pos] = Scaler.inverse_transform(pred_valid).flatten()

    pred = model.predict(test_inputs).flatten()

    test['Fold_'+str(Fold)] = Scaler.inverse_transform(pred.reshape(-1,1)).flatten()

In [None]:
# Out-of-fold RMSE of the Keras model on the original rating scale
# (arguments given as y_true, y_pred; the squared error is symmetric).
mean_squared_error(Scaler.inverse_transform(y.reshape(-1,1)), data['pred'])**0.5

In [None]:
# Out-of-fold RMSE of the LightGBM model on the original rating scale
# (arguments given as y_true, y_pred; the squared error is symmetric).
mean_squared_error(Scaler.inverse_transform(y.reshape(-1,1)),
                   data['pred_lgbm'])**0.5

In [None]:
#0.24227946566752415
#0.22726323228988762

In [None]:
# Average the per-fold test predictions of each model.
# `Fold` is the last index left over from the cross-validation loop, so
# range(Fold+1) enumerates every fold.
keras_fold_columns = ['Fold_'+str(fold) for fold in range(Fold+1)]
lgbm_fold_columns = ['Fold_lgbm_'+str(fold) for fold in range(Fold+1)]

test['pred_keras'] = test[keras_fold_columns].mean(axis=1)
test['pred_lgbm'] = test[lgbm_fold_columns].mean(axis=1)

In [None]:
# Final prediction: equal-weight blend of the two models.
test['pred'] = (test['pred_keras'] + test['pred_lgbm']) / 2

In [None]:
# Copy the blended predictions into the submission by row position.
# Item assignment always creates or overwrites the column (attribute
# assignment only sets a Python attribute when the column is missing), and
# passing the raw array avoids index alignment, so a row mismatch raises
# instead of producing NaN.
submission['book_rating'] = test['pred'].to_numpy()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('same_aut.csv', index=False)