In [1]:
import numpy as np 
import pandas as pd 
from sklearn import preprocessing

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from sklearn.metrics import mean_squared_error

import tensorflow as tf
np.random.seed(42)
tf.random.set_seed(1)

from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras.layers import Concatenate,Flatten,Reshape, Bidirectional
from tensorflow.keras.models import Model

from sklearn.model_selection import train_test_split

In [2]:
PATH = '/kaggle/input/predictingbookratingsint20h/'
data = pd.read_csv(PATH+'train.csv').drop(columns = 'id')
test = pd.read_csv(PATH+'test.csv').drop(columns = 'id')
submission = pd.read_csv(PATH+'submission.csv')

In [3]:
def split_sep(data_):
    data_.book_authors = data_.book_authors.fillna('')
    data_.book_authors = data_.book_authors.apply(lambda x: x.split('|'))
    
    data_.book_genre = data_.book_genre.fillna('')
    data_.book_genre = data_.book_genre.apply(lambda x: x.split('|'))
    
    return data_

data = split_sep(data)
test = split_sep(test)

def top5(data_):
    data_['book_genre'] = data_['book_genre'].apply(lambda x: x[:5]  )
    data_['book_authors'] = data_['book_authors'].apply(lambda x: x[0])
    
    return data_

data = top5(data)
test = top5(test)

def pp_format_pages(data_):
    
    data_["book_format"] = data_["book_format"].fillna("no_format")
    #fill nan with mean over format
    data_["book_pages"].loc[~data_["book_pages"].isnull()] = (
        data_["book_pages"].loc[~data_["book_pages"].isnull()].apply(lambda x: int(x[:-5]))
    )
    data_["book_pages"] = data_["book_pages"].astype(np.float32)
    data_["book_pages"] = data_["book_pages"].fillna(
        data_.groupby("book_format")["book_pages"].transform("mean")
    )
    data_["book_pages"] = data_["book_pages"].fillna(0)
    data_["book_pages"] = data_["book_pages"].astype(np.int32)
    
    return data_

data = pp_format_pages(data)
test = pp_format_pages(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
def flat(lst):
    if isinstance(lst, list):
        for item in lst:
            yield from flat(item)
    else:        
        yield lst

Genres = set(list(flat(list(test['book_genre']))))
Genres = {i:ind+1 for ind, i in enumerate(Genres)}

Genres.update({'not_in_test': max(Genres.values())+1})


Authors = set(list(flat(list(test['book_authors']))))
Authors = {i:ind for ind, i in enumerate(Authors)}

Authors.update({'not_in_test': max(Authors.values())+1})

Format = set(list(flat(list(test['book_format']))))
Format = {i:ind for ind, i in enumerate(Format)}

Format.update({'not_in_test': max(Format.values())+1})


def replace_smth(data_, Dict):
    L = []
    if type(data_)==list:
        for i in data_:
            if i in Dict.keys():
                L.append(i)
            else:
                L.append('not_in_test')
        return L
    else:
        if data_ in Dict.keys():
            return data_
        else:
            return 'not_in_test'
    

def replace_smth_data(data__):
    data__['book_genre'] = data__['book_genre'].apply(lambda x: replace_smth(x, Genres))
    data__['book_authors'] = data__['book_authors'].apply(lambda x: replace_smth(x, Authors))
    data__['book_format'] = data__['book_format'].apply(lambda x: replace_smth(x, Format))

    data__['book_genre'] = data__['book_genre'].apply(lambda x: [Genres[i] for i in x])
    data__['book_authors'] = data__['book_authors'].apply(lambda x: Authors[x])
    data__['book_format'] = data__['book_format'].apply(lambda x: Format[x])
    
    data__['book_genre'] = list(pad_sequences(data__['book_genre'], maxlen=5, padding='post'))


    return data__

data = replace_smth_data(data)
test = replace_smth_data(test)

In [5]:
#print(data[data.book_rating_count <= test.book_rating_count.quantile(0.01) ].shape[0]/data.shape[0],
# test[test.book_rating_count <= 100 ].shape[0]/test.shape[0])

#data = data[data.book_rating_count > test.book_rating_count.quantile(0.01)]
#data.reset_index(drop=True, inplace=True)

!pip install langdetect
from langdetect import detect


data = data[~data.book_desc.apply(lambda x: x[:len('http://')]=='http://') ]
data.reset_index(drop=True, inplace=True)

data['language'] = ''
for i in range(0, data.shape[0], 100):
    try:
        print(i)
        data['language'].iloc[i:i+100] = data.book_desc.iloc[i:i+100].apply(lambda x: detect(x[:30]))
        
    except:
        try:
            for j in range(i, i+100):
                data['language'].iloc[i:i+1] = data.book_desc.iloc[i:i+1].apply(lambda x: detect(x[:30]))
        except:
            pass
        
data = data[(data['language'] == 'en') | (data['language'] == '')]
data.reset_index(drop=True, inplace=True)

In [6]:
Scaler = preprocessing.MinMaxScaler()
data.book_rating = Scaler.fit_transform(data.book_rating.values.reshape(-1,1))
#test.book_rating = Scaler.transform(test.book_rating.values.reshape(-1,1))

Scaler_pag = preprocessing.MinMaxScaler()
data.book_pages = Scaler_pag.fit_transform(data.book_pages.values.reshape(-1,1))
test.book_pages = Scaler_pag.transform(test.book_pages.values.reshape(-1,1))

Scaler_rev = preprocessing.MinMaxScaler()
data.book_review_count = Scaler_rev.fit_transform(data.book_review_count.values.reshape(-1,1))
test.book_review_count = Scaler_rev.transform(test.book_review_count.values.reshape(-1,1))

Scaler_rat = preprocessing.MinMaxScaler()
data.book_rating_count = Scaler_rat.fit_transform(data.book_rating_count.values.reshape(-1,1))
test.book_rating_count = Scaler_rat.transform(test.book_rating_count.values.reshape(-1,1))

In [7]:
train, valid = train_test_split(data, shuffle=True, random_state=42)

In [8]:
X_genres = np.stack(data.book_genre.values)
X_authors = data.book_authors.values
X_book_format = data.book_format.values
X_book_pages = data.book_pages.values
X_book_review_count = data.book_review_count.values
X_book_rating_count = data.book_rating_count.values

y = data.book_rating.values

X_test_genres = np.stack(test.book_genre.values)
X_test_authors = test.book_authors.values
X_test_book_format = test.book_format.values
X_test_book_pages = test.book_pages.values
X_test_book_review_count = test.book_review_count.values
X_test_book_rating_count = test.book_rating_count.values

In [9]:
#Here all embed_size
max_features_authors = max(Authors.values())+1
embed_size_authors = 3

max_features_genres = max(Genres.values())+1
embed_size_genres = 30

max_features_format = max(Format.values())+1
embed_size_format = 3

print(max_features_authors, max_features_genres, max_features_format)

def lstm128():
    
    inp_gen = Input(shape=(5,), name='genres')
    inp_aut = Input(shape=(1,), name='authors')
    inp_for = Input(shape=(1,), name='format')
    inp_pag = Input(shape=(1,), name='pages')
    inp_rev = Input(shape=(1,), name='review')
    inp_rat = Input(shape=(1,), name='rating')
    
    inp = [inp_gen, inp_aut, inp_for, inp_pag, inp_rev, inp_rat]
        
    seq_gen   = Embedding(max_features_genres, embed_size_genres, name='emb_gen')(inp_gen)
    seq_aut   = Embedding(max_features_authors, embed_size_authors, name='emb_aut')(inp_aut)
    seq_for   = Embedding(max_features_format, embed_size_format, name='emb_for')(inp_for)
    
    lstm_gen   = LSTM(128, return_sequences=False )(seq_gen  )
    lstm_aut  = LSTM(20,   return_sequences=False )(seq_aut )
    lstm_for  = LSTM(20,   return_sequences=False )(seq_for )
            
    x = Concatenate(name="x1")([lstm_gen, lstm_aut,lstm_for, inp_pag, inp_rev, inp_rat])

    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    outp = Dense(1, activation="relu")(x)
    
    model = Model(inputs=inp, outputs=outp)
    
    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                 )
    return model

2241 264 15


In [10]:
model = lstm128()

es = EarlyStopping( monitor='val_loss',
                   patience=30)

reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.5,
                              patience=10,
                              min_lr=0.000001)

In [11]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
genres (InputLayer)             [(None, 5)]          0                                            
__________________________________________________________________________________________________
authors (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
format (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
emb_gen (Embedding)             (None, 5, 30)        7920        genres[0][0]                     
______________________________________________________________________________________________

In [12]:
model.fit([X_genres[train.index], X_authors[train.index], X_book_format[train.index],
           X_book_pages[train.index], X_book_review_count[train.index], X_book_rating_count[train.index]],
           y[train.index], batch_size=1000, epochs=20, verbose=True,
           validation_split=0.1, shuffle=True, 
           callbacks=[es,reduce_lr])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fbbdb3e7bd0>

In [14]:
pred_valid = model.predict([X_genres[valid.index], X_authors[valid.index], X_book_format[valid.index],
                      X_book_pages[valid.index], X_book_review_count[valid.index], X_book_rating_count[valid.index]])

pred_valid_inv = Scaler.inverse_transform(pred_valid)

valid['pred'] = pred_valid_inv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
mean_squared_error((valid['pred']).round(2), Scaler.inverse_transform(y[valid.index].reshape(-1,1)))**0.5

0.3466855175011716

In [16]:
pred = model.predict([X_test_genres, X_test_authors, X_test_book_format, X_test_book_pages,
                           X_test_book_review_count,X_test_book_rating_count])

pred_inv = Scaler.inverse_transform(pred)

test['pred'] = pred_inv

In [18]:
test

Unnamed: 0,book_title,book_image_url,book_desc,book_genre,book_authors,book_format,book_pages,book_review_count,book_rating_count,pred
0,The Paper Wasp,https://i.gr-assets.com/images/S/compressed.ph...,An electrifying debut novel from the acclaimed...,"[124, 102, 174, 174, 103]",574,5,0.019557,0.002581,0.000287,3.715068
1,The Old Success,https://i.gr-assets.com/images/S/compressed.ph...,When the body of a French woman washes up on a...,"[100, 124, 246, 138, 174]",1927,5,0.016444,0.002544,0.000503,3.749581
2,The Berlin Girl,https://i.gr-assets.com/images/S/compressed.ph...,From the bestselling author of The German Midw...,"[48, 139, 124, 56, 186]",364,11,0.027069,0.001679,0.000256,4.004651
3,Bringing Down the Duke,https://i.gr-assets.com/images/S/compressed.ph...,One of Publishers Weekly's Most Anticipated Fa...,"[176, 48, 139, 176, 230]",56,5,0.024091,0.030639,0.005867,3.980087
4,Horse Crazy: Girls and the Lives of Horses,https://i.gr-assets.com/images/S/compressed.ph...,Horse Crazy explores the meaning behind the lo...,"[216, 222, 0, 0, 0]",1008,11,0.018407,0.000044,0.000002,4.020636
...,...,...,...,...,...,...,...,...,...,...
3015,The End of Everything (Astrophysically Speaking),https://i.gr-assets.com/images/S/compressed.ph...,From one of the most dynamic rising stars in a...,"[157, 216, 157, 202, 157]",2214,5,0.016241,0.002345,0.000381,4.023128
3016,Saving Ruby King,https://i.gr-assets.com/images/S/compressed.ph...,"Set in the South Side of Chicago, an epic, ent...","[124, 100, 174, 103, 76]",830,5,0.023821,0.004478,0.000546,3.710314
3017,Pretending,https://i.gr-assets.com/images/S/compressed.ph...,WHY BE YOURSELF WHEN YOU CAN BE PERFECT?'Perce...,"[124, 102, 176, 58, 199]",611,5,0.029641,0.003502,0.000915,3.736754
3018,"Hello, Summer",https://i.gr-assets.com/images/S/compressed.ph...,New York Times bestselling author and Queen of...,"[124, 58, 199, 100, 176]",193,5,0.031942,0.008994,0.002080,3.725240


In [22]:
submission.book_rating = test['pred']

In [23]:
submission.to_csv('First.csv', index=False)