In [1]:
import tensorflow as tf

# List available GPUs
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable dynamic memory allocation for each GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"{len(gpus)} GPU(s) detected and memory growth enabled:")
        for gpu in gpus:
            print(f"  - {gpu}")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print("Error setting memory growth:", e)
else:
    print("No GPUs detected. Training will run on CPU.")


1 GPU(s) detected and memory growth enabled:
  - PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding, Flatten, Concatenate, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle
import boto3

In [3]:
df=pd.read_csv('/kaggle/input/balanced/fulltimestamps.csv',parse_dates=True,on_bad_lines='skip')

In [4]:
df = df[['asin', 'timestamp', 'rank', 'year', 'month', 'date', 'day', 
         'AUTHOR', 'PUBLISHER', 'GROUP', 'GENRE', 'TITLE']].copy()

In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['month'] = df['month'].astype(int)
df['date'] = df['date'].astype(int)

In [6]:
# Map days to numbers
day_mapping = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, 
               "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}
df['day_num'] = df['day'].map(day_mapping)

# Create cyclical features for month, date, and day
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['date_sin']  = np.sin(2 * np.pi * df['date'] / 31)
df['date_cos']  = np.cos(2 * np.pi * df['date'] / 31)
df['day_sin']   = np.sin(2 * np.pi * df['day_num'] / 7)
df['day_cos']   = np.cos(2 * np.pi * df['day_num'] / 7)

In [7]:
# -------------------------------
# 2. Encode Static Nominal Features
# -------------------------------
le_author = LabelEncoder()
le_publisher = LabelEncoder()
le_group = LabelEncoder()
le_genre = LabelEncoder()

df['author_encoded'] = le_author.fit_transform(df['AUTHOR'])
df['publisher_encoded'] = le_publisher.fit_transform(df['PUBLISHER'])
df['group_encoded'] = le_group.fit_transform(df['GROUP'])
df['genre_encoded'] = le_genre.fit_transform(df['GENRE'])

In [8]:
# Determine vocabulary sizes (adding 1 for embedding if needed)
author_vocab_size = df['author_encoded'].nunique() + 1
publisher_vocab_size = df['publisher_encoded'].nunique() + 1
# For group and genre, we will use one-hot encoding so we only need the number of classes
group_vocab_size = df['group_encoded'].nunique()  # no extra dimension needed for one-hot
genre_vocab_size = df['genre_encoded'].nunique()  # no extra dimension needed for one-hot


In [9]:
# -------------------------------
# 3. Process Title as a Text Feature
# -------------------------------
df['TITLE'] = df['TITLE'].astype(str)

# Create a mapping from asin to title text (assuming title is constant per book)
title_mapping = df.groupby('asin')['TITLE'].first().to_dict()

# Fit a tokenizer on unique title texts
unique_titles = list(title_mapping.values())
title_tokenizer = Tokenizer()
title_tokenizer.fit_on_texts(unique_titles)

# Define maximum title length (adjust as needed)
max_title_length = 10

# Create a mapping from asin to a padded title sequence
asin_to_title_seq = {
    asin: pad_sequences(title_tokenizer.texts_to_sequences([title]), maxlen=max_title_length)[0]
    for asin, title in title_mapping.items()
}
title_vocab_size = len(title_tokenizer.word_index) + 1

In [10]:
# -------------------------------
# 4. Create Sequences per Book (ASIN)
# -------------------------------
# Define the dynamic feature columns (time-varying features) and sequence length
dynamic_feature_columns = ['rank', 'month_sin', 'month_cos', 'date_sin', 'date_cos', 'day_sin', 'day_cos']
sequence_length = 10  # number of time steps per sequence

# Prepare lists to store sequences and static features
X_dynamic = []  # dynamic sequence input
X_author = []   # static: author
X_publisher = []  # static: publisher
X_group = []      # static: group (will be one-hot encoded later)
X_genre = []      # static: genre (will be one-hot encoded later)
X_title = []      # static: title text (tokenized and padded)
y_targets = []    # target: rank at the next time step
asin_list = []    # (optional) track book identifier


In [11]:
# Group by book (asin) and sort by timestamp
for asin, group in df.groupby('asin'):
    group = group.sort_values('timestamp')
    dyn_features = group[dynamic_feature_columns].values
    target_series = group['rank'].values
    
    # Get static features (assumed constant per book)
    author_val = group['author_encoded'].iloc[0]
    publisher_val = group['publisher_encoded'].iloc[0]
    group_val = group['group_encoded'].iloc[0]
    genre_val = group['genre_encoded'].iloc[0]
    
    # Get title sequence from asin_to_title_seq; if missing, use zeros
    if asin in asin_to_title_seq:
        title_seq = asin_to_title_seq[asin]
    else:
        title_seq = np.zeros(max_title_length, dtype=int)
    
    n = len(dyn_features)
    if n > sequence_length:
        for i in range(n - sequence_length):
            X_dynamic.append(dyn_features[i:i+sequence_length])
            y_targets.append(target_series[i + sequence_length])
            X_author.append(author_val)
            X_publisher.append(publisher_val)
            X_group.append(group_val)
            X_genre.append(genre_val)
            X_title.append(title_seq)
            asin_list.append(asin)

In [13]:
# Convert lists to NumPy arrays
X_dynamic = np.array(X_dynamic)    # (num_sequences, sequence_length, num_dynamic_features)
X_author = np.array(X_author)        # (num_sequences,)
X_publisher = np.array(X_publisher)  # (num_sequences,)
X_group = np.array(X_group)          # (num_sequences,)
X_genre = np.array(X_genre)          # (num_sequences,)
X_title = np.array(X_title)          # (num_sequences, max_title_length)
y_targets = np.array(y_targets)      # (num_sequences,)

In [14]:
print("Dynamic input shape:", X_dynamic.shape)
print("Author input shape:", X_author.shape)
print("Publisher input shape:", X_publisher.shape)
print("Group input shape:", X_group.shape)
print("Genre input shape:", X_genre.shape)
print("Title input shape:", X_title.shape)
print("Target shape:", y_targets.shape)

Dynamic input shape: (6174184, 10, 7)
Author input shape: (6174184,)
Publisher input shape: (6174184,)
Group input shape: (6174184,)
Genre input shape: (6174184,)
Title input shape: (6174184, 10)
Target shape: (6174184,)


In [15]:
# -------------------------------
# 5. Split the Data into Training and Validation Sets
# -------------------------------
(X_dyn_train, X_dyn_val,
 X_author_train, X_author_val,
 X_publisher_train, X_publisher_val,
 X_group_train, X_group_val,
 X_genre_train, X_genre_val,
 X_title_train, X_title_val,
 y_train, y_val) = train_test_split(
    X_dynamic, X_author, X_publisher, X_group, X_genre, X_title, y_targets, 
    test_size=0.2, random_state=42
)

print("Training shapes:", X_dyn_train.shape, X_author_train.shape, X_publisher_train.shape,
      X_group_train.shape, X_genre_train.shape, X_title_train.shape, y_train.shape)
print("Validation shapes:", X_dyn_val.shape, X_author_val.shape, X_publisher_val.shape,
      X_group_val.shape, X_genre_val.shape, X_title_val.shape, y_val.shape)


Training shapes: (4939347, 10, 7) (4939347,) (4939347,) (4939347,) (4939347,) (4939347, 10) (4939347,)
Validation shapes: (1234837, 10, 7) (1234837,) (1234837,) (1234837,) (1234837,) (1234837, 10) (1234837,)


In [17]:
# -------------------------------
# 6. Build the Multi-Input LSTM Model in Keras
# -------------------------------
# Dynamic input branch (time-series features)
input_dynamic = Input(shape=(sequence_length, X_dynamic.shape[2]), name="dynamic_input")
x = LSTM(64, return_sequences=False)(input_dynamic)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)

# Static input branches
input_author = Input(shape=(1,), name="author_input")
input_publisher = Input(shape=(1,), name="publisher_input")
input_group = Input(shape=(1,), name="group_input")
input_genre = Input(shape=(1,), name="genre_input")
input_title = Input(shape=(max_title_length,), name="title_input")

# For author and publisher, we use embeddings as before.
embedding_author = Embedding(input_dim=author_vocab_size, output_dim=16, input_length=1)(input_author)
embedding_author = Flatten()(embedding_author)

embedding_publisher = Embedding(input_dim=publisher_vocab_size, output_dim=16, input_length=1)(input_publisher)
embedding_publisher = Flatten()(embedding_publisher)

# For group:
onehot_group = Lambda(
    lambda x: tf.one_hot(tf.cast(x, tf.int32), depth=group_vocab_size),
    output_shape=lambda input_shape: (input_shape[0], group_vocab_size)
)(input_group)
onehot_group = Flatten()(onehot_group)

# For genre:
onehot_genre = Lambda(
    lambda x: tf.one_hot(tf.cast(x, tf.int32), depth=genre_vocab_size),
    output_shape=lambda input_shape: (input_shape[0], genre_vocab_size)
)(input_genre)
onehot_genre = Flatten()(onehot_genre)

# For title, continue using an embedding layer.
embedding_title = Embedding(input_dim=title_vocab_size, output_dim=32, input_length=max_title_length)(input_title)
embedding_title = Flatten()(embedding_title)

# Concatenate all static inputs
static_concat = Concatenate()([embedding_author, embedding_publisher, onehot_group, onehot_genre, embedding_title])
static_dense = Dense(16, activation='relu')(static_concat)

# Merge dynamic and static branches
merged = Concatenate()([x, static_dense])
merged = Dense(32, activation='relu')(merged)
merged = Dropout(0.2)(merged)
output = Dense(1, activation='linear')(merged)

model = Model(inputs=[input_dynamic, input_author, input_publisher, input_group, input_genre, input_title], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
model.summary()

In [18]:

# -------------------------------
# 7. Prepare tf.data.Dataset Generators for Training and Validation
# -------------------------------
# Define the output signature for the dataset generator
output_signature = (
    (
        tf.TensorSpec(shape=(sequence_length, X_dynamic.shape[2]), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
        tf.TensorSpec(shape=(max_title_length,), dtype=tf.int32)
    ),
    tf.TensorSpec(shape=(), dtype=tf.float32)
)

def generator_train():
    for i in range(len(X_dyn_train)):
        yield (
            X_dyn_train[i].astype(np.float32),
            int(X_author_train[i]),
            int(X_publisher_train[i]),
            int(X_group_train[i]),
            int(X_genre_train[i]),
            X_title_train[i].astype(np.int32)
        ), np.float32(y_train[i])

def generator_val():
    for i in range(len(X_dyn_val)):
        yield (
            X_dyn_val[i].astype(np.float32),
            int(X_author_val[i]),
            int(X_publisher_val[i]),
            int(X_group_val[i]),
            int(X_genre_val[i]),
            X_title_val[i].astype(np.int32)
        ), np.float32(y_val[i])

train_dataset = tf.data.Dataset.from_generator(generator_train, output_signature=output_signature)
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(256)

val_dataset = tf.data.Dataset.from_generator(generator_val, output_signature=output_signature)
val_dataset = val_dataset.batch(256)


In [19]:
# -------------------------------
# 8. Train the Model
# -------------------------------
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

checkpoint_callback = ModelCheckpoint(
    filepath='best_model.keras',  # local path for best model
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)
earlystopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True
)

In [21]:
batch_size = 256
steps_per_epoch = int(np.ceil(len(X_dyn_train) / batch_size))


In [None]:
history = model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[checkpoint_callback, earlystopping_callback],
    verbose=1
)



Epoch 1/10
[1m  317/19295[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m25:08[0m 79ms/step - loss: 7303732920320.0000 - mae: 1534814.6250

In [None]:
# -------------------------------
# 9. Evaluate and Predict on the Validation Set
# -------------------------------
loss, mae = model.evaluate(val_dataset)
print("Validation Loss (MSE):", loss)
print("Validation MAE:", mae)

y_pred = model.predict(val_dataset)

plt.figure(figsize=(10, 5))
# Concatenate all actual target values from the validation dataset for plotting
actual_y = np.concatenate([y for _, y in val_dataset], axis=0)
plt.plot(actual_y, label='Actual Rank')
plt.plot(y_pred.flatten(), label='Predicted Rank')
plt.xlabel("Samples")
plt.ylabel("Rank")
plt.title("Actual vs. Predicted Rank")
plt.legend()
plt.savefig('predictions_vs_actual.png')
plt.show()

In [None]:
# Optionally, save the final model again locally
model.save('final_model.keras')

# -------------------------------
# 10. Save Pickle Files for the Encoders and Tokenizer
# -------------------------------
with open('le_author.pkl', 'wb') as f:
    pickle.dump(le_author, f)
with open('le_publisher.pkl', 'wb') as f:
    pickle.dump(le_publisher, f)
with open('le_group.pkl', 'wb') as f:
    pickle.dump(le_group, f)
with open('le_genre.pkl', 'wb') as f:
    pickle.dump(le_genre, f)
with open('title_tokenizer.pkl', 'wb') as f:
    pickle.dump(title_tokenizer, f)
