In [147]:
import tensorflow as tf

from tensorflow.keras.layers import Input,Embedding,Flatten, Dense,Concatenate
from tensorflow.keras.models import Model

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt


In [148]:
data = pd.read_parquet('data_preprocessed.parquet').drop(['main_author_encoded'], axis=1) #dropping main_author_encoded as it did not lead to any improvment in performance
data = data[data.Impact != 0]
X = data.drop('Impact', axis=1)
y = data['Impact']

In [149]:
categorical_columns  = ['publisher_encoded','categories_encoded']
numerical_columns = list(set(X.columns) - set(categorical_columns))

In [150]:
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_cat = X_train[categorical_columns].values
X_train_num = X_train[numerical_columns].values
X_val_cat = X_val[categorical_columns].values
X_val_num = X_val[numerical_columns].values

#### Model Architecture
- using embedding layer for categorical data and flattening them
-  concatening embedding data with numerical data
- passing it to dense layer  of size 128,64,32,16
- output layer of size 1
- usnig MSE loss function




In [151]:

def create_model(cat_dims, num_dims):
    publisher_input = Input(shape=(1,), name='publisher_input')
    categories_input = Input(shape=(1,), name='categories_input')
    numeric_input = Input(shape=(num_dims,), name='numeric_input')
    
    publisher_embedding = Embedding(input_dim=cat_dims[0], output_dim=32, name='publisher_embedding')(publisher_input)
    categories_embedding = Embedding(input_dim=cat_dims[1], output_dim=32, name='categories_embedding')(categories_input)

    
    publisher_flat = Flatten()(publisher_embedding)
    categories_flat = Flatten()(categories_embedding)
    
    concatenated = Concatenate()([publisher_flat, categories_flat, numeric_input])
    x = Dense(128, activation='relu')(concatenated)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(16, activation='relu')(x)
    output = Dense(1)(x)
    
    model = Model(inputs=[publisher_input, categories_input, numeric_input], outputs=output)
    
    return model

In [152]:
cat_dims = [X['publisher_encoded'].nunique(), X['categories_encoded'].nunique()]
num_dims = len(numerical_columns)

In [153]:
model = create_model(cat_dims, num_dims)

In [154]:
model.compile(optimizer='adam', loss='mse')

In [155]:
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 publisher_input (InputLayer)   [(None, 1)]          0           []                               
                                                                                                  
 categories_input (InputLayer)  [(None, 1)]          0           []                               
                                                                                                  
 publisher_embedding (Embedding  (None, 1, 32)       406848      ['publisher_input[0][0]']        
 )                                                                                                
                                                                                                  
 categories_embedding (Embeddin  (None, 1, 32)       3200        ['categories_input[0][0]'] 

In [156]:
X_train_cat_list = [X_train_cat[:, i] for i in range(X_train_cat.shape[1])]
X_val_cat_list = [X_val_cat[:, i] for i in range(X_val_cat.shape[1])]

In [157]:

#using early stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',  
    patience=10,         
    mode='min',          
    verbose=1,        
    restore_best_weights=True  
)


In [158]:
history = model.fit(
    X_train_cat_list + [X_train_num], y_train,
    validation_data=(X_val_cat_list + [X_val_num], y_val),
    epochs=100,
    batch_size=64,
    verbose=1,
    callbacks=[early_stopping] 
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: early stopping


In [160]:
val_loss = model.evaluate(X_val_cat_list + [X_val_num], y_val)
print(f'Validation Loss (MSE): {val_loss:.4f}')

Validation Loss (MSE): 3912.1702


In [161]:
predictions = model.predict(X_val_cat_list + [X_val_num])



In [162]:
mape = np.mean(np.abs((y_val - predictions.flatten()) / y_val)) * 100
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}%')

Mean Absolute Percentage Error (MAPE): 5.87%
