In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input, Concatenate
import string

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd





In [2]:
filepath = "review_sentiments_dataset.csv"
df = pd.read_csv(filepath)
df = df.drop(['category'], axis=1)
df['processed_text'] = df['text_'].apply(lambda x: x.lower())
df['review_length'] = df['text_'].apply(lambda x: len(x))
df['punctuation_count'] = df['text_'].apply(lambda x: sum(1 for char in x if char in string.punctuation))

In [3]:
X_text = df['processed_text'].values
X_params = df[['neg','neu','pos','compound','review_length', 'punctuation_count']].values
y = df['label'].values

In [4]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [5]:
X_train_text, X_test_text, X_train_params, X_test_params, y_train, y_test = train_test_split(X_text,X_params, y, test_size=0.2, random_state=42)

In [6]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train_text)
X_train_text_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_text_seq = tokenizer.texts_to_sequences(X_test_text)
X_train_text_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_text_seq)
X_test_text_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_text_seq, maxlen=X_train_text_pad.shape[1])

In [7]:
import json
tokenizer_config = tokenizer.get_config()
with open('tokenizer_config.json', 'w') as json_file:
    json.dump(tokenizer_config, json_file)

In [8]:
text_input = Input(shape=(X_train_text_pad.shape[1],))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100)(text_input)
flattened_text = Flatten()(embedding_layer)
params_input = Input(shape=(X_train_params.shape[1],))
concatenated = Concatenate()([flattened_text, params_input])
dense_layer = Dense(64, activation='relu')(concatenated)
output_layer = Dense(1, activation='sigmoid')(dense_layer)




In [9]:
model = tf.keras.Model(inputs=[text_input, params_input], outputs=output_layer)

In [10]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [11]:
model.fit([X_train_text_pad, X_train_params], y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x18c9a1e0f50>

In [12]:
model = tf.keras.models.load_model('multi_input_fake_reviews_model.h5')

In [13]:
print(X_test_text_pad)
print(X_test_params)

[[    0     0     0 ...    57     8   136]
 [    0     0     0 ...   150    11    27]
 [    0     0     0 ...     1   154    24]
 ...
 [    0     0     0 ...   282     6   982]
 [    0     0     0 ...    19    33 28822]
 [    0     0     0 ...  1048     2   210]]
[[1.000e-02 5.690e-01 4.200e-01 9.989e-01 9.360e+02 2.000e+01]
 [0.000e+00 7.610e-01 2.390e-01 9.670e-01 3.700e+02 9.000e+00]
 [0.000e+00 8.030e-01 1.970e-01 7.960e-01 1.810e+02 5.000e+00]
 ...
 [0.000e+00 7.340e-01 2.660e-01 7.003e-01 8.500e+01 2.000e+00]
 [5.800e-02 6.840e-01 2.580e-01 7.825e-01 1.200e+02 1.000e+00]
 [3.500e-02 8.340e-01 1.300e-01 8.876e-01 4.980e+02 1.000e+01]]


In [14]:
loss, accuracy = model.evaluate([X_test_text_pad, X_test_params], y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9207369685173035


In [15]:
# Save the model to a file
model.save('multi_input_fake_reviews_model.h5')

  saving_api.save_model(


In [16]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 533)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 533, 100)             3990700   ['input_3[0][0]']             
                                                                                                  
 flatten_1 (Flatten)         (None, 53300)                0         ['embedding_1[0][0]']         
                                                                                                  
 input_4 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                            