### Import necessary dependencies.

In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

### Step 1: Load the extracted features and labels

In [5]:
# Read the pre-extracted feature matrix from the working directory.
features = np.load(file='features.npy')

In [21]:
# The labels live in the training CSV inside the dataset folder.
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))

# The target column is 'entity_value'. Take exactly as many labels as there
# are feature rows so that the two arrays stay aligned even if the feature
# file is regenerated with a different number of samples.
# NOTE(review): this presumes features.npy was extracted from the leading rows
# of train.csv, in the same order -- confirm against the extraction step.
labels = train['entity_value'].values[:len(features)]
assert len(labels) == len(features), (
    f"train.csv provides {len(labels)} labels but features.npy has {len(features)} rows"
)

### Step 2: Split the dataset into training and validation sets

In [22]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

### Step 3: Create a simple neural network model

In [23]:
model = Sequential()

Adding dense layers:

In [24]:
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Assuming it's a binary classification

### Step 4: Compile the model

In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### Step 5: Train the model with early stopping to prevent overfitting

In [34]:
# Encode the labels as integer class ids. The encoder is fitted on the union
# of the training and validation labels so that a value which only occurs in
# the validation split does not make transform() raise on an unseen label.
all_labels = np.concatenate([y_train, y_val])

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Stop once the validation loss has not improved for 3 consecutive epochs and
# roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train_encoded,
    validation_data=(X_val, y_val_encoded),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
)

# Report the final metrics on the held-out validation split.
val_loss, val_acc = model.evaluate(X_val, y_val_encoded)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_acc}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Loss: -34971129479168.0
Validation Accuracy: 0.0055555556900799274
