In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the data
train_data = pd.read_csv('../data/clean/train_data.csv')
test_data = pd.read_csv('../data/clean/test_data.csv')

In [3]:
test_data=pd.concat([train_data, test_data], ignore_index=True)
test_data.head()

Unnamed: 0,name,IBAN,amount,reference,day,month,year
0,2256,2231,474.96,8058430010276,27,5,2024
1,432,1657,929.15,2745309679523,19,10,2021
2,4044,908,422.29,5433080097489,6,11,2023
3,974,4631,986.96,5441464888189,23,5,2020
4,968,3626,740.15,7705305742367,14,9,2022


In [4]:
len(test_data)

5400

In [5]:
# Separate features and labels
X_train = train_data.drop(columns=['reference'])
y_train = train_data['reference']
X_test, X_val, y_test, y_val = train_test_split(test_data.drop(columns=['reference']),
                                                test_data['reference'],
                                                test_size=0.1,
                                                random_state=42)

In [6]:
# Label encode the target labels
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_val_encoded = encoder.transform(y_val)
y_test_encoded = encoder.transform(y_test)

In [7]:
print(len(y_train))
print(len(y_train_encoded))
print(np.max(y_train_encoded))
print(np.max(y_train))
print(len(encoder.classes_))

5013
5013
5012
wife rate stock
5013


In [11]:
print(X_test.loc[3])

name       974.00
IBAN      4631.00
amount     986.96
day         23.00
month        5.00
year      2020.00
Name: 3, dtype: float64


In [12]:
print(X_test.loc[3].shape)

(6,)


In [17]:
print(y_test[3])

5441464888189


In [42]:
# Define the neural network model
model = Sequential([
    Dense(1024, input_dim=X_train.shape[1], activation='relu'),
    Dense(16, activation='relu'),
    Dense(len(encoder.classes_), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [43]:
# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [46]:
# Train the model
history = model.fit(X_train, y_train_encoded, epochs=50, batch_size=2048)

Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 2.8285e-04 - loss: 8.5198
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 2.8285e-04 - loss: 8.5198
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - accuracy: 2.8285e-04 - loss: 8.5199
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 9.9741e-05 - loss: 8.5198
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 9.9741e-05 - loss: 8.5200
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 1.6078e-04 - loss: 8.5198
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step - accuracy: 1.6078e-04 - loss: 8.5198
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 2.8285e-04 - loss: 8.5199
Epoch 9/50
[1m3/3[0m [3

KeyboardInterrupt: 

In [22]:
# Evaluate the model on test set
loss, accuracy = model.evaluate(X_test, y_test_encoded)
print(f'Test Accuracy: {accuracy*100:.2f}%')

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0262 - loss: 11.0986 
Test Accuracy: 2.11%


In [23]:
# Make predictions
predictions = model.predict(X_test)
predicted_classes = encoder.inverse_transform(predictions.argmax(axis=-1))

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


In [24]:
# Add predictions to the test dataframe
X_test['predicted_reference'] = predicted_classes

# Save the test data with predictions
X_test.to_csv('test_with_predictions.csv', index=False)