In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
# Forward slashes are accepted by Python's file APIs on Windows as well as on
# POSIX systems, so these relative paths resolve on either platform (the
# previous backslash-separated form only worked on Windows).
TRAIN_CSV = '../raw_data/raw_data/fulltrain.csv'
TEST_CSV = '../raw_data/raw_data/balancedtest.csv'

# Both files are read as header-less CSVs; the first column is named 'label'
# and the second 'text'.
train_data = pd.read_csv(TRAIN_CSV, header=None, names=['label', 'text'])
test_data = pd.read_csv(TEST_CSV, header=None, names=['label', 'text'])

# to_numpy() always yields a NumPy array, whereas .values may return a pandas
# extension array depending on the column dtype.
X_train = train_data['text'].to_numpy()
y_train = train_data['label'].to_numpy()
X_test = test_data['text'].to_numpy()
y_test = test_data['label'].to_numpy()

# Encode labels
# The encoder is fitted on the training labels only; the test labels are mapped
# with the same fitted encoder so both splits share one integer coding.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Pin the one-hot width to the number of classes the encoder learned. Without
# num_classes, to_categorical infers the width from the largest value present
# in each array, so a split that lacks the highest-index class would produce a
# narrower matrix than the model's softmax output.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

# Build TF-IDF features: fit the vectorizer on the training texts, then apply
# the fitted vocabulary to the test texts.
def _identity(doc):
    """Return *doc* unchanged (used to bypass preprocessing and tokenizing)."""
    return doc


# NOTE(review): the identity tokenizer hands each document to the n-gram step
# as-is. If the 'text' column holds plain strings (which read_csv would
# normally produce), iterating a string yields single characters, so the
# features would be character uni-/bi-grams rather than words -- confirm that
# this is intended, or that the texts are pre-tokenized.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=_identity,
    analyzer='word',
    preprocessor=_identity,
    max_features=10000,  # upper bound on the vocabulary size
    ngram_range=(1, 2),
)

# Dense arrays are produced because they are passed directly to the Keras model.
train_tfidf_sparse = tfidf_vectorizer.fit_transform(X_train)
X_train_tfidf = train_tfidf_sparse.toarray()
test_tfidf_sparse = tfidf_vectorizer.transform(X_test)
X_test_tfidf = test_tfidf_sparse.toarray()

# Define a model suitable for TF-IDF input
# Take the input width from the feature matrix itself instead of hard-coding
# it: the vectorizer's max_features is only an upper bound, so the fitted
# vocabulary size depends on the data and a fixed number breaks as soon as the
# data or vectorizer settings change.
n_features = X_train_tfidf.shape[1]

model_tfidf = Sequential()
model_tfidf.add(Dense(512, activation='relu', input_dim=n_features))
# One softmax output unit per class known to the label encoder.
model_tfidf.add(Dense(units=len(label_encoder.classes_), activation='softmax'))

# Compile the model
# categorical_crossentropy matches the one-hot encoded targets.
model_tfidf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is used as validation data here, so the
# validation metrics reported during training are test-set metrics; consider
# holding out part of the training data if the test set must stay unseen.
model_tfidf.fit(
    X_train_tfidf,
    y_train_categorical,
    epochs=5,
    validation_data=(X_test_tfidf, y_test_categorical),
    batch_size=32,
)
model_tfidf.summary()