In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [11]:
# Load the dataset
file_path = "E:\mtech\CV\Dataset-1.xlsx"
df = pd.read_excel(file_path, sheet_name="train")

# Combine TITLE and ABSTRACT for text processing
df['text'] = df['TITLE'] + ' ' + df['ABSTRACT']

# Labels for multi-label classification
labels = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

df['text'] = df['text'].apply(clean_text)

# Tokenize text
tokenized_text = [word_tokenize(text) for text in df['text']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=2, workers=4)
vocab_size = len(word2vec_model.wv)

# Create word index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['text'])
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Prepare embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Prepare labels
y = df[labels].values  # One-hot encoded labels

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

# Convert one-hot labels to categorical labels (0-5)
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)

# Print shapes and unique values
print("X_train shape:", X_train.shape)
print("y_train shape (single-label):", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape (single-label):", y_test.shape)

print("Unique labels in y_train_single:", np.unique(y_train,return_counts=True))
print("Unique labels in y_test_single:", np.unique(y_test,return_counts=True))


X_train shape: (16777, 462)
y_train shape (single-label): (16777,)
X_test shape: (4195, 462)
y_test shape (single-label): (4195,)
Unique labels in y_train_single: (array([0, 1, 2, 3, 4, 5], dtype=int64), array([6902, 4391, 3538, 1422,  355,  169], dtype=int64))
Unique labels in y_test_single: (array([0, 1, 2, 3, 4, 5], dtype=int64), array([1692, 1130,  898,  343,   92,   40], dtype=int64))


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights to handle class imbalance
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Model training function
def build_and_train_model(model_type):
    model = Sequential()
    model.add(Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix],
                        input_length=max_length, trainable=False))

    if model_type == "LSTM":
        model.add(LSTM(128, return_sequences=False))
    elif model_type == "GRU":
        model.add(GRU(128, return_sequences=False))
    elif model_type == "RNN":
        model.add(SimpleRNN(128, return_sequences=False))

    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(6, activation='softmax'))  # 6 output classes

    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    # Train with class weights to balance dataset
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test),
              verbose=1, class_weight=class_weight_dict)
    
    return model

# Train and evaluate models
for model_type in ["LSTM", "GRU", "RNN"]:
    print(f"\nTraining {model_type} model...")
    model = build_and_train_model(model_type)
    
    y_pred = np.argmax(model.predict(X_test), axis=1)  # Get predicted class index
    print(f"\nClassification Report for {model_type}:")
    print(classification_report(y_test, y_pred, target_names=labels))


Training LSTM model...
Epoch 1/10




[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 98ms/step - accuracy: 0.2101 - loss: 1.7849 - val_accuracy: 0.0095 - val_loss: 1.8038
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 106ms/step - accuracy: 0.0907 - loss: 1.8176 - val_accuracy: 0.4033 - val_loss: 1.7853
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 124ms/step - accuracy: 0.1795 - loss: 1.7851 - val_accuracy: 0.0095 - val_loss: 1.7933
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 105ms/step - accuracy: 0.1640 - loss: 1.7475 - val_accuracy: 0.0095 - val_loss: 1.7929
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 102ms/step - accuracy: 0.0480 - loss: 1.8000 - val_accuracy: 0.0095 - val_loss: 1.7931
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 130ms/step - accuracy: 0.0335 - loss: 1.8100 - val_accuracy: 0.4033 - val_loss: 1.7900
Epoch 7/10
[1m525/525

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 128ms/step - accuracy: 0.2002 - loss: 1.7804 - val_accuracy: 0.0219 - val_loss: 1.7936
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 129ms/step - accuracy: 0.1983 - loss: 1.7790 - val_accuracy: 0.0219 - val_loss: 1.7958
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 129ms/step - accuracy: 0.1462 - loss: 1.7583 - val_accuracy: 0.0219 - val_loss: 1.7978
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 130ms/step - accuracy: 0.0476 - loss: 1.7827 - val_accuracy: 0.0219 - val_loss: 1.7949
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 130ms/step - accuracy: 0.1258 - loss: 1.7912 - val_accuracy: 0.0818 - val_loss: 1.7910
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 108ms/step - accuracy: 0.1031 - loss: 1.7881 - val_accuracy: 0.4033 - val_loss: 1.7909
Epoch 7/10
[1m525/52

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.1705 - loss: 1.8962 - val_accuracy: 0.1142 - val_loss: 1.7279
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 29ms/step - accuracy: 0.1538 - loss: 1.8087 - val_accuracy: 0.0462 - val_loss: 1.8137
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 29ms/step - accuracy: 0.1379 - loss: 1.8292 - val_accuracy: 0.1290 - val_loss: 1.7866
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 29ms/step - accuracy: 0.1082 - loss: 1.8223 - val_accuracy: 0.0095 - val_loss: 1.8136
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 29ms/step - accuracy: 0.0652 - loss: 1.7792 - val_accuracy: 0.1447 - val_loss: 1.8018
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 29ms/step - accuracy: 0.1059 - loss: 1.7627 - val_accuracy: 0.0138 - val_loss: 1.8146
Epoch 7/10
[1m525/525[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
