In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by clean_text().
# 'punkt_tab' is fetched alongside the legacy 'punkt' package because recent
# NLTK releases make word_tokenize() look up 'punkt_tab' instead; having both
# keeps the notebook working across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)



[nltk_data] Downloading package punkt to /home/pettugadi-
[nltk_data]     pranav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pettugadi-
[nltk_data]     pranav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pettugadi-
[nltk_data]     pranav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def preprocess_dataset1(file_path):
    """Load the articles CSV and derive cleaned text plus a single label column.

    The file is read with ``pd.read_csv``. ``title`` and ``abstract`` are
    joined into a ``text`` column, ``clean_text`` is applied to produce
    ``cleaned_text``, and the per-subject indicator columns are collapsed
    into one ``label`` column holding the name of the column with the
    largest value in each row.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of a CSV
            containing ``title``, ``abstract`` and the indicator columns
            listed in ``label_columns`` below.

    Returns:
        pandas.DataFrame: the loaded frame with ``text``, ``cleaned_text``
        and ``label`` added and the indicator columns removed.

    Raises:
        KeyError: if any of the required columns is missing from the CSV.
    """
    # Load data
    df = pd.read_csv(file_path)

    # Combine title and abstract. A missing title or abstract would otherwise
    # turn the whole concatenation into NaN and make clean_text() fail on
    # `.lower()`, so missing pieces are treated as empty strings.
    title = df['title'].fillna('').astype(str)
    abstract = df['abstract'].fillna('').astype(str)
    df['text'] = title + ' ' + abstract

    # Clean and preprocess text
    df['cleaned_text'] = df['text'].apply(clean_text)

    # Collapse the indicator columns into one label: the name of the column
    # holding the row maximum. The previous `.str.replace('label', '')` was
    # dropped because none of these column names contains 'label'.
    # NOTE(review): idxmax returns the first column on ties, so a row with
    # several 1s keeps only its first subject and a row with all zeros is
    # labelled 'Computer Science' -- confirm this matches the dataset.
    label_columns = ['Computer Science', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
    df['label'] = df[label_columns].idxmax(axis=1)

    # Drop the original label columns
    df = df.drop(columns=label_columns)

    return df

In [None]:
def load_and_preprocess_data(df, dataset_num):
    """Build the model inputs (cleaned texts) and targets (labels) from ``df``.

    ``df`` is modified in place: its ``text`` and ``cleaned_text`` columns are
    (re)computed from ``title`` and ``abstract``.

    Args:
        df: DataFrame with ``title``, ``abstract`` and ``label`` columns.
        dataset_num: Accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        tuple[list, list]: ``(X, y)`` where ``X`` is the list of cleaned text
        strings and ``y`` is the list of values from the ``label`` column.

    Raises:
        KeyError: if ``title``, ``abstract`` or ``label`` is missing.
    """
    # Combine title and abstract. Missing values are replaced by empty
    # strings so one NaN field does not turn the whole text into NaN and
    # crash clean_text().
    title = df['title'].fillna('').astype(str)
    abstract = df['abstract'].fillna('').astype(str)
    df['text'] = title + ' ' + abstract

    # Clean and preprocess text
    df['cleaned_text'] = df['text'].apply(clean_text)

    # Extract features (X)
    X = df['cleaned_text'].tolist()

    # Extract labels (y)
    y = df['label'].tolist()
    return X, y

# Lazily filled cache so the stopword corpus is read and the lemmatizer is
# constructed once, not once per document.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached (stop word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise a raw text string into space-separated lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop English stopwords,
    lemmatize each remaining token with ``WordNetLemmatizer`` and join the
    result with single spaces.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            pandas NaN) is treated as empty text.

    Returns:
        str: the cleaned text; ``''`` when the input is not a string or
        contains no letters.
    """
    # Missing values reach this function as float NaN / None via
    # DataFrame.apply; they have no `.lower()`, so handle them up front.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase, then remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Nothing left to tokenize: same result as the full pipeline, without
    # touching the NLTK resources.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _clean_text_resources()

    # Tokenize, remove stopwords, then lemmatize the surviving tokens
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    # Join tokens back into string
    return ' '.join(tokens)

def prepare_sequences(texts, max_words, max_len):
    """Turn texts into padded integer sequences using a freshly fitted Tokenizer.

    Args:
        texts: The texts to fit the tokenizer on and to convert.
        max_words: Passed to ``Tokenizer`` as ``num_words``.
        max_len: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        tuple: ``(padded_sequences, tokenizer)`` -- the output of
        ``pad_sequences`` and the fitted ``Tokenizer`` instance.
    """
    fitted_tokenizer = Tokenizer(num_words=max_words)
    fitted_tokenizer.fit_on_texts(texts)

    # Convert and pad in one step; the tokenizer is returned so the same
    # vocabulary can be reused by the caller.
    padded = pad_sequences(fitted_tokenizer.texts_to_sequences(texts), maxlen=max_len)
    return padded, fitted_tokenizer

In [None]:
# 2. Model Building
def build_gru_model(max_words, max_len, embedding_dim, gru_units, num_classes):
    """Assemble and compile the two-layer GRU text classifier.

    The network is an ``Embedding`` layer, a ``GRU`` that returns full
    sequences, a second ``GRU``, and a softmax ``Dense`` output layer. It is
    compiled with the Adam optimizer, ``categorical_crossentropy`` loss and
    the accuracy metric.

    Args:
        max_words: First argument of the ``Embedding`` layer (vocabulary size).
        max_len: ``input_length`` of the ``Embedding`` layer.
        embedding_dim: Second argument of the ``Embedding`` layer.
        gru_units: Number of units in each ``GRU`` layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(max_words, embedding_dim, input_length=max_len),
        # return_sequences=True so the second GRU receives the whole sequence
        GRU(gru_units, return_sequences=True),
        GRU(gru_units),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network



In [None]:
# 3. Training
def train_model(model, X_train, y_train, epochs, batch_size):
    """Fit ``model`` with early stopping and best-model checkpointing.

    20% of the training data is held out by ``model.fit`` as validation data
    (``validation_split=0.2``).

    Args:
        model: The compiled model to train.
        X_train: Training inputs.
        y_train: Training targets.
        epochs: Maximum number of epochs.
        batch_size: Batch size passed to ``model.fit``.

    Returns:
        The history object returned by ``model.fit``.
    """
    # Stop after 3 epochs without improvement and roll back to the best weights
    early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
    # Keep only the best model seen so far on disk
    checkpoint = ModelCheckpoint('best_gru_model.h5', save_best_only=True)

    return model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=[early_stopping, checkpoint],
    )



In [None]:
# 4. Evaluation
def evaluate_model(model, X_test, y_test):
    """Evaluate ``model`` on the test set and print per-class diagnostics.

    Prints the classification report followed by the confusion matrix.

    Args:
        model: Trained model exposing ``evaluate`` and ``predict``.
        X_test: Test inputs.
        y_test: Test targets; ``np.argmax(..., axis=1)`` is applied to them,
            so they must be 2-D (one row per sample).

    Returns:
        The second value returned by ``model.evaluate`` (the accuracy).
    """
    # model.evaluate yields (loss, accuracy); only the accuracy is reported
    _, accuracy = model.evaluate(X_test, y_test)

    # Reduce predicted scores and targets to class indices
    predicted_classes = np.argmax(model.predict(X_test), axis=1)
    true_classes = np.argmax(y_test, axis=1)

    for summarise in (classification_report, confusion_matrix):
        print(summarise(true_classes, predicted_classes))

    return accuracy



In [None]:
# Main execution
if __name__ == "__main__":
    # Load the CSV and derive the single 'label' column
    df = preprocess_dataset1('path_to_dataset.csv')
    # Load and preprocess data
    X, y = load_and_preprocess_data(df, dataset_num=1)

    # Prepare sequences
    # NOTE(review): the tokenizer is fitted on all texts before the
    # train/test split, so the vocabulary also sees the test documents.
    X_padded, tokenizer = prepare_sequences(X, max_words=10000, max_len=200)

    # Encode labels as integers, then one-hot encode them.
    # build_gru_model() compiles with categorical_crossentropy and
    # evaluate_model() calls np.argmax(y_test, axis=1); both need 2-D one-hot
    # targets, and the 1-D output of LabelEncoder would fail in both places.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    num_classes = len(label_encoder.classes_)
    y_onehot = np.eye(num_classes, dtype='float32')[y_encoded]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_padded, y_onehot, test_size=0.2, random_state=42)

    # Build model
    model = build_gru_model(max_words=10000, max_len=200, embedding_dim=128, gru_units=64, num_classes=num_classes)

    # Train model
    history = train_model(model, X_train, y_train, epochs=10, batch_size=32)

    # Evaluate model
    accuracy = evaluate_model(model, X_test, y_test)

    print(f"Final accuracy: {accuracy}")

    # 5. Analysis and Comparison
    # Document your findings and prepare for comparison with other models
