# installations

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# uploading data

In [6]:
import pandas as pd

# Relative path: the workbook is looked up in the kernel's current working directory.
file_path = "FilteredData100.xlsx"

# Load the workbook into a DataFrame (one row per abstract).
df = pd.read_excel(file_path)

# Quick sanity check of the columns and the first rows.
df.head()


Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


In [5]:
import os, random, numpy as np, pandas as pd, torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [7]:
# Single random seed shared by the train/validation/test split and the XGBoost models,
# so that repeated runs use the same partitioning and model initialisation.
SEED = 42

## TF-IDF With XGBoost

In [9]:
# Two-step 70/15/15 split, stratified on the coarse "Domain" label in both steps
# (the fine-grained "area" label is NOT used for stratification).

# Step 1: hold out 30% of the rows; the remaining 70% become the training set.
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["Domain"], random_state=SEED
)

# Step 2: cut the held-out 30% in half -> 15% validation, 15% test.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["Domain"], random_state=SEED
)

### On Domain and Area:

In [10]:
def run_ngram_experiment(ngram_range, train_df, val_df, test_df, label_encoder, label_column, seed=SEED):
    """Train a TF-IDF + XGBoost classifier and print its validation metrics.

    Parameters
    ----------
    ngram_range : tuple of (int, int)
        Forwarded to ``TfidfVectorizer(ngram_range=...)``.
    train_df, val_df : pandas.DataFrame
        Must provide an "Abstract" text column and the ``label_column`` column.
    test_df
        Not used by this function; kept so the call signature stays stable.
    label_encoder : sklearn.preprocessing.LabelEncoder
        Re-fitted in place on the training labels of ``label_column`` (any
        previous fit is overwritten).
    label_column : str
        Name of the target column, e.g. "Domain" or "area".
    seed : int
        ``random_state`` passed to ``XGBClassifier``.

    Returns
    -------
    None
        Accuracy and the classification report are printed.
    """
    print(f"\n=== Evaluating on label column: '{label_column}' | N-gram range: {ngram_range} ===")

    # TF-IDF vectorization: vocabulary and IDF weights come from the training text only
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=1000000)
    X_train = vectorizer.fit_transform(train_df["Abstract"])
    X_val   = vectorizer.transform(val_df["Abstract"])

    # Encode labels as integers 0..n_classes-1 (fitted on the training labels)
    y_train = label_encoder.fit_transform(train_df[label_column])
    y_val   = label_encoder.transform(val_df[label_column])

    # Train model. `use_label_encoder` is intentionally not passed: current XGBoost
    # ignores it and emits a "Parameters ... are not used" warning on every fit.
    model = XGBClassifier(random_state=seed, eval_metric="mlogloss")
    model.fit(X_train, y_train)

    # Evaluate on validation
    val_preds = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_preds)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print("Validation Classification Report:")
    # Pass the full list of encoded class ids explicitly: without `labels`,
    # classification_report raises ValueError when a training class is missing from
    # both y_val and val_preds, because target_names then has the wrong length.
    class_ids = list(range(len(label_encoder.classes_)))
    print(classification_report(y_val, val_preds, labels=class_ids, target_names=label_encoder.classes_))


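With its default settings, `TfidfVectorizer` preprocesses each abstract as follows:

- Lowercases the text.
- Tokenizes using a regular expression that keeps words of 2+ characters (`\b\w\w+\b`).
- Removes punctuation (it acts as a token separator and is discarded).
- Builds n-grams from the tokens according to `ngram_range`: unigrams for (1, 1), bigrams for (2, 2), and unigrams & bigrams for (1, 2).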

In [None]:
# One encoder object is shared by all runs; each run re-fits it on the current label column.
label_encoder = LabelEncoder()

# Every (label column, n-gram range) combination, label column varying slowest.
experiment_grid = [
    (label_col, ngram_range)
    for label_col in ["Domain", "area"]
    for ngram_range in [(1, 1), (2, 2), (1, 2)]
]

for label_col, ngram_range in experiment_grid:
    run_ngram_experiment(ngram_range, train_df, val_df, test_df, label_encoder, label_col)


=== Evaluating on label column: 'Domain' | N-gram range: (1, 1) ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.8786
Validation Classification Report:
               precision    recall  f1-score   support

          CS        0.89      0.93      0.91       971
       Civil        0.91      0.87      0.89       633
         ECE        0.92      0.90      0.91       807
         MAE        0.89      0.86      0.88       495
     Medical        0.86      0.90      0.88      2165
 Psychology         0.86      0.83      0.84      1071
biochemistry        0.87      0.83      0.85       851

     accuracy                           0.88      6993
    macro avg       0.89      0.87      0.88      6993
 weighted avg       0.88      0.88      0.88      6993


=== Evaluating on label column: 'Domain' | N-gram range: (2, 2) ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.8268
Validation Classification Report:
               precision    recall  f1-score   support

          CS        0.88      0.85      0.86       971
       Civil        0.94      0.80      0.86       633
         ECE        0.87      0.87      0.87       807
         MAE        0.91      0.77      0.83       495
     Medical        0.75      0.91      0.82      2165
 Psychology         0.83      0.71      0.77      1071
biochemistry        0.88      0.74      0.80       851

     accuracy                           0.83      6993
    macro avg       0.86      0.81      0.83      6993
 weighted avg       0.84      0.83      0.83      6993


=== Evaluating on label column: 'Domain' | N-gram range: (1, 2) ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.8899
Validation Classification Report:
               precision    recall  f1-score   support

          CS        0.92      0.93      0.92       971
       Civil        0.93      0.88      0.91       633
         ECE        0.93      0.94      0.93       807
         MAE        0.90      0.88      0.89       495
     Medical        0.86      0.91      0.89      2165
 Psychology         0.87      0.84      0.85      1071
biochemistry        0.88      0.82      0.85       851

     accuracy                           0.89      6993
    macro avg       0.90      0.89      0.89      6993
 weighted avg       0.89      0.89      0.89      6993


=== Evaluating on label column: 'area' | N-gram range: (1, 1) ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.7616
Validation Classification Report:
                                    precision    recall  f1-score   support

                       Addiction         0.60      0.50      0.55        42
                Algorithm design         0.63      0.67      0.65        57
                       Allergies         0.91      0.89      0.90        55
             Alzheimer's Disease         0.77      0.87      0.82        54
            Ambient Intelligence         0.79      0.88      0.83        56
        Analog signal processing         0.80      0.81      0.81        64
          Ankylosing Spondylitis         0.85      0.60      0.70        47
 Antisocial personality disorder         0.75      0.75      0.75        52
                         Anxiety         0.49      0.37      0.42        46
                          Asthma         0.70      0.57      0.63        49
               Atopic Dermatitis         0.81      0.50      0.62        34
             Atrial Fibri

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.7150
Validation Classification Report:
                                    precision    recall  f1-score   support

                       Addiction         0.39      0.29      0.33        42
                Algorithm design         0.79      0.74      0.76        57
                       Allergies         0.69      0.62      0.65        55
             Alzheimer's Disease         0.77      0.89      0.83        54
            Ambient Intelligence         0.60      0.71      0.65        56
        Analog signal processing         0.72      0.86      0.79        64
          Ankylosing Spondylitis         0.79      0.57      0.67        47
 Antisocial personality disorder         0.90      0.73      0.81        52
                         Anxiety         0.33      0.20      0.25        46
                          Asthma         0.62      0.53      0.57        49
               Atopic Dermatitis         0.75      0.44      0.56        34
             Atrial Fibri

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.7905
Validation Classification Report:
                                    precision    recall  f1-score   support

                       Addiction         0.64      0.55      0.59        42
                Algorithm design         0.82      0.81      0.81        57
                       Allergies         0.94      0.91      0.93        55
             Alzheimer's Disease         0.80      0.89      0.84        54
            Ambient Intelligence         0.75      0.80      0.78        56
        Analog signal processing         0.86      0.84      0.85        64
          Ankylosing Spondylitis         0.74      0.62      0.67        47
 Antisocial personality disorder         0.75      0.75      0.75        52
                         Anxiety         0.54      0.33      0.41        46
                          Asthma         0.72      0.67      0.69        49
               Atopic Dermatitis         0.62      0.44      0.52        34
             Atrial Fibri

### Best configurations:

In [11]:
def run_ngram_experiment(ngram_range, train_df, val_df, test_df, label_encoder, label_column, seed=SEED):
    """Train a TF-IDF + XGBoost classifier and print its metrics on the test split.

    Once this cell is executed, this definition replaces the earlier
    validation-oriented function of the same name.

    Parameters
    ----------
    ngram_range : tuple of (int, int)
        Forwarded to ``TfidfVectorizer(ngram_range=...)``.
    train_df : pandas.DataFrame
        Training rows; must provide an "Abstract" text column and ``label_column``.
    val_df
        Ignored by this function (callers may pass any placeholder); kept so the
        call signature matches the validation variant.
    test_df : pandas.DataFrame
        Held-out rows to evaluate on; same columns as ``train_df``.
    label_encoder : sklearn.preprocessing.LabelEncoder
        Re-fitted in place on the training labels of ``label_column`` (any
        previous fit is overwritten).
    label_column : str
        Name of the target column, e.g. "Domain" or "area".
    seed : int
        ``random_state`` passed to ``XGBClassifier``.

    Returns
    -------
    None
        Accuracy and the classification report are printed.
    """
    print(f"\n=== Evaluating on label column: '{label_column}' | N-gram range: {ngram_range} ===")

    # TF-IDF vectorization: vocabulary and IDF weights come from the training text only
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=1000000)
    X_train = vectorizer.fit_transform(train_df["Abstract"])
    X_test  = vectorizer.transform(test_df["Abstract"])

    # Encode labels as integers 0..n_classes-1 (fitted on the training labels)
    y_train = label_encoder.fit_transform(train_df[label_column])
    y_test  = label_encoder.transform(test_df[label_column])

    # Train model. `use_label_encoder` is intentionally not passed: current XGBoost
    # ignores it and emits a "Parameters ... are not used" warning on every fit.
    model = XGBClassifier(random_state=seed, eval_metric="mlogloss")
    model.fit(X_train, y_train)

    # Evaluate on test
    test_preds = model.predict(X_test)
    test_acc = accuracy_score(y_test, test_preds)
    print(f"Test Accuracy: {test_acc:.4f}")
    print("Test Classification Report:")
    # Pass the full list of encoded class ids explicitly: without `labels`,
    # classification_report raises ValueError when a training class is missing from
    # both y_test and test_preds, because target_names then has the wrong length.
    class_ids = list(range(len(label_encoder.classes_)))
    print(classification_report(y_test, test_preds, labels=class_ids, target_names=label_encoder.classes_))


In [None]:
# Final run of the best configuration found on validation: unigrams + bigrams, (1, 2).
# The model is trained on train + validation rows and scored once on the test split.
concat_train_df = pd.concat((train_df, val_df), ignore_index=True)
label_encoder = LabelEncoder()

# Domain labels. "-" is only a placeholder: the test-time function ignores val_df.
run_ngram_experiment(
    ngram_range=(1, 2),
    train_df=concat_train_df,
    val_df="-",
    test_df=test_df,
    label_encoder=label_encoder,
    label_column="Domain",
)


=== Evaluating on label column: 'Domain' | N-gram range: (1, 2) ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Accuracy: 0.8948
Test Classification Report:
               precision    recall  f1-score   support

          CS        0.92      0.94      0.93       970
       Civil        0.94      0.89      0.91       633
         ECE        0.94      0.93      0.93       807
         MAE        0.94      0.85      0.89       494
     Medical        0.86      0.92      0.89      2166
 Psychology         0.88      0.83      0.85      1072
biochemistry        0.89      0.85      0.87       851

     accuracy                           0.89      6993
    macro avg       0.91      0.89      0.90      6993
 weighted avg       0.90      0.89      0.89      6993


=== Evaluating on label column: 'area' | N-gram range: (1, 2) ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
# Final run of the best configuration found on validation: unigrams + bigrams, (1, 2).
# The model is trained on train + validation rows and scored once on the test split.
concat_train_df = pd.concat((train_df, val_df), ignore_index=True)
label_encoder = LabelEncoder()

# Area labels. "-" is only a placeholder: the test-time function ignores val_df.
run_ngram_experiment(
    ngram_range=(1, 2),
    train_df=concat_train_df,
    val_df="-",
    test_df=test_df,
    label_encoder=label_encoder,
    label_column="area",
)


=== Evaluating on label column: 'area' | N-gram range: (1, 2) ===


Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.8001
Test Classification Report:
                                    precision    recall  f1-score   support

                       Addiction         0.76      0.61      0.67        46
                Algorithm design         0.80      0.75      0.77        55
                       Allergies         1.00      0.89      0.94        53
             Alzheimer's Disease         0.75      0.80      0.77        55
            Ambient Intelligence         0.72      0.75      0.73        55
        Analog signal processing         0.81      0.85      0.83        59
          Ankylosing Spondylitis         0.67      0.60      0.63        40
 Antisocial personality disorder         0.93      0.83      0.88        47
                         Anxiety         0.45      0.48      0.47        42
                          Asthma         0.73      0.69      0.71        52
               Atopic Dermatitis         0.70      0.52      0.60        44
             Atrial Fibrillation     