# Data Science Assignement Part 2 B1

#### Before implementing the code, it is necessary to install and import all required dependencies.

In [None]:
#Download and install the necessary packages
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install datasets
%pip install scikit-learn
%pip install gensim
%pip install sentence-transformers

In [None]:
#Import the necessary packages
import numpy as np
from datasets import load_dataset
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score,classification_report
from sklearn.linear_model import SGDClassifier
import time

### Having installed and imported all the necessary dependencies, we can now proceed to load our data and create our dataframes

In [5]:
#Function to create the dataset from the hugging face data
def create_dataset(ds):
    return {
        "train": ds["train"].to_pandas(),
        "val": ds["validation"].to_pandas(),
        "test": ds["test"].to_pandas()
    }

#Function to extract the data to X(text to classify) and y(label)
def extract_data(df_split):
    return df_split["text"], df_split["label"]

#Load the data for every label
legal_volume = load_dataset("AI-team-UoA/greek_legal_code", name="volume")
legal_chapter = load_dataset("AI-team-UoA/greek_legal_code", name="chapter")
legal_subject = load_dataset("AI-team-UoA/greek_legal_code", name="subject")

#Create the dataframes for every label
df_volume = create_dataset(legal_volume)
df_chapter = create_dataset(legal_chapter)
df_subject = create_dataset(legal_subject)

#Extract the X(text to classify) and y(label) for training,validation and test set for every label 
X_train_volume, y_train_volume = extract_data(df_volume["train"])
X_val_volume, y_val_volume = extract_data(df_volume["val"])
X_test_volume, y_test_volume = extract_data(df_volume["test"])

X_train_chapter, y_train_chapter = extract_data(df_chapter["train"])
X_val_chapter, y_val_chapter = extract_data(df_chapter["val"])
X_test_chapter, y_test_chapter = extract_data(df_chapter["test"])

X_train_subject, y_train_subject = extract_data(df_subject["train"])
X_val_subject, y_val_subject = extract_data(df_subject["val"])
X_test_subject, y_test_subject = extract_data(df_subject["test"])

### Reduce the sets size by a small factor

In [6]:
sample_factor = 1 # We will use the whole dataset
sample_train = min(int(len(X_train_subject) * sample_factor), len(X_train_volume))
sample_val = min(int(len(X_val_subject) * sample_factor), len(X_val_volume))
sample_test = min(int(len(X_test_subject) * sample_factor), len(X_test_volume))

### Now, let's take a peek at the data

In [5]:
# Sample data and class counts
# Volume-level
print("--- Sample Volume-level Data ---")
display(df_volume["train"].sample(5))
print(f"Number of unique volume classes in train: {y_train_volume.nunique()}\n")

# Chapter-level
print("--- Sample Chapter-level Data ---")
display(df_chapter["train"].sample(5))
print(f"Number of unique chapter classes in train: {y_train_chapter.nunique()}\n")

# Subject-level
print("--- Sample Subject-level Data ---")
display(df_subject["train"].sample(5))
print(f"Number of unique subject classes in train: {y_train_subject.nunique()}")

--- Sample Volume-level Data ---


Unnamed: 0,text,label
16705,"22. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΟΥ ΥΓΕΙΑΣ, ΠΡΟΝΟΙΑΣ ΚΑΙ ΚΟΙΝ...",19
440,62. ΑΠΟΦΑΣΗ ΠΡΟΕΔΡΟΥ ΤΗΣ ΒΟΥΛΗΣ ΤΩΝ ΕΛΛΗΝΩΝ τη...,46
24894,128. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΟΥ ΟΙΚΟΝΟΜΙΚΩΝ Αριθ. Σ. 35...,13
8552,28. ΒΑΣΙΛΙΚΟΝ ΔΙΑΤΑΓΜΑ υπ’ αριθ. 722 της 10/29...,0
28063,20. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΟΥ ΚΟΙΝ. ΑΣΦΑΛΙΣΕΩΝ Αριθ. 12...,19


Number of unique volume classes in train: 47

--- Sample Chapter-level Data ---


Unnamed: 0,text,label
3934,1. ΝΟΜΟΘΕΤ. ΔΙΑΤΑΓΜΑ της 30 Απρ. /21 Μαΐου 192...,311
13739,1. ΑΝΑΓΚ. ΝΟΜΟΣ υπ’ αριθ. 2116 της 28/30 Νοεμ....,271
21161,96. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΟΥ ΟΙΚΟΝΟΜΙΚΩΝ Αριθ. Ε.14950...,360
25869,1. ΝΟΜΟΘΕΤ. ΔΙΑΤΑΓΜΑ υπ' αριθ. 180 της 30/30 Α...,218
9241,Άρθρον 1987 Επεξεργασία ή μετάπλασις Εάν ο δι...,381


Number of unique chapter classes in train: 386

--- Sample Subject-level Data ---


Unnamed: 0,text,label
28270,54. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΟΥ ΕΜΠΟΡΙΟΥ Αριθ. ΟΙΚ. Ε3/22...,1649
6677,26. ΑΠΟΦΑΣΙΣ ΥΠΟΥΡΓΩΝ ΕΘΝΙΚΗΣ ΟΙΚΟΝΟΜΙΑΣ ΚΑΙ Ο...,1911
764,24. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΩΝ ΟΙΚΟΝΟΜΙΚΩΝ ΚΑΙ ΕΡΓΑΣΙΑΣ...,381
28197,7. ΝΟΜΟΣ υπ’ αριθ. 756 της 10/16 Αύγ. 1948 (ΦΕ...,1022
27606,19. ΝΟΜΟΣ 5995 της 17/19 Ιαν. 1934 Περί κυρώσε...,1874


Number of unique subject classes in train: 2143


### We are now moving on to implement our models for each label

#### First, we will use Support Vector Machines (SVM) with Bag-of-Words (BoW) and TF-IDF representation

In [6]:
#Function to run the SVM model with the following configuration
def run_SVM(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    vectorizer_type='tfidf',
    label_name='Label',
    sample_train=sample_train,  #Use this sample of the data by default to train the model
    sample_val=sample_val, 
    sample_test=sample_test,
    max_features=15000,
    param = None, #If this parameter is None use the param_grid to tune hyperparameters
    param_grid={'svm__C': [100, 500, 1000]}
):
    print(f"\n===== Running SVM with {vectorizer_type.upper()} for {label_name.upper()} =====")

    # Sample subsets
    X_train = X_train.sample(n=sample_train, random_state=42)
    y_train = y_train.loc[X_train.index]

    X_val = X_val.sample(n=sample_val, random_state=42)
    y_val = y_val.loc[X_val.index]

    X_test = X_test.sample(n=sample_test, random_state=42)
    y_test = y_test.loc[X_test.index]

    vectorizer = CountVectorizer(max_features=max_features) if vectorizer_type == 'bow' else TfidfVectorizer(max_features=max_features)

    # Hyperparameter tuning
    best_score = -1
    best_alpha = None
    best_c = 0

    if param is not None:
        best_alpha = 1.0 / param
        print(f"Training the SVM for: C={param} alpha={best_alpha:.3f})")
    else:
        print("Tuning hyperparameter C (via alpha=1/C) using validation set:")
        for C in param_grid['svm__C']:
            alpha = 1.0 / C
            model = make_pipeline(
                vectorizer,
                SGDClassifier(loss='hinge', alpha=alpha, max_iter=1000, tol=1e-3, random_state=42)
            )
            start = time.time()
            model.fit(X_train, y_train)
            y_val_pred = model.predict(X_val)
            score = f1_score(y_val, y_val_pred, average='macro', zero_division=0)
            print(f"  C={C:<5} → F1_macro={score:.4f} | Time: {time.time() - start:.2f}s")

            if score > best_score:
                best_c = C
                best_score = score
                best_alpha = alpha
        print(f"Best alpha (from C) based on validation:C={best_c} alpha={best_alpha:.3f} (F1_macro={best_score:.4f})")
    

    model = make_pipeline(
        vectorizer,
        SGDClassifier(loss='hinge', alpha=best_alpha, max_iter=1000, tol=1e-3, random_state=42)
    )
    model.fit(X_train, y_train)

    # Final evaluation
    y_pred = model.predict(X_test)
    print(f"\nFinal Evaluation on Test Set ({label_name}, {vectorizer_type.upper()}):")
    print(classification_report(y_test, y_pred, zero_division=0))

#### Run the SVM for volume label with BoW vectorizer

In [7]:
run_SVM(
    X_train_volume, y_train_volume,
    X_val_volume, y_val_volume,
    X_test_volume, y_test_volume,
    vectorizer_type='bow',
    label_name='volume'
)


===== Running SVM with BOW for VOLUME =====
Tuning hyperparameter C (via alpha=1/C) using validation set:
  C=100   → F1_macro=0.6791 | Time: 22.59s
  C=500   → F1_macro=0.7357 | Time: 24.31s
  C=1000  → F1_macro=0.7239 | Time: 25.42s
Best alpha (from C) based on validation:C=500 alpha=0.002 (F1_macro=0.7357)

Final Evaluation on Test Set (volume, BOW):
              precision    recall  f1-score   support

           0       0.73      0.72      0.72       187
           1       0.77      0.82      0.79       389
           2       0.76      0.68      0.72        78
           3       0.70      0.74      0.72       243
           4       0.82      0.77      0.79       346
           5       0.75      0.72      0.73       109
           6       0.78      0.83      0.80       225
           7       0.77      0.67      0.72       110
           8       0.79      0.64      0.71        92
           9       0.74      0.69      0.72        71
          10       0.83      0.65      0.73     

#### Run the SVM for volume label with Tf-Idf vectorizer

In [8]:
run_SVM(
    X_train_volume, y_train_volume,
    X_val_volume, y_val_volume,
    X_test_volume, y_test_volume,
    vectorizer_type='tf-idf',
    label_name='volume'
)


===== Running SVM with TF-IDF for VOLUME =====
Tuning hyperparameter C (via alpha=1/C) using validation set:
  C=100   → F1_macro=0.5874 | Time: 18.47s
  C=500   → F1_macro=0.7123 | Time: 18.29s
  C=1000  → F1_macro=0.7164 | Time: 18.28s
Best alpha (from C) based on validation:C=1000 alpha=0.001 (F1_macro=0.7164)

Final Evaluation on Test Set (volume, TF-IDF):
              precision    recall  f1-score   support

           0       0.77      0.68      0.72       187
           1       0.67      0.80      0.73       389
           2       0.83      0.58      0.68        78
           3       0.76      0.68      0.72       243
           4       0.80      0.80      0.80       346
           5       0.82      0.59      0.68       109
           6       0.87      0.72      0.79       225
           7       0.87      0.44      0.58       110
           8       0.89      0.64      0.75        92
           9       0.74      0.68      0.71        71
          10       0.87      0.41      0.

#### Run the SVM for chapter label with BoW vectorizer

In [9]:
run_SVM(
    X_train_chapter, y_train_chapter,
    X_val_chapter, y_val_chapter,
    X_test_chapter, y_test_chapter,
    vectorizer_type='bow',
    label_name='chapter'
)


===== Running SVM with BOW for CHAPTER =====
Tuning hyperparameter C (via alpha=1/C) using validation set:
  C=100   → F1_macro=0.3685 | Time: 51.09s
  C=500   → F1_macro=0.4775 | Time: 57.53s
  C=1000  → F1_macro=0.4930 | Time: 62.37s
Best alpha (from C) based on validation:C=1000 alpha=0.001 (F1_macro=0.4930)

Final Evaluation on Test Set (chapter, BOW):
              precision    recall  f1-score   support

           0       0.64      0.86      0.73        43
           1       0.50      0.41      0.45        22
           2       0.33      0.83      0.48         6
           3       0.59      0.62      0.61        16
           4       0.00      0.00      0.00         2
           5       0.80      0.64      0.71        25
           6       0.32      0.46      0.38        13
           7       0.63      0.57      0.60        30
           8       0.66      0.67      0.66        57
           9       0.40      1.00      0.57         2
          10       1.00      0.71      0.83  

#### Run the SVM for chapter label with Tf-Idf vectorizer

In [10]:
run_SVM(
    X_train_chapter, y_train_chapter,
    X_val_chapter, y_val_chapter,
    X_test_chapter, y_test_chapter,
    vectorizer_type='tf-idf',
    label_name='chapter'
)


===== Running SVM with TF-IDF for CHAPTER =====
Tuning hyperparameter C (via alpha=1/C) using validation set:
  C=100   → F1_macro=0.2553 | Time: 37.86s
  C=500   → F1_macro=0.4642 | Time: 34.60s
  C=1000  → F1_macro=0.5353 | Time: 34.62s
Best alpha (from C) based on validation:C=1000 alpha=0.001 (F1_macro=0.5353)

Final Evaluation on Test Set (chapter, TF-IDF):
              precision    recall  f1-score   support

           0       0.71      0.95      0.81        43
           1       0.71      0.55      0.62        22
           2       0.62      0.83      0.71         6
           3       0.71      0.75      0.73        16
           4       0.00      0.00      0.00         2
           5       0.84      0.84      0.84        25
           6       0.27      0.62      0.37        13
           7       0.82      0.47      0.60        30
           8       0.59      0.63      0.61        57
           9       0.40      1.00      0.57         2
          10       1.00      0.57      

#### Run the SVM for subject label with BoW vectorizer

In [11]:
run_SVM(
    X_train_subject, y_train_subject,
    X_val_subject, y_val_subject,
    X_test_subject, y_test_subject,
    vectorizer_type='bow',
    label_name='subject'
)


===== Running SVM with BOW for SUBJECT =====
Tuning hyperparameter C (via alpha=1/C) using validation set:
  C=100   → F1_macro=0.1465 | Time: 160.55s
  C=500   → F1_macro=0.2487 | Time: 184.34s
  C=1000  → F1_macro=0.2763 | Time: 196.28s
Best alpha (from C) based on validation:C=1000 alpha=0.001 (F1_macro=0.2763)

Final Evaluation on Test Set (subject, BOW):
              precision    recall  f1-score   support

           0       0.55      0.75      0.63         8
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         3
           8       0.59      0.91      0.71        11
           9       0.15      0.50      0.24         4
          10       0.08      0.33      0.13         3
          11       1.00      0.33      0.50         3
          12       0.00      0.00      0.00         1
          13       1.00      0.25      0.40         4
          14       0.00      0.00      0.00         1
          17       0.33      0.50      0.4

#### Run the SVM for subject label with Tf-Idf vectorizer

In [12]:
run_SVM(
    X_train_subject, y_train_subject,
    X_val_subject, y_val_subject,
    X_test_subject, y_test_subject,
    vectorizer_type='tf-idf',
    label_name='subject'
)


===== Running SVM with TF-IDF for SUBJECT =====
Tuning hyperparameter C (via alpha=1/C) using validation set:
  C=100   → F1_macro=0.0880 | Time: 137.91s
  C=500   → F1_macro=0.2318 | Time: 121.40s
  C=1000  → F1_macro=0.3045 | Time: 120.59s
Best alpha (from C) based on validation:C=1000 alpha=0.001 (F1_macro=0.3045)

Final Evaluation on Test Set (subject, TF-IDF):
              precision    recall  f1-score   support

           0       0.50      0.75      0.60         8
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         3
           8       0.59      0.91      0.71        11
           9       1.00      0.25      0.40         4
          10       0.00      0.00      0.00         3
          11       1.00      0.33      0.50         3
          12       0.00      0.00      0.00         1
          13       1.00      1.00      1.00         4
          14       0.00      0.00      0.00         1
          17       0.50      0.50   

### Second, we will use Logistic Regression with dense embeddings

In [13]:
#Function to run the Logistic Regression model with the following configuration
def run_Logistic_Regression(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    label_name='Label',
    vector_size=100,
    min_count=2,
    sample_train=sample_train,  #Use this sample of the data by default to train the model
    sample_val=sample_val, 
    sample_test=sample_test,
    param = None, #If this parameter is None use the C_values to tune hyperparameters
    C_values=[0.1, 1, 10],
    max_iter=2000
):
    print(f"\n===== Running Logistic Regression with WORD2VEC for {label_name.upper()} =====")

    # Sample subsets
    X_train = X_train.sample(n=sample_train, random_state=42)
    y_train = y_train.loc[X_train.index]

    X_val = X_val.sample(n=sample_val, random_state=42)
    y_val = y_val.loc[X_val.index]

    X_test = X_test.sample(n=sample_test, random_state=42)
    y_test = y_test.loc[X_test.index]

    # Train Word2Vec model
    tokenized = [text.split() for text in X_train]
    w2v_model = Word2Vec(sentences=tokenized, vector_size=vector_size, window=5, min_count=min_count, workers=4)

    # Embedding function
    def embed_doc(text):
        words = text.split()
        vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

    # Vectorize datasets
    X_train_vec = np.vstack([embed_doc(doc) for doc in X_train])
    X_val_vec = np.vstack([embed_doc(doc) for doc in X_val])
    X_test_vec = np.vstack([embed_doc(doc) for doc in X_test])

    scaler = StandardScaler()
    X_train_vec = scaler.fit_transform(X_train_vec)
    X_val_vec = scaler.transform(X_val_vec)
    X_test_vec = scaler.transform(X_test_vec)

    # Determine best C
    best_C = None
    best_score = -1

    if param is not None:
        best_C = param
        print(f"Training Logistic Regression with fixed C={best_C}")
    else:
        print("Tuning Logistic Regression hyperparameter C using validation set:")
        for C in C_values:
            model = LogisticRegression(C=C, max_iter=max_iter, solver='lbfgs')
            start = time.time()
            model.fit(X_train_vec, y_train)
            y_val_pred = model.predict(X_val_vec)
            score = f1_score(y_val, y_val_pred, average='macro', zero_division=0)
            print(f"  C={C:<5} → F1_macro={score:.4f}| Time: {time.time() - start:.2f}s")
            if score > best_score:
                best_score = score
                best_C = C
        print(f"Best C based on validation: {best_C} (F1_macro={best_score:.4f})")

    model = LogisticRegression(C=best_C, max_iter=max_iter, solver='lbfgs')
    model.fit(X_train_vec, y_train)

    # Evaluation
    y_pred = model.predict(X_test_vec)
    print(f"\nFinal Evaluation on Test Set ({label_name}, WORD2VEC):")
    print(classification_report(y_test, y_pred, zero_division=0))

#### Run the Logistic Regression for volume label

In [14]:
run_Logistic_Regression(
    X_train_volume, y_train_volume,
    X_val_volume, y_val_volume,
    X_test_volume, y_test_volume,
    label_name='volume'
)


===== Running Logistic Regression with WORD2VEC for VOLUME =====
Tuning Logistic Regression hyperparameter C using validation set:
  C=0.1   → F1_macro=0.3249| Time: 4.30s
  C=1     → F1_macro=0.3461| Time: 8.24s
  C=10    → F1_macro=0.3475| Time: 9.45s
Best C based on validation: 10 (F1_macro=0.3475)

Final Evaluation on Test Set (volume, WORD2VEC):
              precision    recall  f1-score   support

           0       0.34      0.25      0.28       187
           1       0.28      0.34      0.31       389
           2       0.27      0.12      0.16        78
           3       0.26      0.19      0.22       243
           4       0.30      0.29      0.29       346
           5       0.50      0.38      0.43       109
           6       0.36      0.31      0.33       225
           7       0.25      0.13      0.17       110
           8       0.37      0.46      0.41        92
           9       0.41      0.20      0.27        71
          10       0.40      0.34      0.37       1

#### Run the Logistic Regression for chapter label

In [15]:
run_Logistic_Regression(
    X_train_chapter, y_train_chapter,
    X_val_chapter, y_val_chapter,
    X_test_chapter, y_test_chapter,
    label_name='chapter'
)


===== Running Logistic Regression with WORD2VEC for CHAPTER =====
Tuning Logistic Regression hyperparameter C using validation set:
  C=0.1   → F1_macro=0.1779| Time: 28.56s
  C=1     → F1_macro=0.2446| Time: 48.16s
  C=10    → F1_macro=0.2495| Time: 74.66s
Best C based on validation: 10 (F1_macro=0.2495)

Final Evaluation on Test Set (chapter, WORD2VEC):
              precision    recall  f1-score   support

           0       0.20      0.14      0.16        43
           1       0.34      0.55      0.42        22
           2       0.25      0.50      0.33         6
           3       0.12      0.19      0.15        16
           4       0.00      0.00      0.00         2
           5       0.16      0.16      0.16        25
           6       0.07      0.15      0.10        13
           7       0.57      0.43      0.49        30
           8       0.27      0.32      0.29        57
           9       0.33      0.50      0.40         2
          10       0.50      0.29      0.36   

#### Run the Logistic Regression for subject label

In [16]:
run_Logistic_Regression(
    X_train_subject, y_train_subject,
    X_val_subject, y_val_subject,
    X_test_subject, y_test_subject,
    label_name='subject'
)


===== Running Logistic Regression with WORD2VEC for SUBJECT =====
Tuning Logistic Regression hyperparameter C using validation set:
  C=0.1   → F1_macro=0.0959| Time: 286.04s
  C=1     → F1_macro=0.1624| Time: 416.51s
  C=10    → F1_macro=0.1667| Time: 734.17s
Best C based on validation: 10 (F1_macro=0.1667)

Final Evaluation on Test Set (subject, WORD2VEC):
              precision    recall  f1-score   support

           0       0.33      0.25      0.29         8
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         0
           8       0.33      0.27      0.30        11
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         1
          13       0.50      0.25      0.33         4
          14       0.00      0.00      0.00

### Last, we will use Random Forest with Tf-Idf

In [8]:
#Function to run the Random Forest model with the following configuration
def run_random_forest(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    label_name='Label',
    sample_train=sample_train, #Use this sample of the data by default to train the model
    sample_val=sample_val, 
    sample_test=sample_test,
    max_features=5000,
    param=None, # If this parameter is None use the n_estimators_grid to tune hyperparameters
    n_estimators_grid=[50,100,200]
):
    print(f"\n===== Running RANDOM FOREST with TF-IDF for {label_name.upper()} =====")

    # Sample subsets
    X_train = X_train.sample(n=sample_train, random_state=42)
    y_train = y_train.loc[X_train.index]
    X_val = X_val.sample(n=sample_val, random_state=42)
    y_val = y_val.loc[X_val.index]
    X_test = X_test.sample(n=sample_test, random_state=42)
    y_test = y_test.loc[X_test.index]

    # Vectorize once
    vectorizer = TfidfVectorizer(max_features=max_features)
    vec_train = vectorizer.fit_transform(X_train)
    vec_val = vectorizer.transform(X_val)
    vec_test = vectorizer.transform(X_test)

    # Hyperparameter tuning or fixed param
    best_score = -1
    best_n = param
        
    if param is not None:
        print(f"Training Random Forest with fixed n_estimators={best_n}")
    else:
        print("Tuning n_estimators using validation set:")
        for n in n_estimators_grid:
            model = RandomForestClassifier(
                n_estimators=n,
                max_depth=20,
                max_features='sqrt',
                random_state=42,
                n_jobs=1
            )
            start = time.time()
            model.fit(vec_train, y_train)
            y_val_pred = model.predict(vec_val)
            score = f1_score(y_val, y_val_pred, average='macro', zero_division=0)
            print(f"  n_estimators={n:<5} → F1_macro={score:.4f} | Time: {time.time() - start:.2f}s")
            if score > best_score:
                best_score = score
                best_n = n
        print(f"Best n_estimators based on validation: {best_n} (F1_macro={best_score:.4f})")

    # Final training
    final_model = model = RandomForestClassifier(
        n_estimators=best_n,
        max_depth=20,
        max_features='sqrt',
        random_state=42,
        n_jobs=1
    )
    final_model.fit(vec_train, y_train)

    # Final evaluation
    y_pred = final_model.predict(vec_test)
    print(f"\nFinal Evaluation on Test Set ({label_name}, RANDOM FOREST):")
    print(classification_report(y_test, y_pred, zero_division=0))

#### Run the Random Forest for volume label

In [9]:
run_random_forest(
    X_train_volume, y_train_volume,
    X_val_volume, y_val_volume,
    X_test_volume, y_test_volume,
    label_name='volume'
)


===== Running RANDOM FOREST with TF-IDF for VOLUME =====
Tuning n_estimators using validation set:
  n_estimators=50    → F1_macro=0.3931 | Time: 11.14s
  n_estimators=100   → F1_macro=0.4017 | Time: 22.09s
  n_estimators=200   → F1_macro=0.4008 | Time: 44.19s
Best n_estimators based on validation: 100 (F1_macro=0.4017)

Final Evaluation on Test Set (volume, RANDOM FOREST):
              precision    recall  f1-score   support

           0       0.97      0.17      0.28       187
           1       0.55      0.73      0.63       389
           2       0.00      0.00      0.00        78
           3       0.66      0.33      0.44       243
           4       0.60      0.70      0.65       346
           5       0.84      0.39      0.53       109
           6       0.86      0.47      0.61       225
           7       0.86      0.05      0.10       110
           8       1.00      0.25      0.40        92
           9       0.67      0.34      0.45        71
          10       0.78    

#### Run the Random Forest for chapter label

In [10]:
run_random_forest(
    X_train_chapter, y_train_chapter,
    X_val_chapter, y_val_chapter,
    X_test_chapter, y_test_chapter,
    label_name='chapter'
)


===== Running RANDOM FOREST with TF-IDF for CHAPTER =====
Tuning n_estimators using validation set:
  n_estimators=50    → F1_macro=0.1973 | Time: 31.70s
  n_estimators=100   → F1_macro=0.2081 | Time: 63.41s
  n_estimators=200   → F1_macro=0.2101 | Time: 124.89s
Best n_estimators based on validation: 200 (F1_macro=0.2101)

Final Evaluation on Test Set (chapter, RANDOM FOREST):
              precision    recall  f1-score   support

           0       0.71      0.56      0.62        43
           1       0.33      0.14      0.19        22
           2       0.60      0.50      0.55         6
           3       0.00      0.00      0.00        16
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00        25
           6       0.38      0.23      0.29        13
           7       0.50      0.03      0.06        30
           8       0.55      0.30      0.39        57
           9       0.00      0.00      0.00         2
          10       0.00 

#### Run the Random Forest for subject label

In [11]:
run_random_forest(
    X_train_subject, y_train_subject,
    X_val_subject, y_val_subject,
    X_test_subject, y_test_subject,
    label_name='subject'
)


===== Running RANDOM FOREST with TF-IDF for SUBJECT =====
Tuning n_estimators using validation set:
  n_estimators=50    → F1_macro=0.0742 | Time: 141.85s
  n_estimators=100   → F1_macro=0.0758 | Time: 282.05s
  n_estimators=200   → F1_macro=0.0761 | Time: 565.46s
Best n_estimators based on validation: 200 (F1_macro=0.0761)

Final Evaluation on Test Set (subject, RANDOM FOREST):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        11
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         4
          14       0.00      0.00      0.00         1
          17       1.0