In [1]:
!pip install datasets
import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset
from scipy import sparse
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

# Fetch the Punkt tokenizer data used by nltk.word_tokenize.
# Older NLTK releases load the 'punkt' resource; NLTK 3.8.2 and later load
# 'punkt_tab' instead, so both are requested to keep tokenization working on
# either side of that change. nltk.download skips packages that are already
# up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Pull both sentiment corpora from the Hugging Face hub.
imdb, sst2 = load_dataset('stanfordnlp/imdb'), load_dataset('glue', 'sst2')

# Named handles on the splits used by this notebook; the SST-2 'validation'
# split is the one bound to the "test" name here.
imdbTr, imdbTe = imdb['train'], imdb['test']
sst2Tr, sst2Te = sst2['train'], sst2['validation']

# Materialise both training splits into a single list of per-example records,
# IMDB examples first, then SST-2 examples.
trainList = [*imdbTr, *sst2Tr]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [2]:
# Token-count cutoff for the binary `is_long` feature.
sth = 30

def add_is_long(example, threshold=sth):
    """Tag a record with a binary `is_long` flag based on its token count.

    The record's text is read from the 'text' key if that key is present,
    otherwise from the 'sentence' key. The value is converted with `str`,
    tokenized with `nltk.word_tokenize`, and `is_long` is set to 1 when the
    number of tokens is strictly greater than `threshold`, else 0.

    Args:
        example: Mapping for one example; it is modified in place.
        threshold: Token count that must be exceeded for `is_long` to be 1.
            The default is the value of `sth` at definition time.

    Returns:
        The same `example` object. If it has neither a 'text' nor a
        'sentence' key it is returned unchanged, without an `is_long` key.
    """
    # Fields are tried in priority order; only the first one present is used.
    for field in ('text', 'sentence'):
        if field in example:
            n_tokens = len(nltk.word_tokenize(str(example[field])))
            example['is_long'] = int(n_tokens > threshold)
            break
    return example

# Tag every training record with its `is_long` flag (default threshold).
trainList = list(map(add_is_long, trainList))


In [3]:
# Build the training design matrix: up to 10,000 TF-IDF columns plus one
# trailing `is_long` column, and the matching label vector.
#
# NOTE: the vectorizer is fitted on every training example here, before the
# K-fold split performed in the next cell, so each validation fold has already
# contributed to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=10000)
# Records carry their text under 'text' or 'sentence'; fall back to '' if neither exists.
train_texts = [example.get('text', example.get('sentence', '')) for example in trainList]
tfidf_matrix = vectorizer.fit_transform(train_texts)

# Keep the features sparse. Calling .toarray() and np.hstack would allocate a
# dense (n_examples x 10001) float array twice over, which is several GB for
# the combined training set. CSR format supports the row indexing
# used by the cross-validation loop.
is_long_column = sparse.csr_matrix(
    np.array([example['is_long'] for example in trainList], dtype=tfidf_matrix.dtype).reshape(-1, 1)
)
X_train_tfidf = sparse.hstack([tfidf_matrix, is_long_column], format='csr')
y_train = np.array([example['label'] for example in trainList])

In [4]:
# K-fold cross-validation of an AdaBoost ensemble of depth-4 decision trees.
n_models = 55   # number of boosting rounds (n_estimators)
n_splits = 10   # number of cross-validation folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold validation metrics, appended in fold order.
accuracies = []
f1_scores = []
conf_matrices = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train_tfidf)):
    print(f"Fold {fold + 1}/{n_splits}")

    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    dt_model = DecisionTreeClassifier(max_depth=4, random_state=42)
    # The weak learner is passed positionally on purpose: the keyword was
    # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
    # old spelling was removed in 1.4, but it has always been the first
    # positional parameter, so this form works on both old and new releases.
    booster = AdaBoostClassifier(dt_model, n_estimators=n_models, random_state=42)
    booster.fit(X_train_fold, y_train_fold)

    final_preds = booster.predict(X_val_fold)

    # Despite the name, `test_accuracy` is measured on this fold's held-out
    # validation rows, not on a separate test set.
    test_accuracy = accuracy_score(y_val_fold, final_preds)
    f1 = f1_score(y_val_fold, final_preds)
    conf_matrix = confusion_matrix(y_val_fold, final_preds)

    accuracies.append(test_accuracy)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:\n", conf_matrix)

# Summarise the cross-validation run: mean of each metric over all folds.
avg_accuracy = np.asarray(accuracies).mean()
avg_f1_score = np.asarray(f1_scores).mean()
# Element-wise mean of the per-fold confusion matrices, taken along the fold axis.
avg_conf_matrix = np.asarray(conf_matrices).mean(axis=0)

print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average F1 Score: {avg_f1_score:.4f}")
print("Average Confusion Matrix:\n", avg_conf_matrix)

Fold 1/10




Accuracy: 0.7537
F1 Score: 0.7959
Confusion Matrix:
 [[2523 1726]
 [ 549 4437]]
Fold 2/10




Accuracy: 0.7553
F1 Score: 0.7961
Confusion Matrix:
 [[2562 1723]
 [ 537 4413]]
Fold 3/10




Accuracy: 0.7507
F1 Score: 0.7929
Confusion Matrix:
 [[2526 1743]
 [ 559 4407]]
Fold 4/10




Accuracy: 0.7469
F1 Score: 0.7927
Confusion Matrix:
 [[2430 1758]
 [ 579 4468]]
Fold 5/10




Accuracy: 0.7501
F1 Score: 0.7956
Confusion Matrix:
 [[2435 1741]
 [ 567 4492]]
Fold 6/10




Accuracy: 0.7503
F1 Score: 0.7908
Confusion Matrix:
 [[2571 1691]
 [ 615 4358]]
Fold 7/10




Accuracy: 0.7466
F1 Score: 0.7900
Confusion Matrix:
 [[2494 1738]
 [ 602 4401]]
Fold 8/10




Accuracy: 0.7435
F1 Score: 0.7916
Confusion Matrix:
 [[2368 1797]
 [ 572 4498]]
Fold 9/10




Accuracy: 0.7498
F1 Score: 0.7914
Confusion Matrix:
 [[2540 1686]
 [ 625 4384]]
Fold 10/10




Accuracy: 0.7534
F1 Score: 0.7946
Confusion Matrix:
 [[2552 1676]
 [ 601 4405]]
Average Accuracy: 0.7500
Average F1 Score: 0.7932
Average Confusion Matrix:
 [[2500.1 1727.9]
 [ 580.6 4426.3]]
