In [None]:
import nltk
import numpy as np
from datasets import load_dataset, Dataset
from nltk.tokenize import word_tokenize
from scipy import sparse
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

# Tokenizer models needed by word_tokenize. Recent NLTK releases look up
# 'punkt_tab' rather than 'punkt', so fetch both to work across versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load datasets
imdb = load_dataset('stanfordnlp/imdb')
sst2 = load_dataset('glue', 'sst2')

# Separate train and test splits
imdbTr = imdb['train']
imdbTe = imdb['test']
sst2Tr = sst2['train']
# The SST-2 'validation' split is used as the held-out set
# (presumably because the GLUE test split has no public labels -- confirm).
sst2Te = sst2['validation']


def _as_text_label(split, text_key):
    """Project every example of `split` onto the shared {'text', 'label'} schema.

    `text_key` names the column that holds the raw text in that split
    ('text' for IMDB, 'sentence' for SST-2).
    """
    return [{'text': example[text_key], 'label': example['label']} for example in split]


# Combine datasets.
# Dataset.from_list takes its column names from the first record, so every
# record must share the same keys. Without this normalisation the SST-2 rows
# (keyed by 'sentence') would end up with text=None once mixed with IMDB rows.
trainList = _as_text_label(imdbTr, 'text') + _as_text_label(sst2Tr, 'sentence')
testList = _as_text_label(imdbTe, 'text') + _as_text_label(sst2Te, 'sentence')

train = Dataset.from_list(trainList)
test = Dataset.from_list(testList)

In [None]:
# Token-count threshold above which a text is flagged as "long".
sth = 100

def add_is_long(example, threshold=sth):
    """Add a binary 'is_long' field to `example` and return it.

    The text is read from 'text', falling back to 'sentence', and finally to
    an empty string. A key that is present but holds None is treated the same
    as a missing key, so a None text is never tokenised as the word "None".

    Args:
        example: dict-like record; mutated in place.
        threshold: number of tokens a text must exceed to count as long.

    Returns:
        The same `example`, with example['is_long'] set to 1 or 0.
    """
    text = example.get('text')
    if text is None:
        text = example.get('sentence')
    if text is None:
        text = ''
    tokens = word_tokenize(str(text))
    example['is_long'] = int(len(tokens) > threshold)
    return example

# Annotate every record with the is_long flag, then materialise each
# dataset back into a plain list of dict records for index-based access.
train = train.map(add_is_long)
trainList = list(train)

test = test.map(add_is_long)
testList = list(test)

In [None]:
def encodeDataTFIDF(data, vectorizer):
    """Turn records into a feature matrix and a label list.

    Args:
        data: iterable of records, each with 'text' and 'label' entries.
        vectorizer: object exposing transform(texts); it is only applied
            here, never fitted.

    Returns:
        (X, labels): the result of vectorizer.transform on the texts, and the
        list of labels in the same record order.
    """
    def _column(key):
        # Collect one field across all records, preserving order.
        return [record[key] for record in data]

    documents = _column('text')
    targets = _column('label')
    return vectorizer.transform(documents), targets

# TF-IDF over NLTK word tokens, capped at the 10,000 most frequent terms.
# token_pattern=None states explicitly that the regex tokenizer is unused;
# leaving the default in place alongside a custom `tokenizer` triggers a
# UserWarning about the ignored parameter.
vectorizer = TfidfVectorizer(max_features=10000, tokenizer=word_tokenize, token_pattern=None)

# Fit the vocabulary and IDF weights on the training texts only.
train_texts = [example['text'] for example in trainList]
vectorizer.fit(train_texts)

In [None]:
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracies = []

# Vectorise the whole training set once and slice rows per fold. The
# vectorizer is already fitted, so transforming per fold would give the same
# rows while repeating the tokenisation work for every fold.
# NOTE(review): the vectorizer was fitted on all of trainList before the
# split, so each fold's vocabulary/IDF weights have seen its validation texts.
# Refit per fold if strictly leak-free estimates are required.
X_all, y_all = encodeDataTFIDF(trainList, vectorizer)
y_all = np.asarray(y_all)

# Append is_long as one extra feature column. The TF-IDF matrix is sparse,
# so it must be stacked with scipy.sparse.hstack; np.hstack cannot combine a
# sparse matrix with a dense column.
is_long_all = np.array([example['is_long'] for example in trainList]).reshape(-1, 1)
X_all = sparse.hstack([X_all, sparse.csr_matrix(is_long_all)], format='csr')

for fold, (train_index, test_index) in enumerate(kf.split(trainList)):
    X_train_fold, y_train_fold = X_all[train_index], y_all[train_index]
    X_val_fold, y_val_fold = X_all[test_index], y_all[test_index]

    # AdaBoost over depth-1 decision trees. The weak learner is passed
    # positionally because its keyword was renamed from `base_estimator` to
    # `estimator` across scikit-learn releases; the position is the same in both.
    adaBoost = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1, random_state=42),
        n_estimators=50,
        random_state=42,
    )
    adaBoost.fit(X_train_fold, y_train_fold)

    # Predict on validation fold
    final_predictions = adaBoost.predict(X_val_fold)

    # Despite the name, this is the accuracy on the validation fold.
    test_accuracy = accuracy_score(y_val_fold, final_predictions)
    accuracies.append(test_accuracy)

    print(f"Fold {fold + 1} - Accuracy: {test_accuracy:.4f}")


In [None]:
# Mean of the per-fold validation accuracies collected during cross-validation.
avg_accuracy = np.asarray(accuracies).mean()
print(f"Average Accuracy: {avg_accuracy:.4f}")
