In [1]:
!pip install --upgrade --force-reinstall numpy pandas scikit-learn gensim datasets nltk

Collecting numpy
  Downloading numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m646.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.met

In [15]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
from datasets import load_dataset
dataset = load_dataset("trec")  # No need for trust_remote_code
train_data = dataset['train']
test_data = dataset['test']
train_texts = train_data['text']
train_labels = train_data['coarse_label']
test_texts = test_data['text']
test_labels = test_data['coarse_label']
print("Sample Question:", train_data[0]['text'])
print("Label:", train_data[0]['coarse_label'])

Sample Question: How did serfdom develop in and then leave Russia ?
Label: 2


In [17]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
import numpy as np

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

def preprocess(text, method='none'):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [w for w in tokens if w.isalpha() and w not in stop_words]
    if method == 'porter':
        tokens = [porter.stem(w) for w in tokens]
    elif method == 'lancaster':
        tokens = [lancaster.stem(w) for w in tokens]
    elif method == 'lemmatizer':
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens


In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from gensim.models import Word2Vec, FastText

def run_vectorizer_model(preproc_method, vectorizer_type='bow', classifier_type='logreg'):
    processed_train = [preprocess(t, preproc_method) for t in train_texts]
    processed_test = [preprocess(t, preproc_method) for t in test_texts]

    if vectorizer_type == 'bow':
        vectorizer = CountVectorizer()
        X_train_vec = vectorizer.fit_transform([" ".join(t) for t in processed_train])
        X_test_vec = vectorizer.transform([" ".join(t) for t in processed_test])

    elif vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer()
        X_train_vec = vectorizer.fit_transform([" ".join(t) for t in processed_train])
        X_test_vec = vectorizer.transform([" ".join(t) for t in processed_test])

    elif vectorizer_type == 'word2vec':
        model = Word2Vec(sentences=processed_train, vector_size=100, window=5, min_count=1, workers=4)
        X_train_vec = np.array([np.mean([model.wv[w] for w in words if w in model.wv] or [np.zeros(100)], axis=0)
                                for words in processed_train])
        X_test_vec = np.array([np.mean([model.wv[w] for w in words if w in model.wv] or [np.zeros(100)], axis=0)
                               for words in processed_test])

    elif vectorizer_type == 'fasttext':
        model = FastText(sentences=processed_train, vector_size=100, window=5, min_count=1, workers=4)
        X_train_vec = np.array([np.mean([model.wv[w] for w in words if w in model.wv] or [np.zeros(100)], axis=0)
                                for words in processed_train])
        X_test_vec = np.array([np.mean([model.wv[w] for w in words if w in model.wv] or [np.zeros(100)], axis=0)
                               for words in processed_test])
    else:
        return None

    if classifier_type == 'logreg':
        model = LogisticRegression(max_iter=1000)
    elif classifier_type == 'tree':
        model = DecisionTreeClassifier()
    elif classifier_type == 'forest':
        model = RandomForestClassifier()
    elif classifier_type == 'mlp':
        model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
    else:
        return None

    model.fit(X_train_vec, train_labels)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(test_labels, preds)
    return acc


In [19]:
results = []
methods = ['none', 'porter', 'lancaster', 'lemmatizer']
vectorizers = ['bow', 'tfidf', 'word2vec', 'fasttext']
classifiers = ['logreg', 'tree', 'forest', 'mlp']

for method in methods:
    for vec in vectorizers:
        for clf in classifiers:
            try:
                acc = run_vectorizer_model(method, vec, clf)
                results.append((vec, method, clf, acc))
                print(f"{vec} + {method} + {clf} => Accuracy: {acc:.4f}")
            except Exception as e:
                print(f"Failed: {vec} + {method} + {clf} with error: {e}")

df_results = pd.DataFrame(results, columns=['Vectorizer', 'Preprocessing', 'Classifier', 'Accuracy'])
df_results.sort_values(by='Accuracy', ascending=False)

bow + none + logreg => Accuracy: 0.7560
bow + none + tree => Accuracy: 0.7280
bow + none + forest => Accuracy: 0.7260
bow + none + mlp => Accuracy: 0.7280
tfidf + none + logreg => Accuracy: 0.7600
tfidf + none + tree => Accuracy: 0.7400
tfidf + none + forest => Accuracy: 0.7320
tfidf + none + mlp => Accuracy: 0.7240
word2vec + none + logreg => Accuracy: 0.2040
word2vec + none + tree => Accuracy: 0.4060
word2vec + none + forest => Accuracy: 0.5320




word2vec + none + mlp => Accuracy: 0.5080
fasttext + none + logreg => Accuracy: 0.3500
fasttext + none + tree => Accuracy: 0.3240
fasttext + none + forest => Accuracy: 0.4720




fasttext + none + mlp => Accuracy: 0.4220
bow + porter + logreg => Accuracy: 0.7520
bow + porter + tree => Accuracy: 0.7180
bow + porter + forest => Accuracy: 0.7200
bow + porter + mlp => Accuracy: 0.6940
tfidf + porter + logreg => Accuracy: 0.7500
tfidf + porter + tree => Accuracy: 0.7080
tfidf + porter + forest => Accuracy: 0.7320
tfidf + porter + mlp => Accuracy: 0.6680
word2vec + porter + logreg => Accuracy: 0.4160
word2vec + porter + tree => Accuracy: 0.4200
word2vec + porter + forest => Accuracy: 0.5620




word2vec + porter + mlp => Accuracy: 0.5580
fasttext + porter + logreg => Accuracy: 0.3380
fasttext + porter + tree => Accuracy: 0.3420
fasttext + porter + forest => Accuracy: 0.4560




fasttext + porter + mlp => Accuracy: 0.4000
bow + lancaster + logreg => Accuracy: 0.7460
bow + lancaster + tree => Accuracy: 0.6980
bow + lancaster + forest => Accuracy: 0.6960
bow + lancaster + mlp => Accuracy: 0.6680
tfidf + lancaster + logreg => Accuracy: 0.7460
tfidf + lancaster + tree => Accuracy: 0.6920
tfidf + lancaster + forest => Accuracy: 0.7120
tfidf + lancaster + mlp => Accuracy: 0.6480
word2vec + lancaster + logreg => Accuracy: 0.4100
word2vec + lancaster + tree => Accuracy: 0.4320
word2vec + lancaster + forest => Accuracy: 0.5260




word2vec + lancaster + mlp => Accuracy: 0.5460
fasttext + lancaster + logreg => Accuracy: 0.3380
fasttext + lancaster + tree => Accuracy: 0.3200
fasttext + lancaster + forest => Accuracy: 0.4380




fasttext + lancaster + mlp => Accuracy: 0.3600
bow + lemmatizer + logreg => Accuracy: 0.7540
bow + lemmatizer + tree => Accuracy: 0.7220
bow + lemmatizer + forest => Accuracy: 0.7120
bow + lemmatizer + mlp => Accuracy: 0.7140
tfidf + lemmatizer + logreg => Accuracy: 0.7480
tfidf + lemmatizer + tree => Accuracy: 0.7260
tfidf + lemmatizer + forest => Accuracy: 0.7360
tfidf + lemmatizer + mlp => Accuracy: 0.6800
word2vec + lemmatizer + logreg => Accuracy: 0.2140
word2vec + lemmatizer + tree => Accuracy: 0.4080
word2vec + lemmatizer + forest => Accuracy: 0.5200




word2vec + lemmatizer + mlp => Accuracy: 0.5440
fasttext + lemmatizer + logreg => Accuracy: 0.3380
fasttext + lemmatizer + tree => Accuracy: 0.3180
fasttext + lemmatizer + forest => Accuracy: 0.4500
fasttext + lemmatizer + mlp => Accuracy: 0.4060




Unnamed: 0,Vectorizer,Preprocessing,Classifier,Accuracy
4,tfidf,none,logreg,0.760
0,bow,none,logreg,0.756
48,bow,lemmatizer,logreg,0.754
16,bow,porter,logreg,0.752
20,tfidf,porter,logreg,0.750
...,...,...,...,...
13,fasttext,none,tree,0.324
45,fasttext,lancaster,tree,0.320
61,fasttext,lemmatizer,tree,0.318
56,word2vec,lemmatizer,logreg,0.214
