In [1]:
# !pip install datasets

from datasets import load_dataset

# Load IMDB dataset
imdb = load_dataset("imdb")

# Check the dataset structure
print(imdb)


# Example review
print(imdb['train'][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and

In [2]:
# Train (25k), Test (25k) → split train into train+validation
imdb = imdb['train'].train_test_split(test_size=0.2, seed=42)

train_data = imdb['train']      # 20k
val_data   = imdb['test']       # 5k
test_data  = load_dataset("imdb", split="test")   # 25k

print(train_data)
print(val_data)
print(test_data)

Dataset({
    features: ['text', 'label'],
    num_rows: 20000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (only once)
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_tfidf(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)       # remove HTML
    text = re.sub(r'[^a-z\s]', '', text)    # remove punctuation/numbers
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
def preprocess_word2vec(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens   # return list of tokens (needed for Word2Vec training)


In [5]:
def preprocess_bert(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    return text.strip()


In [6]:
imdb_tfidf = {}

imdb_tfidf['train'] = train_data.map(lambda x: {"text_clean": preprocess_tfidf(x["text"])})
imdb_tfidf['val']   = val_data.map(lambda x: {"text_clean": preprocess_tfidf(x["text"])})
imdb_tfidf['test']  = test_data.map(lambda x: {"text_clean": preprocess_tfidf(x["text"])})


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [7]:
imdb_word2vec = {}

imdb_word2vec['train'] = train_data.map(lambda x: {"text_clean": preprocess_word2vec(x["text"])})
imdb_word2vec['val']   = val_data.map(lambda x: {"text_clean": preprocess_word2vec(x["text"])})
imdb_word2vec['test']  = test_data.map(lambda x: {"text_clean": preprocess_word2vec(x["text"])})


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
imdb_bert = {}

imdb_bert['train'] = train_data.map(lambda x: {"text_clean": preprocess_bert(x["text"])})
imdb_bert['val']   = val_data.map(lambda x: {"text_clean": preprocess_bert(x["text"])})
imdb_bert['test']  = test_data.map(lambda x: {"text_clean": preprocess_bert(x["text"])})


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
def preview_dataset(dataset, split="train", n=3):
    """
    Preview original and cleaned reviews from the dataset.

    Args:
        dataset: the preprocessed dataset (e.g., imdb_tfidf, imdb_w2v, imdb_bert)
        split: "train", "test", or "validation"
        n: number of samples to preview
    """
    for i in range(n):
        print(f"\nReview {i+1}:")
        print("Original:", dataset[split][i]['text'][:300])  # first 300 chars
        print("Cleaned :", dataset[split][i]['text_clean'][:300])
        print("Label   :", "Positive" if dataset[split][i]['label'] == 1 else "Negative")


In [None]:
preview_dataset(imdb_tfidf, split="train", n=5)


Review 1:
Original: Stage adaptations often have a major fault. They often come out looking like a film camera was simply placed on the stage (Such as "Night Mother"). Sidney Lumet's direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. Still, it's nice to look a
Cleaned : stage adaptation often major fault often come looking like film camera simply placed stage night mother sidney lumets direction keep film alive especially difficult since picture offered real challenge still nice look chemistry michael caine christopher reeve quite brilliant dynamic relationship sur
Label   : Positive

Review 2:
Original: 'The Rookie' was a wonderful movie about the second chances life holds for us and also puts an emotional thought over the audience, making them realize that your dreams can come true. If you loved 'Remember the Titans', 'The Rookie' is the movie for you!! It's the feel good movie of the year and it 
Cleaned : rookie wonderful

In [None]:
preview_dataset(imdb_word2vec, split="train", n=5)


Review 1:
Original: Stage adaptations often have a major fault. They often come out looking like a film camera was simply placed on the stage (Such as "Night Mother"). Sidney Lumet's direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. Still, it's nice to look a
Cleaned : ['stage', 'adaptation', 'often', 'have', 'a', 'major', 'fault', 'they', 'often', 'come', 'out', 'looking', 'like', 'a', 'film', 'camera', 'wa', 'simply', 'placed', 'on', 'the', 'stage', 'such', 'a', 'night', 'mother', 'sidney', 'lumets', 'direction', 'keep', 'the', 'film', 'alive', 'which', 'is', 'especially', 'difficult', 'since', 'the', 'picture', 'offered', 'him', 'no', 'real', 'challenge', 'still', 'it', 'nice', 'to', 'look', 'at', 'for', 'what', 'it', 'is', 'the', 'chemistry', 'between', 'michael', 'caine', 'and', 'christopher', 'reeve', 'is', 'quite', 'brilliant', 'the', 'dynamic', 'of', 'their', 'relationship', 'are', 'surprising', 'caine', 'is', 'fanta

In [None]:
preview_dataset(imdb_bert, split="train", n=5)


Review 1:
Original: Stage adaptations often have a major fault. They often come out looking like a film camera was simply placed on the stage (Such as "Night Mother"). Sidney Lumet's direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. Still, it's nice to look a
Cleaned : stage adaptations often have a major fault. they often come out looking like a film camera was simply placed on the stage (such as "night mother"). sidney lumet's direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. still, it's nice to look a
Label   : Positive

Review 2:
Original: 'The Rookie' was a wonderful movie about the second chances life holds for us and also puts an emotional thought over the audience, making them realize that your dreams can come true. If you loved 'Remember the Titans', 'The Rookie' is the movie for you!! It's the feel good movie of the year and it 
Cleaned : 'the rookie' was

# TF-IDF + LogReg

In [None]:
# === TF–IDF vectorization (fits on train, transforms val/test) ===
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 2), min_df=2)
X_train_tfidf = tfidf.fit_transform(imdb_tfidf['train']['text_clean'])
X_val_tfidf   = tfidf.transform(imdb_tfidf['val']['text_clean'])
X_test_tfidf  = tfidf.transform(imdb_tfidf['test']['text_clean'])

y_train = np.array(imdb_tfidf['train']['label'])
y_val   = np.array(imdb_tfidf['val']['label'])
y_test  = np.array(imdb_tfidf['test']['label'])

print("Shapes — train/val/test:", X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape)


Shapes — train/val/test: (20000, 50000) (5000, 50000) (25000, 50000)


In [None]:
# === Logistic Regression baseline (seed fixed) ===
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

log_reg = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
log_reg.fit(X_train_tfidf, y_train)

# Evaluate on validation
y_val_pred = log_reg.predict(X_val_tfidf)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nValidation Report:\n", classification_report(y_val, y_val_pred, target_names=["Negative", "Positive"]))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# Final evaluation on test
y_test_pred = log_reg.predict(X_test_tfidf)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nTest Report:\n", classification_report(y_test, y_test_pred, target_names=["Negative", "Positive"]))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Validation Accuracy: 0.89

Validation Report:
               precision    recall  f1-score   support

    Negative       0.89      0.88      0.89      2494
    Positive       0.89      0.90      0.89      2506

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Validation Confusion Matrix:
 [[2207  287]
 [ 263 2243]]

Test Accuracy: 0.88268

Test Report:
               precision    recall  f1-score   support

    Negative       0.89      0.88      0.88     12500
    Positive       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Test Confusion Matrix:
 [[10988  1512]
 [ 1421 11079]]


# Word2Vec + LogReg

In [None]:
!pip install gensim



In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on tokenized text
w2v_model = Word2Vec(
    sentences=imdb_word2vec['train']['text_clean'],  # tokenized sentences
    vector_size=300,   # embedding size
    window=5,          # context window
    min_count=2,       # ignore rare words
    workers=4,
    sg=1,              # skip-gram (better for small datasets)
    seed=42
)

print("Vocabulary size:", len(w2v_model.wv))


Vocabulary size: 44795


In [None]:
import numpy as np

def get_sentence_vector(tokens, model, vector_size=300):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if len(vecs) == 0:
        return np.zeros(vector_size)
    return np.mean(vecs, axis=0)

# Train/val/test vectors
X_train_w2v = np.array([get_sentence_vector(tokens, w2v_model, 300) for tokens in imdb_word2vec['train']['text_clean']])
X_val_w2v   = np.array([get_sentence_vector(tokens, w2v_model, 300) for tokens in imdb_word2vec['val']['text_clean']])
X_test_w2v  = np.array([get_sentence_vector(tokens, w2v_model, 300) for tokens in imdb_word2vec['test']['text_clean']])

y_train = np.array(imdb_word2vec['train']['label'])
y_val   = np.array(imdb_word2vec['val']['label'])
y_test  = np.array(imdb_word2vec['test']['label'])

print("Shapes — train/val/test:", X_train_w2v.shape, X_val_w2v.shape, X_test_w2v.shape)


Shapes — train/val/test: (20000, 300) (5000, 300) (25000, 300)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

w2v_clf = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
w2v_clf.fit(X_train_w2v, y_train)

# Validation
y_val_pred = w2v_clf.predict(X_val_w2v)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nValidation Report:\n", classification_report(y_val, y_val_pred, target_names=["Negative", "Positive"]))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# Test
y_test_pred = w2v_clf.predict(X_test_w2v)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nTest Report:\n", classification_report(y_test, y_test_pred, target_names=["Negative", "Positive"]))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Validation Accuracy: 0.8634

Validation Report:
               precision    recall  f1-score   support

    Negative       0.86      0.87      0.86      2494
    Positive       0.87      0.86      0.86      2506

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

Validation Confusion Matrix:
 [[2160  334]
 [ 349 2157]]

Test Accuracy: 0.8548

Test Report:
               precision    recall  f1-score   support

    Negative       0.85      0.86      0.86     12500
    Positive       0.86      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

Test Confusion Matrix:
 [[10777  1723]
 [ 1907 10593]]


# BERT + LogReg

In [None]:
# !pip install transformers -q
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()  # freeze weights, no fine-tuning

def get_embeddings_hf(dataset_split):
    # tokenize the whole dataset at once
    inputs = tokenizer(
        list(dataset_split['text_clean']),
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # take [CLS] embeddings
    cls_emb = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_emb

# Generate embeddings
X_train = get_embeddings_hf(imdb_bert['train'])
X_val   = get_embeddings_hf(imdb_bert['val'])
X_test  = get_embeddings_hf(imdb_bert['test'])

y_train = np.array(imdb_bert['train']['label'])
y_val   = np.array(imdb_bert['val']['label'])
y_test  = np.array(imdb_bert['test']['label'])

# Train Logistic Regression
clf = LogisticRegression(max_iter=500, random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Evaluate
print("DistilBERT Embedding + LR Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds, target_names=["Negative", "Positive"]))

DistilBERT Embedding + LR Accuracy: 0.82104
              precision    recall  f1-score   support

    Negative       0.81      0.83      0.82     12500
    Positive       0.83      0.81      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [12]:
results = {}

# TF-IDF results
tfidf_accuracy = 0.88268
tfidf_report = """
               precision    recall  f1-score   support

    Negative       0.89      0.88      0.88     12500
    Positive       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000
"""
# Manually parse the report string to get precision, recall, f1 for each class
tfidf_metrics = {
    'accuracy': tfidf_accuracy,
    'Negative': {'precision': 0.89, 'recall': 0.88, 'f1-score': 0.88},
    'Positive': {'precision': 0.88, 'recall': 0.89, 'f1-score': 0.88}
}
results['TF-IDF'] = tfidf_metrics


# Word2Vec results
w2v_accuracy = 0.8548
w2v_report = """
               precision    recall  f1-score   support

    Negative       0.85      0.86      0.86     12500
    Positive       0.86      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000
"""
# Manually parse the report string
w2v_metrics = {
    'accuracy': w2v_accuracy,
    'Negative': {'precision': 0.85, 'recall': 0.86, 'f1-score': 0.86},
    'Positive': {'precision': 0.86, 'recall': 0.85, 'f1-score': 0.85}
}
results['Word2Vec'] = w2v_metrics

# BERT results
bert_accuracy = 0.82104
bert_report = """
              precision    recall  f1-score   support

    Negative       0.81      0.83      0.82     12500
    Positive       0.83      0.81      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000
"""
# Manually parse the report string
bert_metrics = {
    'accuracy': bert_accuracy,
    'Negative': {'precision': 0.81, 'recall': 0.83, 'f1-score': 0.82},
    'Positive': {'precision': 0.83, 'recall': 0.81, 'f1-score': 0.82}
}
results['BERT'] = bert_metrics

# Display the results dictionary
import json
print(json.dumps(results, indent=4))

{
    "TF-IDF": {
        "accuracy": 0.88268,
        "Negative": {
            "precision": 0.89,
            "recall": 0.88,
            "f1-score": 0.88
        },
        "Positive": {
            "precision": 0.88,
            "recall": 0.89,
            "f1-score": 0.88
        }
    },
    "Word2Vec": {
        "accuracy": 0.8548,
        "Negative": {
            "precision": 0.85,
            "recall": 0.86,
            "f1-score": 0.86
        },
        "Positive": {
            "precision": 0.86,
            "recall": 0.85,
            "f1-score": 0.85
        }
    },
    "BERT": {
        "accuracy": 0.82104,
        "Negative": {
            "precision": 0.81,
            "recall": 0.83,
            "f1-score": 0.82
        },
        "Positive": {
            "precision": 0.83,
            "recall": 0.81,
            "f1-score": 0.82
        }
    }
}


## Create results table


In [13]:
import pandas as pd

# Flatten the nested dictionary
flat_results = {}
for approach, metrics in results.items():
    flat_results[approach] = {
        'Accuracy': metrics['accuracy'],
        'Precision (Negative)': metrics['Negative']['precision'],
        'Recall (Negative)': metrics['Negative']['recall'],
        'F1-score (Negative)': metrics['Negative']['f1-score'],
        'Precision (Positive)': metrics['Positive']['precision'],
        'Recall (Positive)': metrics['Positive']['recall'],
        'F1-score (Positive)': metrics['Positive']['f1-score']
    }

# Create DataFrame and transpose for desired shape
results_df = pd.DataFrame(flat_results).T
results_df = results_df[['Accuracy',
                         'Precision (Negative)', 'Recall (Negative)', 'F1-score (Negative)',
                         'Precision (Positive)', 'Recall (Positive)', 'F1-score (Positive)']]

display(results_df)

Unnamed: 0,Accuracy,Precision (Negative),Recall (Negative),F1-score (Negative),Precision (Positive),Recall (Positive),F1-score (Positive)
TF-IDF,0.88268,0.89,0.88,0.88,0.88,0.89,0.88
Word2Vec,0.8548,0.85,0.86,0.86,0.86,0.85,0.85
BERT,0.82104,0.81,0.83,0.82,0.83,0.81,0.82


## Analysis of Text Classification Approaches:

* **Performance Summary:** TF-IDF achieved the highest accuracy (0.883) and F1-scores, performing best among the three approaches in this specific setup. Word2Vec followed (0.855), and BERT (0.821) performed the lowest.
* **TF-IDF Strength:** TF-IDF's strong performance is likely due to its effectiveness in capturing important keywords and n-grams that discriminate between positive and negative reviews, especially in a relatively large dataset like IMDB.
* **Word2Vec Context:** Word2Vec, while capturing semantic relationships between words, performed slightly lower than TF-IDF. This might be because the simple averaging of word vectors (used here) doesn't fully capture sentence-level nuances compared to the weighted importance of terms in TF-IDF.
* **BERT's Unexpected Result:** BERT, a powerful contextual model, showed the lowest performance. This is likely because we used fixed, pre-trained BERT embeddings without fine-tuning the model on the specific IMDB task. Fine-tuning BERT typically yields much better results but is computationally more expensive.
* **Trade-offs: Accuracy vs. Resources:** TF-IDF is generally the fastest and least resource-intensive method. Word2Vec requires more training time to learn embeddings but is still relatively efficient. BERT, even when used for embeddings, is the most computationally demanding, and fine-tuning would significantly increase this cost.
* **Efficiency vs. Complexity:** TF-IDF offers a good balance of performance and efficiency for this task. Word2Vec provides semantic understanding at a moderate cost. BERT has the potential for the highest accuracy but demands significantly more resources and complexity, especially with fine-tuning.
* **Class Performance:** All three models show fairly balanced precision and recall for both Negative and Positive classes, indicating no strong bias towards misclassifying one sentiment over the other. The errors seem relatively evenly distributed.
* **Potential BERT Improvement:** The lower performance of BERT here highlights that using off-the-shelf embeddings may not always outperform simpler methods for certain tasks. Fine-tuning, using a larger `max_length` if relevant, or exploring different pooling strategies for embeddings could significantly improve BERT's results.
* **Common Errors:** Based on the similar precision/recall across classes, common errors likely involve reviews with mixed sentiment, sarcasm, or subtle language that all models struggled to interpret correctly without more advanced techniques or fine-tuning.
* **Conclusion:** For this IMDB sentiment analysis task and with the implemented methods, TF-IDF provided the best performance with reasonable efficiency, serving as a strong baseline. While conceptually more advanced, Word2Vec and especially non-fine-tuned BERT did not surpass the TF-IDF logistic regression model.

## Summary:

### Data Analysis Key Findings

*   TF-IDF achieved the highest accuracy (0.883) and F1-scores, performing best among the three approaches.
*   Word2Vec had an accuracy of 0.855, performing slightly lower than TF-IDF.
*   BERT showed the lowest performance with an accuracy of 0.821, likely due to not being fine-tuned on the specific task.
*   TF-IDF is the most efficient and least resource-intensive method, followed by Word2Vec, while BERT is the most computationally demanding.
*   All three models demonstrated balanced precision and recall for both Negative and Positive classes, indicating no significant classification bias towards one sentiment.

### Insights or Next Steps

*   Investigate fine-tuning the BERT model on the IMDB dataset to potentially improve its performance significantly.
*   Explore different methods for aggregating Word2Vec embeddings (e.g., weighted averaging, using a CNN or LSTM on top of embeddings) to capture more complex sentence structures.
