In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel

# Split dataset: hold out 20% of the rows for evaluation (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# N-gram vectorizers (unigrams + bigrams): one TF-IDF weighted, one raw counts.
tfidf = TfidfVectorizer(ngram_range=(1, 2))
count_vect = CountVectorizer(ngram_range=(1, 2))

# Vocabularies are learned from the training split only ...
X_train_tfidf = tfidf.fit_transform(X_train)
X_train_count = count_vect.fit_transform(X_train)

# ... and then reused unchanged to encode the test split.
X_test_tfidf = tfidf.transform(X_test)
X_test_count = count_vect.transform(X_test)

# Word2Vec Embeddings
# Both splits are tokenised on whitespace; only the training tokens are used
# to fit the embedding model.
X_train_tokens = [doc.split() for doc in X_train]
X_test_tokens = [doc.split() for doc in X_test]
word2vec_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_w2v_vector(text, model):
    """Embed ``text`` as the mean of the Word2Vec vectors of its tokens.

    Args:
        text: Whitespace-separated string; it is tokenised with ``str.split()``.
        model: Trained model exposing ``wv`` (a token -> vector lookup that
            supports ``in``) and ``vector_size``.

    Returns:
        1-D array of length ``model.vector_size``. When none of the tokens in
        ``text`` is present in ``model.wv`` an all-zero vector is returned.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # Size the fallback from the model rather than a hard-coded 100, which
        # would yield rows of the wrong width for any other ``vector_size``.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# One averaged Word2Vec vector per document, collected into an array per split.
X_train_w2v, X_test_w2v = (
    np.array([get_w2v_vector(doc, word2vec_model) for doc in split])
    for split in (X_train, X_test)
)


# BERT Embeddings
# Tokenizer and encoder are both loaded from the 'bert-base-uncased' checkpoint.
tokenizer, bert_model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)

def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), run through ``bert_model`` with gradient tracking
    disabled, and ``last_hidden_state`` is averaged over ``dim=1``.
    Singleton dimensions are squeezed out before conversion to NumPy.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every document individually, then collect the vectors into one array per split.
X_train_bert = np.array(list(map(get_bert_embedding, X_train)))
X_test_bert = np.array(list(map(get_bert_embedding, X_test)))

# The same default LogisticRegression is fitted on each feature representation
# and scored on the held-out split, so the accuracies are directly comparable.

# TF-IDF features
clf_tfidf = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
print("TF-IDF Model Accuracy:", accuracy_score(y_test, y_pred_tfidf))

# Word2Vec features
clf_w2v = LogisticRegression().fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_test_w2v)
print("Word2Vec Model Accuracy:", accuracy_score(y_test, y_pred_w2v))

# BERT features
clf_bert = LogisticRegression().fit(X_train_bert, y_train)
y_pred_bert = clf_bert.predict(X_test_bert)
print("BERT Model Accuracy:", accuracy_score(y_test, y_pred_bert))

# Print one classification report per feature representation, each exactly once.
for feature_name, predictions in (
    ("TF-IDF", y_pred_tfidf),
    ("Word2Vec", y_pred_w2v),
    ("BERT", y_pred_bert),
):
    print(f"\n{feature_name} Classification Report:")
    print(classification_report(y_test, predictions))

In [1]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# NOTE: this cell reads `word2vec_model`, which is created in the feature-extraction
# cell. Run that cell first; executing this one on its own fails with
# "NameError: name 'word2vec_model' is not defined".

# Extract word vectors
word_vectors = word2vec_model.wv

# Get the most common words for visualization
words = list(word_vectors.index_to_key)[:100]  # Top 100 words
# Indexing KeyedVectors with a list of keys returns a single 2-D array (one row
# per word). t-SNE is given that array instead of a Python list of vectors,
# because some scikit-learn releases read `X.shape` before converting the input.
word_vecs = word_vectors[words]

# Reduce dimensions using t-SNE
# scikit-learn requires perplexity < n_samples, so cap it for small vocabularies;
# with 16 or more words this is the original perplexity of 15.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(15, len(words) - 1))
word_vecs_2d = tsne.fit_transform(word_vecs)

# Plot
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(word_vecs_2d[:, 0], word_vecs_2d[:, 1], alpha=0.7)

# Annotate words
for i, word in enumerate(words):
    ax.annotate(word, (word_vecs_2d[i, 0], word_vecs_2d[i, 1]), fontsize=10)

ax.set_title("Word2Vec Word Embeddings Visualization")
plt.show()


NameError: name 'word2vec_model' is not defined