# Load the libraries and data

In [4]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec

# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

# Load dataset
data = pd.read_csv("/kaggle/input/filtered-and-translated-nlp/filr.csv")

# Extract relevant columns
texts = data["Translated"].astype(str).tolist()
labels = data["Label"].tolist()

print(f"Loaded {len(texts)} samples.")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Loaded 480 samples.


# Tokenize and load word2vec and glove

In [5]:
# Tokenize sentences
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=2, workers=4)



In [6]:
# Load GloVe embeddings manually
def load_glove_embeddings(glove_path):
    glove_embeddings = {}
    with open(glove_path, "r", encoding="utf-8") as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            glove_embeddings[word] = vector
    return glove_embeddings

# Provide the correct path to your GloVe file
glove_path = "/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt"  
glove_embeddings = load_glove_embeddings(glove_path)



# POS Tagging

In [7]:
# Using NLTK for POS tagging
pos_tagged = [nltk.pos_tag(nltk.word_tokenize(text)) for text in texts[:5]]  # First 5 samples for visualization

for i, tags in enumerate(pos_tagged):
    print(f"\nSentence {i+1} POS Tags:")
    print(tags)



Sentence 1 POS Tags:
[('After', 'IN'), ('hacking', 'VBG'), ('state', 'NN'), ('TV', 'NN'), ('by', 'IN'), ('replacing', 'VBG'), ('propaganda', 'NN'), ('images', 'NNS'), ('with', 'IN'), ('real', 'JJ'), ('ones', 'NNS'), ('they', 'PRP'), ('directly', 'RB'), ('addressed', 'VBD'), ('Putin', 'NNP'), ("'s", 'POS'), ('``', '``'), ('Against', 'NNP'), ('us', 'PRP'), ('you', 'PRP'), ('can', 'MD'), ('not', 'RB'), ('win', 'VB'), ("''", "''"), ('and', 'CC'), ('the', 'DT'), ('soldiers', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('front', 'NN'), ('lines', 'NNS'), ('with', 'IN'), ('a', 'DT'), ('call', 'NN'), ('to', 'TO'), ('lay', 'VB'), ('down', 'RP'), ('their', 'PRP$'), ('weapons', 'NNS'), ('.', '.'), ('Today', 'NN'), ('you', 'PRP'), ('can', 'MD'), ('fight', 'VB'), ('a', 'DT'), ('war', 'NN'), ('without', 'IN'), ('firing', 'VBG'), ('a', 'DT'), ('shot', 'NN'), ('.', '.'), ('Respect', 'VB')]

Sentence 2 POS Tags:
[('#', '#'), ('flowers', 'NNS'), ('#', '#'), ('lovers', 'NNS'), ('Make', 'VBP'), ('love', 'IN'), (

## Shape of POS tags 

In [8]:
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

# Extract POS tags for each sentence
def get_pos_tags(sentences):
    return [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in sentences]

# Get POS tags for the dataset
pos_tagged_texts = get_pos_tags(texts)

# Convert POS tags to a list of tag sequences
pos_sequences = [[tag for _, tag in sent] for sent in pos_tagged_texts]

# Get the most common POS tags to create a fixed vocabulary
tag_counter = Counter(tag for seq in pos_sequences for tag in seq)
common_tags = [tag for tag, _ in tag_counter.most_common(50)]  # Take top 50 most common tags

# One-hot encode the POS tags
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(np.array(common_tags).reshape(-1, 1))

def encode_pos_sequence(pos_sequence):
    tag_vectors = encoder.transform(np.array(pos_sequence).reshape(-1, 1)).toarray()
    return np.mean(tag_vectors, axis=0)  # Averaging to get a fixed-size representation

# Convert each sentence's POS sequence into a fixed-length vector
pos_feature_vectors = np.array([encode_pos_sequence(seq) for seq in pos_sequences])

print("POS feature vector shape:", pos_feature_vectors.shape)


POS feature vector shape: (480, 41)


## Sentence Embeddings seeing the sizes

In [9]:
import numpy as np
import nltk

# Convert sentence to Word2Vec embeddings (average of word vectors)
def sentence_vector_w2v(sentence, w2v_model, embedding_dim=100):
    words = nltk.word_tokenize(sentence.lower())
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

# Convert sentence to GloVe embeddings
def sentence_vector_glove(sentence, glove_embeddings, embedding_dim=100):
    words = nltk.word_tokenize(sentence.lower())
    vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

# Generate embeddings
w2v_vectors = np.array([sentence_vector_w2v(sent, word2vec_model) for sent in texts])
glove_vectors = np.array([sentence_vector_glove(sent, glove_embeddings) for sent in texts])

print("Word2Vec Embeddings Shape:", w2v_vectors.shape)
print("GloVe Embeddings Shape:", glove_vectors.shape)


Word2Vec Embeddings Shape: (480, 100)
GloVe Embeddings Shape: (480, 200)


In [10]:
# Combine embeddings with POS features
w2v_pos_features = np.hstack((w2v_vectors, pos_feature_vectors))
glove_pos_features = np.hstack((glove_vectors, pos_feature_vectors))

print("Word2Vec + POS Shape:", w2v_pos_features.shape)
print("GloVe + POS Shape:", glove_pos_features.shape)


Word2Vec + POS Shape: (480, 141)
GloVe + POS Shape: (480, 241)


# Training SVM and Random Forest

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_and_evaluate(X, y, name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train SVM
    svm_model = SVC(kernel="linear")
    svm_model.fit(X_train, y_train)
    y_pred_svm = svm_model.predict(X_test)

    # Train Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    # Print results
    print(f"\n🔹 Results for {name} Embeddings 🔹")
    print("📌 SVM Performance:")
    print(classification_report(y_test, y_pred_svm))
    print("📌 Random Forest Performance:")
    print(classification_report(y_test, y_pred_rf))

# Train models with different embeddings
train_and_evaluate(w2v_vectors, labels, "Word2Vec")
train_and_evaluate(glove_vectors, labels, "GloVe")
train_and_evaluate(w2v_pos_features, labels, "Word2Vec + POS")
train_and_evaluate(glove_pos_features, labels, "GloVe + POS")



🔹 Results for Word2Vec Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.00      0.00      0.00        20
           2       0.45      1.00      0.62        43
           3       0.00      0.00      0.00        17

    accuracy                           0.45        96
   macro avg       0.11      0.25      0.15        96
weighted avg       0.20      0.45      0.28        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.20      0.20      0.20        20
           2       0.50      0.70      0.58        43
           3       0.44      0.24      0.31        17

    accuracy                           0.40        96
   macro avg       0.29      0.28      0.27        96
weighted avg       0.34      0.40      0.36        96



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



🔹 Results for GloVe Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62        16
           1       0.57      0.40      0.47        20
           2       0.54      0.74      0.63        43
           3       0.57      0.24      0.33        17

    accuracy                           0.56        96
   macro avg       0.58      0.50      0.51        96
weighted avg       0.57      0.56      0.54        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.44      0.25      0.32        16
           1       0.40      0.20      0.27        20
           2       0.49      0.81      0.61        43
           3       0.67      0.24      0.35        17

    accuracy                           0.49        96
   macro avg       0.50      0.37      0.39        96
weighted avg       0.50      0.49      0.45        96


🔹 Results for Word2Vec + POS Embeddings 🔹
📌 SVM

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



🔹 Results for GloVe + POS Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.73      0.50      0.59        16
           1       0.38      0.40      0.39        20
           2       0.53      0.72      0.61        43
           3       0.50      0.18      0.26        17

    accuracy                           0.52        96
   macro avg       0.54      0.45      0.46        96
weighted avg       0.53      0.52      0.50        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.67      0.38      0.48        16
           1       0.50      0.20      0.29        20
           2       0.53      0.91      0.67        43
           3       0.67      0.24      0.35        17

    accuracy                           0.55        96
   macro avg       0.59      0.43      0.45        96
weighted avg       0.57      0.55      0.50        96



In [12]:
from collections import Counter
import pandas as pd

# Dictionary to store misclassified sentences
misclassified = {}

def train_and_evaluate(X, y, name, sentences):
    X_train, X_test, y_train, y_test, sentences_train, sentences_test = train_test_split(
        X, y, sentences, test_size=0.2, random_state=42)

    # Train SVM
    svm_model = SVC(kernel="linear")
    svm_model.fit(X_train, y_train)
    y_pred_svm = svm_model.predict(X_test)

    # Train Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    # Store misclassified samples
    svm_misclassified = [(sent, true, pred) for sent, true, pred in zip(sentences_test, y_test, y_pred_svm) if true != pred]
    rf_misclassified = [(sent, true, pred) for sent, true, pred in zip(sentences_test, y_test, y_pred_rf) if true != pred]

    misclassified[name] = {"SVM": svm_misclassified, "RF": rf_misclassified}

    # Print results
    print(f"\n🔹 Results for {name} Embeddings 🔹")
    print("📌 SVM Performance:")
    print(classification_report(y_test, y_pred_svm))
    print("📌 Random Forest Performance:")
    print(classification_report(y_test, y_pred_rf))

# Train models with different embeddings & track misclassified cases
train_and_evaluate(w2v_vectors, labels, "Word2Vec", texts)
train_and_evaluate(glove_vectors, labels, "GloVe", texts)
train_and_evaluate(w2v_pos_features, labels, "Word2Vec + POS", texts)
train_and_evaluate(glove_pos_features, labels, "GloVe + POS", texts)



🔹 Results for Word2Vec Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.00      0.00      0.00        20
           2       0.45      1.00      0.62        43
           3       0.00      0.00      0.00        17

    accuracy                           0.45        96
   macro avg       0.11      0.25      0.15        96
weighted avg       0.20      0.45      0.28        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.20      0.20      0.20        20
           2       0.50      0.70      0.58        43
           3       0.44      0.24      0.31        17

    accuracy                           0.40        96
   macro avg       0.29      0.28      0.27        96
weighted avg       0.34      0.40      0.36        96



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



🔹 Results for GloVe Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62        16
           1       0.57      0.40      0.47        20
           2       0.54      0.74      0.63        43
           3       0.57      0.24      0.33        17

    accuracy                           0.56        96
   macro avg       0.58      0.50      0.51        96
weighted avg       0.57      0.56      0.54        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.44      0.25      0.32        16
           1       0.40      0.20      0.27        20
           2       0.49      0.81      0.61        43
           3       0.67      0.24      0.35        17

    accuracy                           0.49        96
   macro avg       0.50      0.37      0.39        96
weighted avg       0.50      0.49      0.45        96


🔹 Results for Word2Vec + POS Embeddings 🔹
📌 SVM

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



🔹 Results for GloVe + POS Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.73      0.50      0.59        16
           1       0.38      0.40      0.39        20
           2       0.53      0.72      0.61        43
           3       0.50      0.18      0.26        17

    accuracy                           0.52        96
   macro avg       0.54      0.45      0.46        96
weighted avg       0.53      0.52      0.50        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.67      0.38      0.48        16
           1       0.50      0.20      0.29        20
           2       0.53      0.91      0.67        43
           3       0.67      0.24      0.35        17

    accuracy                           0.55        96
   macro avg       0.59      0.43      0.45        96
weighted avg       0.57      0.55      0.50        96



## Viewing the Misclassifications

In [13]:
# Flatten misclassified sentences from all models
all_misclassified = []
for model in misclassified:
    all_misclassified += [sent for sent, _, _ in misclassified[model]["SVM"]]
    all_misclassified += [sent for sent, _, _ in misclassified[model]["RF"]]

# Count occurrences
misclassified_counts = Counter(all_misclassified)

# Get the most commonly misclassified sentences (Top 10)
common_misclassified = misclassified_counts.most_common(10)
print("\n🔹 Most Commonly Misclassified Sentences 🔹")
for sent, count in common_misclassified:
    print(f"{sent} - Misclassified {count} times")



🔹 Most Commonly Misclassified Sentences 🔹
From symbol of peace to megaphone for war, #Putin rewrites history of #Luzniki stadium - Misclassified 8 times
But how nice,with a war going on,with bombs falling,with civilians being killed we also have time to have a laugh. All scene!!! Fuck you shit actors #putin #actors #bugs #UkraineRussiaWar #Ukraine #aveterottoilcaxxo - Misclassified 8 times
Moscow does not rule out Putin-Zelensky meeting, but after understanding #Ukraine war #UkraineRussianWar #russia #putin #bucha #onu #Zelensky - Misclassified 8 times
"The little girl with the candy"-and the rifle!
More and more propaganda and less and less journalism.
Instead of charades Ukrainians need free voices. A heartfelt appeal on my site:
 
#RussiansAreOurBrothers #UkrainiansAreOurBrothers. - Misclassified 8 times
Far away I will go; over sea and land, to say no to war | to those I will see. If there is blood to spill Spill only your own; I no longer follow you. And if you find me, with me I

## Tag wise misclassification

In [14]:
import nltk

def get_pos_tags(sentence):
    words = nltk.word_tokenize(sentence.lower())
    return [tag for _, tag in nltk.pos_tag(words)]

# Store POS tags for misclassified and correctly classified sentences
pos_tags_misclassified = []
pos_tags_correct = []

for model in misclassified:
    for sent, true, pred in misclassified[model]["SVM"]:
        pos_tags_misclassified.extend(get_pos_tags(sent))

    for sent, true, pred in misclassified[model]["RF"]:
        pos_tags_misclassified.extend(get_pos_tags(sent))

# POS tagging for all correct sentences
for sent in texts:
    if sent not in misclassified_counts:
        pos_tags_correct.extend(get_pos_tags(sent))

# Count POS occurrences
pos_misclassified_counts = Counter(pos_tags_misclassified)
pos_correct_counts = Counter(pos_tags_correct)

# Convert to DataFrame for easy comparison
df_pos = pd.DataFrame({"Misclassified": pos_misclassified_counts, "Correct": pos_correct_counts}).fillna(0)
df_pos["Difference"] = df_pos["Misclassified"] - df_pos["Correct"]

# Sort by biggest difference
df_pos = df_pos.sort_values(by="Difference", ascending=False)

print("\n🔹 POS Tag Comparison 🔹")
print(df_pos)  # Top 10 POS differences



🔹 POS Tag Comparison 🔹
      Misclassified  Correct  Difference
VBP           415.0      332        83.0
#            1319.0     1245        74.0
VBG           277.0      218        59.0
VBZ           436.0      392        44.0
PRP           373.0      336        37.0
CD            341.0      306        35.0
:             310.0      285        25.0
VBN           293.0      270        23.0
WP            100.0       84        16.0
JJR            41.0       26        15.0
JJS            14.0       10         4.0
WRB            32.0       32         0.0
FW             17.0       17         0.0
UH              0.0        1        -1.0
RBR            13.0       15        -2.0
POS           106.0      109        -3.0
RBS             0.0        3        -3.0
PDT             6.0       13        -7.0
SYM             0.0        7        -7.0
)              53.0       61        -8.0
WDT            40.0       49        -9.0
EX             25.0       35       -10.0
RP             25.0       36     

# Training again 


In [15]:
from collections import defaultdict
import pandas as pd

# Dictionary to store misclassified sentences
misclassified = defaultdict(lambda: {"SVM": [], "RF": []})
classwise_misclassified = defaultdict(lambda: {"SVM": defaultdict(int), "RF": defaultdict(int)})

def train_and_evaluate(X, y, name, sentences):
    X_train, X_test, y_train, y_test, sentences_train, sentences_test = train_test_split(
        X, y, sentences, test_size=0.2, random_state=42)

    # Train SVM
    svm_model = SVC(kernel="linear")
    svm_model.fit(X_train, y_train)
    y_pred_svm = svm_model.predict(X_test)

    # Train Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)

    # Store misclassified samples
    for sent, true, pred in zip(sentences_test, y_test, y_pred_svm):
        if true != pred:
            misclassified[name]["SVM"].append((sent, true, pred))
            classwise_misclassified[name]["SVM"][true] += 1

    for sent, true, pred in zip(sentences_test, y_test, y_pred_rf):
        if true != pred:
            misclassified[name]["RF"].append((sent, true, pred))
            classwise_misclassified[name]["RF"][true] += 1

    # Print results
    print(f"\n🔹 Results for {name} Embeddings 🔹")
    print("📌 SVM Performance:")
    print(classification_report(y_test, y_pred_svm))
    print("📌 Random Forest Performance:")
    print(classification_report(y_test, y_pred_rf))

# Train models with different embeddings & track misclassified cases
train_and_evaluate(w2v_vectors, labels, "Word2Vec", texts)
train_and_evaluate(glove_vectors, labels, "GloVe", texts)
train_and_evaluate(w2v_pos_features, labels, "Word2Vec + POS", texts)
train_and_evaluate(glove_pos_features, labels, "GloVe + POS", texts)



🔹 Results for Word2Vec Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.00      0.00      0.00        20
           2       0.45      1.00      0.62        43
           3       0.00      0.00      0.00        17

    accuracy                           0.45        96
   macro avg       0.11      0.25      0.15        96
weighted avg       0.20      0.45      0.28        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.20      0.20      0.20        20
           2       0.50      0.70      0.58        43
           3       0.44      0.24      0.31        17

    accuracy                           0.40        96
   macro avg       0.29      0.28      0.27        96
weighted avg       0.34      0.40      0.36        96



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



🔹 Results for GloVe Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62        16
           1       0.57      0.40      0.47        20
           2       0.54      0.74      0.63        43
           3       0.57      0.24      0.33        17

    accuracy                           0.56        96
   macro avg       0.58      0.50      0.51        96
weighted avg       0.57      0.56      0.54        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.44      0.25      0.32        16
           1       0.40      0.20      0.27        20
           2       0.49      0.81      0.61        43
           3       0.67      0.24      0.35        17

    accuracy                           0.49        96
   macro avg       0.50      0.37      0.39        96
weighted avg       0.50      0.49      0.45        96


🔹 Results for Word2Vec + POS Embeddings 🔹
📌 SVM

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



🔹 Results for GloVe + POS Embeddings 🔹
📌 SVM Performance:
              precision    recall  f1-score   support

           0       0.73      0.50      0.59        16
           1       0.38      0.40      0.39        20
           2       0.53      0.72      0.61        43
           3       0.50      0.18      0.26        17

    accuracy                           0.52        96
   macro avg       0.54      0.45      0.46        96
weighted avg       0.53      0.52      0.50        96

📌 Random Forest Performance:
              precision    recall  f1-score   support

           0       0.67      0.38      0.48        16
           1       0.50      0.20      0.29        20
           2       0.53      0.91      0.67        43
           3       0.67      0.24      0.35        17

    accuracy                           0.55        96
   macro avg       0.59      0.43      0.45        96
weighted avg       0.57      0.55      0.50        96



## Classwise comparison

In [16]:
print("\n🔹 Class-Wise Misclassification Counts 🔹")
for model in classwise_misclassified:
    print(f"\n📌 {model} Embeddings")
    
    for clf in ["SVM", "RF"]:
        total_misclassified = sum(classwise_misclassified[model][clf].values())  # Calculate total misclassified
        
        print(f"🔹 {clf} Misclassification Counts (Total: {total_misclassified})")
        for label, count in classwise_misclassified[model][clf].items():
            print(f"    Class {label}: {count} misclassified")



🔹 Class-Wise Misclassification Counts 🔹

📌 Word2Vec Embeddings
🔹 SVM Misclassification Counts (Total: 53)
    Class 3: 17 misclassified
    Class 1: 20 misclassified
    Class 0: 16 misclassified
🔹 RF Misclassification Counts (Total: 58)
    Class 3: 13 misclassified
    Class 2: 13 misclassified
    Class 1: 16 misclassified
    Class 0: 16 misclassified

📌 GloVe Embeddings
🔹 SVM Misclassification Counts (Total: 42)
    Class 3: 13 misclassified
    Class 2: 11 misclassified
    Class 1: 12 misclassified
    Class 0: 6 misclassified
🔹 RF Misclassification Counts (Total: 49)
    Class 3: 13 misclassified
    Class 1: 16 misclassified
    Class 2: 8 misclassified
    Class 0: 12 misclassified

📌 Word2Vec + POS Embeddings
🔹 SVM Misclassification Counts (Total: 48)
    Class 3: 17 misclassified
    Class 1: 20 misclassified
    Class 0: 11 misclassified
🔹 RF Misclassification Counts (Total: 57)
    Class 3: 15 misclassified
    Class 1: 18 misclassified
    Class 2: 13 misclassified
    

In [17]:
from collections import Counter

def get_common_misclassified(embedding1, embedding2, clf):
    mis1 = {sent for sent, _, _ in misclassified[embedding1][clf]}
    mis2 = {sent for sent, _, _ in misclassified[embedding2][clf]}
    common = mis1.intersection(mis2)
    return common

# Compare Word2Vec vs. Word2Vec + POS
common_w2v = get_common_misclassified("Word2Vec", "Word2Vec + POS", "SVM")
common_glove = get_common_misclassified("GloVe", "GloVe + POS", "SVM")

print("\n🔹 Commonly Misclassified Between Normal & POS Embeddings 🔹")
print(f"🔹 Word2Vec vs. Word2Vec + POS: {len(common_w2v)} common misclassified sentences")
print(f"🔹 GloVe vs. GloVe + POS: {len(common_glove)} common misclassified sentences")



🔹 Commonly Misclassified Between Normal & POS Embeddings 🔹
🔹 Word2Vec vs. Word2Vec + POS: 48 common misclassified sentences
🔹 GloVe vs. GloVe + POS: 40 common misclassified sentences


## POS wise

In [18]:
import nltk

def get_pos_distribution(sentences):
    pos_counts = Counter()
    for sent in sentences:
        words = nltk.word_tokenize(sent.lower())
        pos_counts.update([tag for _, tag in nltk.pos_tag(words)])
    return pos_counts

# Get POS counts for commonly misclassified sentences
pos_common_w2v = get_pos_distribution(common_w2v)
pos_common_glove = get_pos_distribution(common_glove)

print("\n🔹 POS Distribution in Commonly Misclassified Sentences 🔹")
print("📌 Word2Vec vs. Word2Vec + POS Misclassified Sentences:")
print(pos_common_w2v.most_common(10))

print("\n📌 GloVe vs. GloVe + POS Misclassified Sentences:")
print(pos_common_glove.most_common(10))



🔹 POS Distribution in Commonly Misclassified Sentences 🔹
📌 Word2Vec vs. Word2Vec + POS Misclassified Sentences:
[('NN', 383), ('IN', 194), ('JJ', 186), ('#', 152), ('DT', 135), ('NNS', 132), ('.', 88), (',', 67), ('RB', 59), ('VB', 50)]

📌 GloVe vs. GloVe + POS Misclassified Sentences:
[('NN', 332), ('JJ', 181), ('IN', 166), ('#', 157), ('DT', 109), ('NNS', 109), ('.', 76), (',', 49), ('VBZ', 45), ('CC', 44)]


# Comparing the POS tags between word2vec and glove

In [19]:
import nltk
from collections import Counter

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

def get_common_misclassified(embedding1, embedding2, clf):
    """Find common misclassified sentences between two embeddings for a given classifier."""
    mis1 = {sent for sent, _, _ in misclassified[embedding1][clf]}
    mis2 = {sent for sent, _, _ in misclassified[embedding2][clf]}
    common = mis1.intersection(mis2)
    return list(common)  # Convert to list for further processing

# Get common misclassified sentences
common_w2v_svm = get_common_misclassified("Word2Vec", "GloVe", "SVM")
common_w2v_rf = get_common_misclassified("Word2Vec", "GloVe", "RF")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
def get_pos_distribution(sentences):
    """Returns the frequency of POS tags in a list of sentences."""
    pos_counts = Counter()
    for sent in sentences:
        words = nltk.word_tokenize(sent.lower())
        pos_counts.update([tag for _, tag in nltk.pos_tag(words)])
    return pos_counts

# Get POS distributions for common misclassified sentences
pos_w2v_svm = get_pos_distribution(common_w2v_svm)
pos_glove_svm = get_pos_distribution(common_w2v_svm)  # Same sentences but for GloVe

pos_w2v_rf = get_pos_distribution(common_w2v_rf)
pos_glove_rf = get_pos_distribution(common_w2v_rf)  # Same sentences but for GloVe


In [21]:
def compare_pos(pos_w2v, pos_glove):
    """Compare POS tag distributions between Word2Vec and GloVe."""
    print("\n🔹 POS Tag Differences (Word2Vec vs. GloVe)")
    for tag in set(pos_w2v.keys()).union(set(pos_glove.keys())):
        diff = pos_glove[tag] - pos_w2v[tag]  # Positive means GloVe captures it more
        print(f"    {tag}: Word2Vec = {pos_w2v[tag]}, GloVe = {pos_glove[tag]}, Difference = {diff}")

print("\n📌 POS Tag Analysis for SVM Misclassifications")
compare_pos(pos_w2v_svm, pos_glove_svm)

print("\n📌 POS Tag Analysis for Random Forest Misclassifications")
compare_pos(pos_w2v_rf, pos_glove_rf)



📌 POS Tag Analysis for SVM Misclassifications

🔹 POS Tag Differences (Word2Vec vs. GloVe)
    .: Word2Vec = 57, GloVe = 57, Difference = 0
    RP: Word2Vec = 2, GloVe = 2, Difference = 0
    WDT: Word2Vec = 3, GloVe = 3, Difference = 0
    '': Word2Vec = 12, GloVe = 12, Difference = 0
    :: Word2Vec = 27, GloVe = 27, Difference = 0
    ): Word2Vec = 2, GloVe = 2, Difference = 0
    WRB: Word2Vec = 3, GloVe = 3, Difference = 0
    NNS: Word2Vec = 84, GloVe = 84, Difference = 0
    VBN: Word2Vec = 22, GloVe = 22, Difference = 0
    RB: Word2Vec = 35, GloVe = 35, Difference = 0
    CC: Word2Vec = 31, GloVe = 31, Difference = 0
    ,: Word2Vec = 37, GloVe = 37, Difference = 0
    WP: Word2Vec = 5, GloVe = 5, Difference = 0
    PDT: Word2Vec = 1, GloVe = 1, Difference = 0
    ``: Word2Vec = 13, GloVe = 13, Difference = 0
    PRP: Word2Vec = 22, GloVe = 22, Difference = 0
    POS: Word2Vec = 10, GloVe = 10, Difference = 0
    DT: Word2Vec = 84, GloVe = 84, Difference = 0
    VB: Word2Vec =

In [22]:
import nltk

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

def get_pos_tags(sentence):
    """Returns the POS tags for a given sentence."""
    words = nltk.word_tokenize(sentence.lower())  # Tokenize
    return nltk.pos_tag(words)  # POS tagging

def analyze_sentence_wise_pos(common_sentences):
    """Compares POS tagging for Word2Vec and GloVe on commonly misclassified sentences."""
    for i, sent in enumerate(common_sentences, 1):
        pos_tags = get_pos_tags(sent)

        print(f"\n🔹 Sentence {i}: {sent}")
        print("POS Tags:", pos_tags)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## SVM and Random Forest Glove

In [23]:
common_w2v_svm = get_common_misclassified("Word2Vec", "GloVe", "SVM")
common_w2v_rf = get_common_misclassified("Word2Vec", "GloVe", "RF")

print("\n📌 POS Tag Analysis for SVM Misclassifications")
analyze_sentence_wise_pos(common_w2v_svm)

print("\n📌 POS Tag Analysis for Random Forest Misclassifications")
analyze_sentence_wise_pos(common_w2v_rf)



📌 POS Tag Analysis for SVM Misclassifications

🔹 Sentence 1: Moscow: "The West has declared total war on Russia" #Ukraine #Putin #Russia
POS Tags: [('moscow', 'NN'), (':', ':'), ('``', '``'), ('the', 'DT'), ('west', 'NN'), ('has', 'VBZ'), ('declared', 'VBN'), ('total', 'JJ'), ('war', 'NN'), ('on', 'IN'), ('russia', 'NN'), ("''", "''"), ('#', '#'), ('ukraine', 'JJ'), ('#', '#'), ('putin', 'JJ'), ('#', '#'), ('russia', 'NN')]

🔹 Sentence 2: Images of the lifeless bodies of citizens on the streets of #Bucha describe the horror of this war and the crimes of which Putin is guilty.
POS Tags: [('images', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('lifeless', 'JJ'), ('bodies', 'NNS'), ('of', 'IN'), ('citizens', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('streets', 'NNS'), ('of', 'IN'), ('#', '#'), ('bucha', 'NN'), ('describe', 'VBZ'), ('the', 'DT'), ('horror', 'NN'), ('of', 'IN'), ('this', 'DT'), ('war', 'NN'), ('and', 'CC'), ('the', 'DT'), ('crimes', 'NNS'), ('of', 'IN'), ('which', 'WDT'), ('putin', '

In [24]:
import nltk

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

def get_pos_tags(sentence):
    """Returns the POS tags for a given sentence."""
    words = nltk.word_tokenize(sentence.lower())  # Tokenize
    return nltk.pos_tag(words)  # POS tagging

def analyze_sentence_wise_pos(common_sentences):
    """Compares POS tagging for Word2Vec and GloVe on commonly misclassified sentences."""
    for i, sent in enumerate(common_sentences, 1):
        pos_tags = get_pos_tags(sent)

        print(f"\n🔹 Sentence {i}: {sent}")
        print("POS Tags:", pos_tags)



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## SVM and Random forest in word2vec

In [25]:
common_w2v_svm = get_common_misclassified("Word2Vec", "GloVe", "SVM")
common_w2v_rf = get_common_misclassified("Word2Vec", "GloVe", "RF")

print("\n📌 POS Tag Analysis for SVM Misclassifications")
analyze_sentence_wise_pos(common_w2v_svm)

print("\n📌 POS Tag Analysis for Random Forest Misclassifications")
analyze_sentence_wise_pos(common_w2v_rf)



📌 POS Tag Analysis for SVM Misclassifications

🔹 Sentence 1: Moscow: "The West has declared total war on Russia" #Ukraine #Putin #Russia
POS Tags: [('moscow', 'NN'), (':', ':'), ('``', '``'), ('the', 'DT'), ('west', 'NN'), ('has', 'VBZ'), ('declared', 'VBN'), ('total', 'JJ'), ('war', 'NN'), ('on', 'IN'), ('russia', 'NN'), ("''", "''"), ('#', '#'), ('ukraine', 'JJ'), ('#', '#'), ('putin', 'JJ'), ('#', '#'), ('russia', 'NN')]

🔹 Sentence 2: Images of the lifeless bodies of citizens on the streets of #Bucha describe the horror of this war and the crimes of which Putin is guilty.
POS Tags: [('images', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('lifeless', 'JJ'), ('bodies', 'NNS'), ('of', 'IN'), ('citizens', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('streets', 'NNS'), ('of', 'IN'), ('#', '#'), ('bucha', 'NN'), ('describe', 'VBZ'), ('the', 'DT'), ('horror', 'NN'), ('of', 'IN'), ('this', 'DT'), ('war', 'NN'), ('and', 'CC'), ('the', 'DT'), ('crimes', 'NNS'), ('of', 'IN'), ('which', 'WDT'), ('putin', '

# A lot going on in here 😬

In [26]:
import nltk
import numpy as np
from scipy.spatial.distance import cosine

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

def get_pos_tags(sentence):
    """Returns POS tags for a given sentence."""
    words = nltk.word_tokenize(sentence.lower())  # Tokenize
    return nltk.pos_tag(words)  # POS tagging

def compare_sentence_pos(sentences):
    """Compares POS tags for each misclassified sentence between Word2Vec and GloVe."""
    for i, sent in enumerate(sentences, 1):
        pos_tags = get_pos_tags(sent)

        print(f"\n🔹 Sentence {i}: {sent}")
        print("📌 POS Tags:", pos_tags)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [27]:
def get_sentence_embedding(sentence, model):
    """Computes the sentence embedding by averaging word vectors."""
    words = nltk.word_tokenize(sentence.lower())
    vectors = [model[word] for word in words if word in model]  # Get word vectors
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)  # Mean embedding

def compare_embeddings(sentences, w2v_model, glove_model):
    """Computes cosine distance between Word2Vec and GloVe embeddings for misclassified sentences."""
    print("\n🔹 Embedding Distance Differences (Word2Vec vs. GloVe) 🔹")
    for i, sent in enumerate(sentences, 1):
        w2v_emb = get_sentence_embedding(sent, w2v_model)
        glove_emb = get_sentence_embedding(sent, glove_model)

        # Compute Cosine Distance
        distance = cosine(w2v_emb, glove_emb)

        print(f"\n📌 Sentence {i}: {sent}")
        print(f"🔹 Cosine Distance: {distance:.4f}")


In [32]:
common_w2v_svm = get_common_misclassified("Word2Vec", "GloVe", "SVM")
common_w2v_rf = get_common_misclassified("Word2Vec", "GloVe", "RF")

print("\n📌 POS Tag Analysis for SVM Misclassifications")
compare_sentence_pos(common_w2v_svm)

print("\n📌 POS Tag Analysis for Random Forest Misclassifications")
compare_sentence_pos(common_w2v_rf)


📌 POS Tag Analysis for SVM Misclassifications

🔹 Sentence 1: Moscow: "The West has declared total war on Russia" #Ukraine #Putin #Russia
📌 POS Tags: [('moscow', 'NN'), (':', ':'), ('``', '``'), ('the', 'DT'), ('west', 'NN'), ('has', 'VBZ'), ('declared', 'VBN'), ('total', 'JJ'), ('war', 'NN'), ('on', 'IN'), ('russia', 'NN'), ("''", "''"), ('#', '#'), ('ukraine', 'JJ'), ('#', '#'), ('putin', 'JJ'), ('#', '#'), ('russia', 'NN')]

🔹 Sentence 2: Images of the lifeless bodies of citizens on the streets of #Bucha describe the horror of this war and the crimes of which Putin is guilty.
📌 POS Tags: [('images', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('lifeless', 'JJ'), ('bodies', 'NNS'), ('of', 'IN'), ('citizens', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('streets', 'NNS'), ('of', 'IN'), ('#', '#'), ('bucha', 'NN'), ('describe', 'VBZ'), ('the', 'DT'), ('horror', 'NN'), ('of', 'IN'), ('this', 'DT'), ('war', 'NN'), ('and', 'CC'), ('the', 'DT'), ('crimes', 'NNS'), ('of', 'IN'), ('which', 'WDT'), ('putin

In [55]:
print("common_w2v_svm:\n") 

for sentence in common_w2v_svm:
    print(sentence)
    print()

print()
print("common_w2v_rf:\n")

for sentence in common_w2v_rf:
    print(sentence)
    print()




common_w2v_svm:

Moscow: "The West has declared total war on Russia" #Ukraine #Putin #Russia

Images of the lifeless bodies of citizens on the streets of #Bucha describe the horror of this war and the crimes of which Putin is guilty.

UKRAINE BOMBS THE #DOMBASS
THE TRUTH ABOUT THE ORIGIN OF THE WAR BETWEEN UKRAINE AND RUSSIA IN A POST STRAIGHT FROM THE BATTLEFIELD
UKRAINE IS PROVOKING THE CLASH BY BOMBING REGIONS THAT WANT INDEPENDENCE SO AS NOT TO JOIN NATO
THE MEDIA ARE LYING ⬇️🇷🇺

Over 5 million refugees from #Ukraine, terrible images of #war have been circulated for months, but meanwhile #Zelensky's wife #Olena is spotted at the beach in Forte dei Marmi in a 4 million euro villa.
If everything is normal for you, I return the line
#19June

LA #GUERRA GIUSTA NON ESISTE!
INVOCARE #PACE INVIANDO #ARMI NON È PER L'EROE CHE COMBATTE, MA PER IL POPOLO CHE VIENE MACELLATO!
SIAMO UN PAESE DI COGLIONI INVASATI!
Sveglia!!!
#StopTheWar
#DraghiVatteneSubito
#NonInMioNome

From symbol of peace t

# Best Comparison 

After this will compare to see where word2vec went wrong
One reason is the size
But we can't give basic answers

In [44]:
# Extract misclassified sentences for the best models
misclassified_glove_svm = {sent for sent, _, _ in misclassified["GloVe"]["SVM"]}
misclassified_glove_pos_rf = {sent for sent, _, _ in misclassified["GloVe + POS"]["RF"]}

# Find common misclassified sentences
common_misclassified = misclassified_glove_svm.intersection(misclassified_glove_pos_rf)

print(f"\n🔹 Total Misclassifications 🔹")
print(f"📌 GloVe + SVM: {len(misclassified_glove_svm)} sentences")
print(f"📌 GloVe + POS + RF: {len(misclassified_glove_pos_rf)} sentences")
print(f"📌 Common Misclassifications: {len(common_misclassified)} sentences")



🔹 Total Misclassifications 🔹
📌 GloVe + SVM: 42 sentences
📌 GloVe + POS + RF: 43 sentences
📌 Common Misclassifications: 32 sentences


In [48]:
import pandas as pd

# Convert misclassified sets into lists with labels
common_misclassified_details = [
    (sent, true_label, pred_svm, pred_rf)
    for sent, true_label, pred_svm in misclassified["GloVe"]["SVM"]
    for _, true_label_rf, pred_rf in misclassified["GloVe + POS"]["RF"]
    if sent == _
]

# Convert to DataFrame for better readability
common_df = pd.DataFrame(common_misclassified_details, columns=["Sentence", "True Label", "Pred (SVM)", "Pred (RF)"])
print("\n🔹 Common Misclassified Sentences 🔹")
common_df  # Show first 10 for analysis



🔹 Common Misclassified Sentences 🔹


Unnamed: 0,Sentence,True Label,Pred (SVM),Pred (RF)
0,"From symbol of peace to megaphone for war, #Pu...",3,2,2
1,LA #GUERRA GIUSTA NON ESISTE!\nINVOCARE #PACE ...,1,0,2
2,"But how nice,with a war going on,with bombs fa...",1,2,2
3,Moscow does not rule out Putin-Zelensky meetin...,3,2,2
4,"""The little girl with the candy""-and the rifle...",1,2,2
5,"Far away I will go; over sea and land, to say ...",3,0,0
6,"Americans, the rapists of War II. While omitti...",0,2,2
7,🇺🇦 The conflict in Ukraine: the context and po...,2,3,3
8,"Over 5 million refugees from #Ukraine, terribl...",0,2,2
9,"Moscow: ""The West has declared total war on Ru...",3,2,2


In [49]:
from collections import Counter
import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

# Tokenize sentences and filter out stopwords
all_words = [word.lower() for sent in common_df["Sentence"] for word in nltk.word_tokenize(sent) if word.isalpha()]
filtered_words = [word for word in all_words if word not in stop_words]

# Count occurrences
word_counts = Counter(filtered_words)
print("\n📌 Most Common Words in Misclassified Sentences:")
print(word_counts.most_common(20))


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

📌 Most Common Words in Misclassified Sentences:
[('war', 22), ('ukraine', 22), ('putin', 17), ('russia', 13), ('moscow', 8), ('million', 6), ('ukrainian', 6), ('peace', 5), ('bucha', 5), ('russian', 5), ('zelensky', 4), ('go', 4), ('weapons', 4), ('history', 3), ('ukrainerussiawar', 3), ('ukrainerussianwar', 3), ('propaganda', 3), ('sea', 3), ('march', 3), ('images', 3)]


In [50]:
nltk.download("averaged_perceptron_tagger")

# Extract POS sequences for misclassified sentences
def get_pos_pattern(sentence):
    tokens = nltk.word_tokenize(sentence)
    pos_tags = [tag for _, tag in nltk.pos_tag(tokens)]
    return " ".join(pos_tags)

common_df["POS Pattern"] = common_df["Sentence"].apply(get_pos_pattern)

# Count POS pattern occurrences
pos_patterns = Counter(common_df["POS Pattern"])
print("\n📌 Most Common POS Patterns in Misclassified Sentences:")
print(pos_patterns.most_common(10))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!

📌 Most Common POS Patterns in Misclassified Sentences:
[('IN NN IN NN TO VB IN NN , # NNP VBZ NN IN # NNP NN', 1), ('NNP # NNP NNP NNP NNP . NNP # NNP NNP # NNP NNP NNP NNP NNP NNP NNP , NNP NNP NNP NNP NNP NNP NNP . NNP NNP NNP NNP NNP NNP . NN . . . # NNP # NNP # NNP', 1), ('CC WRB JJ , IN DT NN VBG IN , IN NNS VBG , IN NNS VBG VBN PRP RB VBP NN TO VB DT NN . DT NN . . . NNP PRP VBP NNS # JJ # NNS # NNS # NNP # NNP # NN', 1), ('NNP VBZ RB VB IN NNP NN , CC IN JJ # NNP NN # NNP # NN # JJ # JJ # JJ # NNP', 1), ("`` DT JJ NN IN DT NN '' VBP DT NN . JJR CC JJR NNS CC JJR CC RBR NN . RB IN NNS NNS VBP JJ NNS . DT JJ NN IN PRP$ NN : # NNP # NNS .", 1), ('NNP RB PRP MD VB : IN NN CC NN , TO VB DT TO NN NN TO DT PRP MD VB . IN EX VBZ NN TO VB NNP RB PRP$ JJ : PRP RB RBR VB PRP . CC IN PRP VBP PRP , IN

Where did Word2Vec Fail?

In [58]:
import nltk
from collections import Counter
import pandas as pd

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

# Convert misclassified sentences into DataFrames
w2v_misclassified_svm = pd.DataFrame(misclassified["Word2Vec"]["SVM"], columns=["Sentence", "True Label", "Predicted Label"])
glove_misclassified_svm = pd.DataFrame(misclassified["GloVe"]["SVM"], columns=["Sentence", "True Label", "Predicted Label"])
w2v_misclassified_rf = pd.DataFrame(misclassified["Word2Vec"]["RF"], columns=["Sentence", "True Label", "Predicted Label"])
glove_misclassified_rf = pd.DataFrame(misclassified["GloVe"]["RF"], columns=["Sentence", "True Label", "Predicted Label"])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [59]:
# Find common misclassified sentences between different models
common_misclassified_svm = w2v_misclassified_svm.merge(glove_misclassified_svm, on="Sentence", suffixes=("_w2v", "_glove"))
common_misclassified_rf = w2v_misclassified_rf.merge(glove_misclassified_rf, on="Sentence", suffixes=("_w2v", "_glove"))

In [64]:
# Print first few common misclassified sentences
print("\n🔹 Common Misclassified Sentences (W2V + SVM vs. GloVe + SVM)")
common_misclassified_svm.drop(['True Label_glove'], axis=1)


🔹 Common Misclassified Sentences (W2V + SVM vs. GloVe + SVM)


Unnamed: 0,Sentence,True Label_w2v,Predicted Label_w2v,Predicted Label_glove
0,"From symbol of peace to megaphone for war, #Pu...",3,2,2
1,LA #GUERRA GIUSTA NON ESISTE!\nINVOCARE #PACE ...,1,2,0
2,"But how nice,with a war going on,with bombs fa...",1,2,2
3,Moscow does not rule out Putin-Zelensky meetin...,3,2,2
4,"""The little girl with the candy""-and the rifle...",1,2,2
5,Photo from a store window on Moscow's Arbat St...,1,2,2
6,"Far away I will go; over sea and land, to say ...",3,2,0
7,"Americans, the rapists of War II. While omitti...",0,2,2
8,"Over 5 million refugees from #Ukraine, terribl...",0,2,2
9,"Moscow: ""The West has declared total war on Ru...",3,2,2


In [65]:
print("\n🔹 Common Misclassified Sentences (W2V + RF vs. GloVe + RF)")
common_misclassified_rf.drop(['True Label_glove'], axis=1)


🔹 Common Misclassified Sentences (W2V + RF vs. GloVe + RF)


Unnamed: 0,Sentence,True Label_w2v,Predicted Label_w2v,Predicted Label_glove
0,"From symbol of peace to megaphone for war, #Pu...",3,1,2
1,"But how nice,with a war going on,with bombs fa...",1,2,2
2,Moscow does not rule out Putin-Zelensky meetin...,3,2,2
3,"""The little girl with the candy""-and the rifle...",1,2,2
4,"Far away I will go; over sea and land, to say ...",3,2,0
5,"Americans, the rapists of War II. While omitti...",0,2,2
6,"""Being the president of #Ukraine is a lot easi...",0,3,2
7,"Over 5 million refugees from #Ukraine, terribl...",0,2,2
8,"Moscow: ""The West has declared total war on Ru...",3,1,2
9,Images of the lifeless bodies of citizens on t...,3,2,2


# comparing the POS Tags of word2vec and Glove

why? 

Cos when we wanna see if POS is affecting it 
It is not
What else could be the cause?

In [69]:
import nltk
from collections import defaultdict
import pandas as pd

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

# Function to get POS pattern
def get_pos_pattern(sentence):
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    return " ".join([tag for _, tag in pos_tags])

# Convert misclassified sentences into DataFrames
w2v_misclassified_svm = pd.DataFrame(misclassified["Word2Vec"]["SVM"], columns=["Sentence", "True Label", "Predicted Label"])
glove_misclassified_svm = pd.DataFrame(misclassified["GloVe"]["SVM"], columns=["Sentence", "True Label", "Predicted Label"])

w2v_misclassified_rf = pd.DataFrame(misclassified["Word2Vec"]["RF"], columns=["Sentence", "True Label", "Predicted Label"])
glove_misclassified_rf = pd.DataFrame(misclassified["GloVe"]["RF"], columns=["Sentence", "True Label", "Predicted Label"])

# Apply POS tagging
w2v_misclassified_svm["POS Pattern"] = w2v_misclassified_svm["Sentence"].apply(get_pos_pattern)
glove_misclassified_svm["POS Pattern"] = glove_misclassified_svm["Sentence"].apply(get_pos_pattern)

w2v_misclassified_rf["POS Pattern"] = w2v_misclassified_rf["Sentence"].apply(get_pos_pattern)
glove_misclassified_rf["POS Pattern"] = glove_misclassified_rf["Sentence"].apply(get_pos_pattern)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [67]:
# Merge misclassified sentences from both embeddings to find differences
svm_comparison = w2v_misclassified_svm.merge(glove_misclassified_svm, on="Sentence", suffixes=("_w2v", "_glove"))
rf_comparison = w2v_misclassified_rf.merge(glove_misclassified_rf, on="Sentence", suffixes=("_w2v", "_glove"))

# Filter only those where POS patterns are different
svm_mismatch = svm_comparison[svm_comparison["POS Pattern_w2v"] != svm_comparison["POS Pattern_glove"]]
rf_mismatch = rf_comparison[rf_comparison["POS Pattern_w2v"] != rf_comparison["POS Pattern_glove"]]

# Print sentences where POS patterns are different
print("\n🔹 Sentences Where Word2Vec and GloVe Have Different POS Tags (SVM)")
print(svm_mismatch[["Sentence", "POS Pattern_w2v", "POS Pattern_glove"]])

print("\n🔹 Sentences Where Word2Vec and GloVe Have Different POS Tags (Random Forest)")
print(rf_mismatch[["Sentence", "POS Pattern_w2v", "POS Pattern_glove"]])



🔹 Sentences Where Word2Vec and GloVe Have Different POS Tags (SVM)
Empty DataFrame
Columns: [Sentence, POS Pattern_w2v, POS Pattern_glove]
Index: []

🔹 Sentences Where Word2Vec and GloVe Have Different POS Tags (Random Forest)
Empty DataFrame
Columns: [Sentence, POS Pattern_w2v, POS Pattern_glove]
Index: []


# What could be the issues?

Well we don't want the basic reasons of size and blah blah

- *Irony and Sarcasm* : Word2Vec is one word per embedding, we are doing misinformation detection where sarcasm is prevalent. Now it can't capture all the meanings. This is a huge issue.
- Highly training data dependent - so it won't really know the other contexts ( this is already there but had to list it)
- Beyond a sentence it doesn't understand meanings, this is very bad in the context of misinformation. We have many repeating words but not all used in the same context this disadvantages us
- direct relationships

# Why is Glove better?

- Basic answer - global co-occurence matrix
- Global window, even though it also assigns one vector per word, since it operates globally unlike word2vec which operates locally, it does capture the multiple contexts of a word ( positive, negative etc)
- Vocab size
- indirect relationships
- handles polysemy better

# Improvement?

🔹 2. Hybrid Word Embeddings (Combine Word2Vec + GloVe + FastText)
💡 Why?
Each embedding model has strengths:
✔ Word2Vec captures local context.
✔ GloVe captures global co-occurrence patterns.
✔ FastText learns subword information, handling rare/misspelled words.

✅ How It Helps:
✔ Better handling of rare words (FastText).
✔ Combines local and global context (Word2Vec + GloVe).
✔ Improves sarcasm detection (by leveraging different embedding sources).

🔧 How to Implement?
We concatenate embeddings from multiple models:

