In [None]:
import spacy
# Install the medium English pipeline only when it is not already present, so
# re-running this cell does not repeat the network download.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ngrams, FreqDist, ConditionalFreqDist
from collections import defaultdict

# Read the translated corpus; every text is coerced to str before tokenising.
data = pd.read_csv("/kaggle/input/filtered-and-translated-nlp/filr.csv")
texts = data["Translated"].astype(str).tolist()
labels = data["Label"].tolist()

# Lower-case each text, then split it into word tokens.
tokenized_texts = [word_tokenize(lowered) for lowered in map(str.lower, texts)]

# Tag every token list with NLTK part-of-speech labels.
pos_tagged_texts = list(map(pos_tag, tokenized_texts))

### 1. Word2Vec + Classifiers (SVC & RF) ###
# Train 100-dimensional Word2Vec embeddings on the tokenised corpus itself.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_w2v_features(tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens.

    Args:
        tokens: Iterable of word strings.

    Returns:
        1-D array of length ``word2vec_model.vector_size``: the mean of the
        vectors of the tokens found in ``word2vec_model.wv``, or an all-zero
        vector when none of the tokens is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv
    known_vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not known_vectors:
        # np.mean over an empty list yields a scalar NaN (plus a RuntimeWarning),
        # which would break stacking the features into a 2-D matrix.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Build one averaged-embedding row per document; documents without tokens get a
# zero vector sized from the trained model rather than a hard-coded 100.
X_w2v = np.array([
    get_w2v_features(tokens) if len(tokens) > 0 else np.zeros(word2vec_model.vector_size)
    for tokens in tokenized_texts
])
X_train, X_test, y_train, y_test = train_test_split(X_w2v, labels, test_size=0.2, random_state=42)

# probability=True fits an internal cross-validated calibration that shuffles the
# data, so the SVM is seeded as well; otherwise predict_proba varies run to run.
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
svm_classifier.fit(X_train, y_train)
random_forest_classifier.fit(X_train, y_train)

# Hard predictions on the held-out 20% split.
y_pred_svm = svm_classifier.predict(X_test)
y_pred_rf = random_forest_classifier.predict(X_test)

### 2. GloVe + Classifiers (SVC & RF) ###
# Load the spaCy pipeline whose document vectors feed the "GloVe" classifiers.
# NOTE(review): the vectors are whatever en_core_web_md ships with; confirm they
# are GloVe vectors for the installed spaCy/model version.
nlp = spacy.load("en_core_web_md")  # Pre-trained GloVe vectors

def get_glove_features(text):
    """Return the document-level vector the loaded spaCy pipeline gives ``text``."""
    document = nlp(text)
    return document.vector

# One spaCy document vector per text.
X_glove = np.array([get_glove_features(text) for text in texts])
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(X_glove, labels, test_size=0.2, random_state=42)

# probability=True fits an internal cross-validated calibration that shuffles the
# data, so the SVM is seeded as well; otherwise predict_proba varies run to run.
svm_classifier_glove = SVC(kernel='linear', probability=True, random_state=42)
random_forest_classifier_glove = RandomForestClassifier(n_estimators=100, random_state=42)
svm_classifier_glove.fit(X_train_glove, y_train_glove)
random_forest_classifier_glove.fit(X_train_glove, y_train_glove)

# Hard predictions on the held-out 20% split.
y_pred_svm_glove = svm_classifier_glove.predict(X_test_glove)
y_pred_rf_glove = random_forest_classifier_glove.predict(X_test_glove)

# For each model: its test-set predictions, the fitted classifier, and the full
# feature matrix (all dataset rows) that the classifier's split was drawn from.
predictions = {
    "SVM (Word2Vec)": (y_pred_svm, svm_classifier, X_w2v),
    "Random Forest (Word2Vec)": (y_pred_rf, random_forest_classifier, X_w2v),
    "SVM (GloVe)": (y_pred_svm_glove, svm_classifier_glove, X_glove),
    "Random Forest (GloVe)": (y_pred_rf_glove, random_forest_classifier_glove, X_glove),
}

# Recover which dataset rows landed in the test split. train_test_split draws the
# same permutation for equal-length inputs and an equal random_state, so splitting
# the row numbers reproduces the split behind X_test / y_test.
_, test_indices = train_test_split(np.arange(len(texts)), test_size=0.2, random_state=42)
y_test_array = np.asarray(y_test)

correctly_classified_sentences = {}
misclassified_sentences = {}

for model, (preds, classifier, features) in predictions.items():
    # One batched call; row k holds the class probabilities of test sample k.
    test_probabilities = classifier.predict_proba(features[test_indices])

    # These are positions inside the test split, NOT dataset row numbers; they
    # are translated through test_indices before touching texts / POS tags.
    correctly_classified_indices = np.where(preds == y_test_array)[0]
    misclassified_indices = np.where(preds != y_test_array)[0]

    # Each entry: (sentence text, its POS-tagged tokens, class-probability vector).
    correctly_classified_sentences[model] = [
        (texts[test_indices[k]], pos_tagged_texts[test_indices[k]], test_probabilities[k])
        for k in correctly_classified_indices
    ]
    misclassified_sentences[model] = [
        (texts[test_indices[k]], pos_tagged_texts[test_indices[k]], test_probabilities[k])
        for k in misclassified_indices
    ]

# Index every model's misclassified entries by sentence text once, so each
# correctly classified sentence is matched by lookup instead of a full rescan.
misclassified_by_text = {}
for other_model, entries in misclassified_sentences.items():
    entries_by_text = misclassified_by_text.setdefault(other_model, {})
    for entry in entries:
        entries_by_text.setdefault(entry[0], []).append(entry)

# Report every sentence that one model got right and another got wrong.
for model in correctly_classified_sentences:
    for text, correct_tags, correct_probs in correctly_classified_sentences[model]:
        for other_model in misclassified_sentences:
            for misclassified_entry in misclassified_by_text[other_model].get(text, []):
                _, misclassified_tags, misclassified_probs = misclassified_entry
                print(f"Sentence: {text}")
                print(f"Correctly classified by: {model}")
                print(f"Misclassified by: {other_model}")
                # The stored probabilities hold one value per class, not one per
                # token, so they are printed as a vector instead of being zipped
                # with the (word, tag) pairs.
                print("Correct POS Tags & Probabilities:")
                print(" ".join(f"{word} ({tag})" for word, tag in correct_tags))
                print("Class probabilities: " + ", ".join(f"{prob:.2f}" for prob in correct_probs))
                print("Misclassified POS Tags & Probabilities:")
                print(" ".join(f"{word} ({tag})" for word, tag in misclassified_tags))
                print("Class probabilities: " + ", ".join(f"{prob:.2f}" for prob in misclassified_probs))
                print("-")

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ngrams, FreqDist, ConditionalFreqDist
from collections import defaultdict

# Read the translated corpus; every text is coerced to str before tokenising.
data = pd.read_csv("/kaggle/input/filtered-and-translated-nlp/filr.csv")
texts = data["Translated"].astype(str).tolist()
labels = data["Label"].tolist()

# Lower-case each text, then split it into word tokens.
tokenized_texts = [word_tokenize(lowered) for lowered in map(str.lower, texts)]

# Tag every token list with NLTK part-of-speech labels.
pos_tagged_texts = list(map(pos_tag, tokenized_texts))

### 1. Word2Vec + Classifiers (SVC & RF) ###
# Train 100-dimensional Word2Vec embeddings on the tokenised corpus itself.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_w2v_features(tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens.

    Args:
        tokens: Iterable of word strings.

    Returns:
        1-D array of length ``word2vec_model.vector_size``: the mean of the
        vectors of the tokens found in ``word2vec_model.wv``, or an all-zero
        vector when none of the tokens is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv
    known_vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not known_vectors:
        # np.mean over an empty list yields a scalar NaN (plus a RuntimeWarning),
        # which would break stacking the features into a 2-D matrix.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Build one averaged-embedding row per document; documents without tokens get a
# zero vector sized from the trained model rather than a hard-coded 100.
X_w2v = np.array([
    get_w2v_features(tokens) if len(tokens) > 0 else np.zeros(word2vec_model.vector_size)
    for tokens in tokenized_texts
])
X_train, X_test, y_train, y_test = train_test_split(X_w2v, labels, test_size=0.2, random_state=42)

# probability=True fits an internal cross-validated calibration that shuffles the
# data, so the SVM is seeded as well; otherwise predict_proba varies run to run.
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
svm_classifier.fit(X_train, y_train)
random_forest_classifier.fit(X_train, y_train)

# Hard predictions on the held-out 20% split.
y_pred_svm = svm_classifier.predict(X_test)
y_pred_rf = random_forest_classifier.predict(X_test)

def evaluate_model(y_test, y_pred, model_name):
    """Print a classification report and draw a labelled confusion matrix.

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Predicted labels, aligned with ``y_test``.
        model_name: Display name used in the report header and the plot title.
    """
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

    # Pin the class order explicitly so the heatmap ticks can show the actual
    # class labels instead of positional indices 0..n-1.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Explicit figure/axes keeps successive calls from drawing on shared state.
    fig, ax = plt.subplots()
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'{model_name} Confusion Matrix')
    plt.show()

# Report and plot the two Word2Vec-based classifiers on the held-out split.
evaluate_model(y_test, y_pred_svm, "SVM (Word2Vec)")
evaluate_model(y_test, y_pred_rf, "Random Forest (Word2Vec)")

### 2. GloVe + Classifiers (SVC & RF) ###
# Load the spaCy pipeline whose document vectors feed the "GloVe" classifiers.
# NOTE(review): confirm the vectors bundled with en_core_web_md are GloVe for
# the installed spaCy/model version.
nlp = spacy.load("en_core_web_md")  # Pre-trained GloVe vectors

def get_glove_features(text):
    """Return the document-level vector the loaded spaCy pipeline gives ``text``."""
    document = nlp(text)
    return document.vector

# One spaCy document vector per text.
X_glove = np.array([get_glove_features(text) for text in texts])
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(X_glove, labels, test_size=0.2, random_state=42)

# probability=True fits an internal cross-validated calibration that shuffles the
# data, so the SVM is seeded as well; otherwise predict_proba varies run to run.
svm_classifier_glove = SVC(kernel='linear', probability=True, random_state=42)
random_forest_classifier_glove = RandomForestClassifier(n_estimators=100, random_state=42)
svm_classifier_glove.fit(X_train_glove, y_train_glove)
random_forest_classifier_glove.fit(X_train_glove, y_train_glove)

# Hard predictions on the held-out 20% split.
y_pred_svm_glove = svm_classifier_glove.predict(X_test_glove)
y_pred_rf_glove = random_forest_classifier_glove.predict(X_test_glove)

# Report and plot the two classifiers trained on the spaCy document vectors.
evaluate_model(y_test_glove, y_pred_svm_glove, "SVM (GloVe)")
evaluate_model(y_test_glove, y_pred_rf_glove, "Random Forest (GloVe)")

### 3. Extracting POS Tag Variations ###
def extract_possible_tags(pos_tagged_tokens):
    """Group the POS tags seen for each word of one tagged sentence.

    Args:
        pos_tagged_tokens: Iterable of ``(word, tag)`` pairs.

    Returns:
        ``defaultdict(set)`` mapping each word to the set of tags it carried.
    """
    tags_by_word = defaultdict(set)
    for token, pos in pos_tagged_tokens:
        tags_by_word[token] |= {pos}
    return tags_by_word

# np.where below yields positions inside the test split, not dataset rows.
# train_test_split draws the same permutation for equal-length inputs and an
# equal random_state, so splitting the row numbers recovers which dataset row
# each test position refers to.
_, test_indices = train_test_split(np.arange(len(pos_tagged_texts)), test_size=0.2, random_state=42)
y_test_array = np.asarray(y_test)

correctly_classified_indices = np.where(y_pred_svm == y_test_array)[0]
misclassified_indices = np.where(y_pred_svm != y_test_array)[0]

# Per-sentence {word: set(tags)} maps, looked up through test_indices so they
# describe the sentences the Word2Vec SVM was actually evaluated on.
correctly_classified_tags = [extract_possible_tags(pos_tagged_texts[test_indices[k]]) for k in correctly_classified_indices]
misclassified_tags = [extract_possible_tags(pos_tagged_texts[test_indices[k]]) for k in misclassified_indices]

# Count, per word, in how many correctly classified sentences each tag appeared.
word_tag_dist = ConditionalFreqDist()
for tag_set in correctly_classified_tags:
    for word, tags in tag_set.items():
        for tag in tags:
            word_tag_dist[word][tag] += 1

# Same counts over the misclassified sentences.
misclassified_word_tag_dist = ConditionalFreqDist()
for tag_set in misclassified_tags:
    for word, tags in tag_set.items():
        for tag in tags:
            misclassified_word_tag_dist[word][tag] += 1

### 4. Compare Probability Distributions of POS Tag Variations ###
def plot_tag_variation_distribution(word_tag_dist, title, top_n=20):
    """Bar-plot how many distinct POS tags each multi-tag word received.

    Args:
        word_tag_dist: Mapping of word -> container of tags (its ``len`` is the
            number of distinct tags), e.g. a ``ConditionalFreqDist``.
        title: Plot title.
        top_n: Number of words to show, taken in the mapping's iteration order.
            Defaults to 20.
    """
    ambiguous_words = [(word, len(tags)) for word, tags in word_tag_dist.items() if len(tags) > 1]
    if not ambiguous_words:
        # zip(*[]) cannot be unpacked into two names, so bail out instead of
        # raising when no word has more than one tag.
        print(f"{title}: no words with more than one POS tag to plot.")
        return

    words, variations = zip(*ambiguous_words[:top_n])
    plt.figure(figsize=(12,6))
    plt.barh(words, variations)
    plt.xlabel("Number of POS Tag Variations")
    plt.ylabel("Words")
    plt.title(title)
    # First listed word at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()

# Compare tag ambiguity between sentences the Word2Vec SVM got right and wrong.
plot_tag_variation_distribution(word_tag_dist, "Correctly Classified: POS Tag Variations")
plot_tag_variation_distribution(misclassified_word_tag_dist, "Misclassified: POS Tag Variations")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# Collect data for visualization.
# Each stored entry is (text, pos_tags, vector). The vector is the third element
# saved by the classification-comparison cell, i.e. the classifier's
# predict_proba output for that sentence, not the sentence embedding itself.

# First-occurrence label per text. The texts were stringified when the dataset
# was loaded, so the same conversion is applied here; a direct
# data["Translated"] == text comparison finds no row for missing values.
label_by_text = {}
for text_key, label_value in zip(data["Translated"].astype(str), data["Label"]):
    label_by_text.setdefault(text_key, label_value)

correct_glove, wrong_glove, correct_w2v, wrong_w2v = [], [], [], []
group_sources = [
    ("Correct GloVe", correct_glove, correctly_classified_sentences.get("SVM (GloVe)", [])),
    ("Wrong GloVe", wrong_glove, misclassified_sentences.get("SVM (GloVe)", [])),
    ("Correct Word2Vec", correct_w2v, correctly_classified_sentences.get("SVM (Word2Vec)", [])),
    ("Wrong Word2Vec", wrong_w2v, misclassified_sentences.get("SVM (Word2Vec)", [])),
]

# Vector, group name and true label are appended together, so the three lists
# stay aligned without slicing a shared label list afterwards.
# `group_labels` is used instead of `labels` so the dataset's label list defined
# by the loading cell is not overwritten.
all_features = []
group_labels = []
all_true_labels = []  # Store corresponding labels
for group_name, bucket, entries in group_sources:
    for text, _, class_probabilities in entries:
        bucket.append(class_probabilities)
        all_features.append(class_probabilities)
        group_labels.append(group_name)
        all_true_labels.append(label_by_text[text])

label_map = {
    "Correct GloVe": correct_glove,
    "Wrong GloVe": wrong_glove,
    "Correct Word2Vec": correct_w2v,
    "Wrong Word2Vec": wrong_w2v
}

# t-SNE needs at least two samples and a perplexity below the sample count.
if len(all_features) > 1:
    all_features = np.vstack(all_features)

    # Apply t-SNE
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30.0, len(all_features) - 1))
    reduced_features = tsne.fit_transform(all_features)

    # Define colors for each embedding type
    palette = {
        "Correct GloVe": "green",
        "Wrong GloVe": "red",
        "Correct Word2Vec": "blue",
        "Wrong Word2Vec": "purple"
    }

    # Define marker styles based on label classes
    unique_labels = list(set(all_true_labels))
    markers = ['o', 's', '^', 'D', 'v', 'P', '*', 'X', '<', '>']  # Assign unique markers
    marker_map = {label: markers[i % len(markers)] for i, label in enumerate(unique_labels)}

    # Plot
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=group_labels, palette=palette, style=all_true_labels, markers=marker_map)

    # The title names what is actually projected: SVM class-probability vectors.
    plt.title("t-SNE of SVM Class Probabilities (GloVe vs Word2Vec) with True Labels")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.legend(title="Embedding Type & True Labels", bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.show()
else:
    print("No data available for visualization.")


## Multivariate Bayesian

In [None]:
from collections import defaultdict

# Bucket the Word2Vec vector of every in-vocabulary token under its POS tag.
pos_tagged_vectors = defaultdict(list)

for sentence_tokens, sentence_tags in zip(tokenized_texts, pos_tagged_texts):
    for token, tagged_pair in zip(sentence_tokens, sentence_tags):
        if token not in word2vec_model.wv:
            continue
        # Only the tag half of the (word, tag) pair is needed; the vector is
        # looked up with the token taken from tokenized_texts.
        pos_tagged_vectors[tagged_pair[1]].append(word2vec_model.wv[token])


In [None]:
from scipy.stats import multivariate_normal

# Fit one multivariate Gaussian per POS tag over the Word2Vec vectors collected
# in pos_tagged_vectors; tags with fewer than two vectors are skipped.
# NOTE(review): the loop variables (`vectors`, `mean_vector`, `cov_matrix`) stay
# defined after the loop, and the next cell reads `vectors` from here.
gaussian_models = {}

for pos_tag_label, vectors in pos_tagged_vectors.items():
    # Stack the per-token vectors into one array (one row per token occurrence).
    vectors = np.array(vectors)
    
    if len(vectors) < 2:
        # Not enough samples for covariance, handle gracefully
        print(f"Skipping POS tag '{pos_tag_label}' due to insufficient data ({len(vectors)} sample).")
        continue
    
    mean_vector = np.mean(vectors, axis=0)
    # rowvar=False: columns are the variables, rows are the observations.
    cov_matrix = np.cov(vectors, rowvar=False)
    
    # Fix for numerical stability: add a small multiple of the identity to the
    # covariance before handing it to multivariate_normal.
    if cov_matrix.ndim == 0:  # In case it's scalar
        cov_matrix = np.eye(vectors.shape[1]) * 1e-6
    else:
        cov_matrix += np.eye(cov_matrix.shape[0]) * 1e-6
    
    gaussian_models[pos_tag_label] = multivariate_normal(mean=mean_vector, cov=cov_matrix)


In [None]:
# NOTE(review): scratch fragment. `vectors` is not defined in this cell; it is
# whatever the last iteration of the per-tag fitting loop in the previous cell
# left behind, and the mean_vector / cov_matrix assigned here are not used to
# build any model in this cell. vectors[0] raises IndexError if `vectors` is empty.
if len(vectors) < 2:
    mean_vector = vectors[0]
    cov_matrix = np.eye(len(mean_vector)) * 1e-6


## Multivariate try again

In [None]:
from collections import defaultdict
import numpy as np

# Collect embeddings for each POS tag
pos_embeddings = defaultdict(list)

# `tag` (not `pos_tag`) is the loop variable so that the nltk `pos_tag` function
# imported at the top of the notebook is not overwritten by a string.
for sentence in pos_tagged_texts:
    for word, tag in sentence:
        # Look the vector up directly in the trained Word2Vec model, as the other
        # per-tag collection cells do; get_word2vec_embedding is not defined in
        # the visible notebook.
        if word in word2vec_model.wv:
            pos_embeddings[tag].append(word2vec_model.wv[word])

# Convert lists to arrays
for tag in pos_embeddings:
    pos_embeddings[tag] = np.array(pos_embeddings[tag])


In [None]:
from scipy.stats import multivariate_normal

# Fit one multivariate Gaussian per POS tag over the arrays in pos_embeddings.
gaussian_models = {}

# `tag` (not `pos_tag`) is the loop variable so that the nltk `pos_tag` function
# imported at the top of the notebook is not overwritten by a string.
for tag, embeddings in pos_embeddings.items():
    if len(embeddings) < 2:
        print(f"Skipping POS tag '{tag}' due to insufficient data ({len(embeddings)} sample(s)).")
        continue

    mean_vector = np.mean(embeddings, axis=0)
    # rowvar=False: columns are the variables, rows are the observations.
    cov_matrix = np.cov(embeddings, rowvar=False)

    # Handle singular covariance matrices
    if cov_matrix.ndim == 0 or cov_matrix.shape[0] == 0:
        print(f"Skipping POS tag '{tag}' due to invalid covariance matrix.")
        continue
    # Small diagonal term added before passing the matrix to multivariate_normal.
    cov_matrix += np.eye(cov_matrix.shape[0]) * 1e-6

    gaussian_models[tag] = multivariate_normal(mean=mean_vector, cov=cov_matrix)

print(f"Trained {len(gaussian_models)} POS tag Gaussian models.")


In [None]:
def sentence_log_probability(pos_tagged_sentence, gaussian_models, embedding_lookup=None):
    """Sum the per-token Gaussian log-densities of a POS-tagged sentence.

    Args:
        pos_tagged_sentence: Iterable of ``(word, tag)`` pairs.
        gaussian_models: Dict mapping a POS tag to a frozen
            ``scipy.stats.multivariate_normal`` distribution.
        embedding_lookup: Optional mapping supporting ``word in lookup`` and
            ``lookup[word]``. Defaults to ``word2vec_model.wv``.

    Returns:
        The summed log-density. A token contributes ``-1e6`` when its tag has no
        model, when the word has no embedding, or when its log-density is not
        finite. An empty sentence scores 0.
    """
    if embedding_lookup is None:
        embedding_lookup = word2vec_model.wv

    penalty = -1e6  # Fixed cost for tokens that cannot be scored
    total_log_prob = 0

    for word, tag in pos_tagged_sentence:
        model = gaussian_models.get(tag)
        if model is None or word not in embedding_lookup:
            total_log_prob += penalty
            continue

        # logpdf works in log space; computing pdf first underflows to 0 (or
        # overflows) for high-dimensional Gaussians, which used to turn almost
        # every token into the penalty value.
        log_prob = model.logpdf(embedding_lookup[word])
        total_log_prob += log_prob if np.isfinite(log_prob) else penalty

    return total_log_prob



In [None]:
from tqdm import tqdm


# Score every POS-tagged sentence (with a progress bar) and attach the scores to
# the dataset as a new column.
sentence_log_probs = [
    sentence_log_probability(tagged_sentence, gaussian_models)
    for tagged_sentence in tqdm(pos_tagged_texts)
]

data['Sentence_Log_Probability'] = sentence_log_probs


In [None]:
# Work on just the text and its score.
scored_sentences = data[['Translated', 'Sentence_Log_Probability']]

# Top probable sentences
most_probable_first = scored_sentences.sort_values(by='Sentence_Log_Probability', ascending=False)
print(most_probable_first.head())

# Least probable sentences
least_probable_first = scored_sentences.sort_values(by='Sentence_Log_Probability')
print(least_probable_first.head())


## Bayesian and Prior Probabilities

In [None]:


import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm
from scipy.stats import multivariate_normal

# 1️⃣ Compute prior probabilities of POS tags
# Flatten the tags of every sentence, then turn the counts into relative frequencies.
all_pos_tags = []
for tagged_sentence in pos_tagged_texts:
    all_pos_tags.extend(pos for _, pos in tagged_sentence)

tag_counts = Counter(all_pos_tags)
total_tags = len(all_pos_tags)
pos_tag_priors = dict(
    (pos, occurrences / total_tags) for pos, occurrences in tag_counts.items()
)

# 2️⃣ Train Multivariate Gaussians for each POS tag
# Gather the Word2Vec vector of each in-vocabulary token under its tag.
pos_tag_embeddings = defaultdict(list)

for tagged_sentence in tqdm(pos_tagged_texts):
    for token, pos in tagged_sentence:
        if token not in word2vec_model.wv:
            continue
        pos_tag_embeddings[pos].append(word2vec_model.wv[token])

min_samples = 5  # Ignore tags with too few samples
gaussian_models = {}

for pos, tag_vectors in pos_tag_embeddings.items():
    n_vectors = len(tag_vectors)
    if n_vectors >= min_samples:
        stacked = np.array(tag_vectors)
        # Columns are variables (rowvar=False); a small diagonal term is added
        # before the matrix is passed to multivariate_normal.
        covariance = np.cov(stacked, rowvar=False)
        covariance += np.eye(covariance.shape[0]) * 1e-6
        gaussian_models[pos] = multivariate_normal(mean=np.mean(stacked, axis=0), cov=covariance)
    else:
        print(f"Skipping POS tag '{pos}' due to insufficient data ({n_vectors} samples).")


# 3️⃣ Prediction function with priors
def predict_pos_tags_with_bayes(tokens, embedding_lookup=None, models=None, priors=None):
    """Assign each token the POS tag with the highest posterior score.

    The score of a tag is ``log p(embedding | tag) + log p(tag)``.

    Args:
        tokens: Iterable of word strings.
        embedding_lookup: Optional mapping supporting ``word in lookup`` and
            ``lookup[word]``. Defaults to ``word2vec_model.wv``.
        models: Optional dict of tag -> frozen ``multivariate_normal``.
            Defaults to the module-level ``gaussian_models``.
        priors: Optional dict of tag -> prior probability. Defaults to the
            module-level ``pos_tag_priors``; tags missing from it use 1e-6.

    Returns:
        List of ``(word, predicted_tag)`` pairs. The tag is ``'UNK'`` for words
        without an embedding, or when no Gaussian models are available.
    """
    if embedding_lookup is None:
        embedding_lookup = word2vec_model.wv
    if models is None:
        models = gaussian_models
    if priors is None:
        priors = pos_tag_priors

    predicted_tags = []
    for word in tokens:
        predicted_tag = 'UNK'  # Unknown word, or nothing to score against
        if models and word in embedding_lookup:
            embedding = embedding_lookup[word]
            # Scores are compared in log space: the raw densities of a
            # high-dimensional Gaussian underflow to 0 for every tag, which made
            # max() return whichever tag happened to come first.
            tag_scores = {
                tag: gaussian.logpdf(embedding) + np.log(priors.get(tag, 1e-6))
                for tag, gaussian in models.items()
            }
            predicted_tag = max(tag_scores, key=tag_scores.get)
        predicted_tags.append((word, predicted_tag))
    return predicted_tags


In [None]:
# Use the first POS-tagged sentence of the dataset as the sample and keep only
# its words (the reference tags are dropped).
sample_sentence_from_data = pos_tagged_texts[0]
sample_words = [pair[0] for pair in sample_sentence_from_data]

# Run the Bayesian tagger on the sample words.
predicted_tags = predict_pos_tags_with_bayes(sample_words)

# Print one "word → tag" line per token.
print("\nPredicted POS tags (with priors) for a sample sentence from the dataset:")
for token, predicted in predicted_tags:
    print(f"{token} → {predicted}")


In [None]:
import pandas as pd

# Tag every sentence in the dataset and keep one record per sentence.
all_predictions = []

for tagged_sentence in tqdm(pos_tagged_texts):
    sentence_words = [pair[0] for pair in tagged_sentence]
    tagged_output = predict_pos_tags_with_bayes(sentence_words)

    record = {
        "sentence": " ".join(sentence_words),
        "predicted_tags": " ".join(f"{token}/{predicted}" for token, predicted in tagged_output),
    }
    all_predictions.append(record)

# Tabulate the records for viewing and saving.
predictions_df = pd.DataFrame(all_predictions)

# Preview the first rows.
print(predictions_df.head())

# Write the table to CSV in the working directory (no index column).
predictions_df.to_csv("pos_tag_predictions.csv", index=False)
print("Predictions saved to pos_tag_predictions.csv")
