<a href="https://colab.research.google.com/github/Nirika-Lamichhane/Minor_Project-5-24-25-36-/blob/main/dev_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#imports
from google.colab import drive
import pandas as pd
!pip install gensim
import gensim
import numpy as np

In [None]:
# Mount Google Drive, then read the dataset file into a DataFrame.
# The file is read without a header row, so the column names are supplied here.
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/data_6000.txt'
df = pd.read_csv(
    dataset_path,
    names=["comment", "target", "aspect", "sentiment"],
    header=None,
)
print("Original Dataset:")
print(df.head())

In [None]:
# Drop target column.
# errors="ignore" keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=["target"], errors="ignore")

print("Dataset after dropping target column:")
print(df.head())

In [None]:
# Character n-gram tokenizer
def char_ngrams(text, n=3):
    """
    Generate overlapping character n-grams from a given text.

    The text is converted with ``str()`` and stripped of leading/trailing
    whitespace, then every window of ``n`` consecutive characters is returned
    in order. Windows are counted in Unicode code points, so a Devanagari
    vowel sign (matra) occupies its own position.

    Example: "यो भिडियो" with n=3 →
        ["यो ", "ो भ", " भि", "भिड", "िडि", "डिय", "ियो"]

    Args:
        text: The comment to tokenize. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text.
        n: Window size in characters; must be at least 1.

    Returns:
        A list of n-gram strings. Empty when the stripped text is shorter
        than ``n`` or the input is missing.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    # A missing value must not be stringified: str(nan) == "nan" would
    # otherwise produce the bogus n-gram "nan" for every empty comment.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text).strip()
    return [text[i:i+n] for i in range(len(text)-n+1)]

# Tokenize every comment into character trigrams and store the lists in a new column.
# Series.apply forwards the extra keyword argument (n=3) to char_ngrams.
df['char_ngrams'] = df['comment'].apply(char_ngrams, n=3)

print("Tokenized sample:")
print(df[['comment','char_ngrams']].head())

In [None]:
# Location of the pre-trained FastText vectors (gzipped word2vec text format) in Drive
fasttext_path = '/content/drive/MyDrive/cc.ne.300.vec.gz'

# binary=False is the library default; it is spelled out because the file is read as text.
# NOTE(review): a .vec file is presumably keyed by whole words, while later cells look up
# character n-grams in it — confirm how many n-grams actually hit the vocabulary.
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format(
    fasttext_path,
    binary=False,
)

print("FastText model loaded with vocab size:", len(fasttext_model.key_to_index))

In [10]:
# Function to embed a single comment
def embed_comment(comment, n=3, max_len=50):
    """
    Convert a single comment into a fixed-length embedding matrix.

    - Tokenize into character n-grams (default n=3) with ``char_ngrams``.
    - Look each n-gram up in ``fasttext_model``; n-grams that are not in the
      model's vocabulary are represented by an all-zero row.
    - Pad with zero rows, or truncate, to exactly ``max_len`` rows.

    Args:
        comment: The text to embed.
        n: Character n-gram size passed to ``char_ngrams``.
        max_len: Number of rows in the returned matrix; must be >= 0.

    Returns:
        A NumPy array of shape ``(max_len, fasttext_model.vector_size)`` whose
        dtype is the dtype of the model's own vectors.
    """
    dim = fasttext_model.vector_size
    # Pre-allocate the whole matrix: padding rows and out-of-vocabulary rows are
    # already zero, the shape is correct even when there are no n-grams, and the
    # dtype matches the model instead of silently upcasting to float64 (which
    # happened when float64 zero rows were mixed with the model's vectors).
    matrix = np.zeros((max_len, dim), dtype=fasttext_model.vectors.dtype)
    # Only the first max_len n-grams can appear in the output, so truncate
    # before doing any lookups.
    for row, ng in enumerate(char_ngrams(comment, n)[:max_len]):
        if ng in fasttext_model.key_to_index:
            matrix[row] = fasttext_model[ng]
    return matrix

# Embed each comment, then stack the per-comment matrices along a new leading axis
X = np.stack(list(map(embed_comment, df['comment'])))

print("Embeddings shape:", X.shape)   # (num_samples, max_len, vector_size)

Embeddings shape: (6125, 50, 300)


In [11]:
# Sentiment encoding
sentiment_map = {"positive":0, "neutral":1, "negative":2}

# Normalize surrounding whitespace and letter case before the lookup so that
# values such as " Positive" still match a key; non-string cells (e.g. missing
# values) are passed through unchanged.
sentiment_clean = df['sentiment'].map(
    lambda label: label.strip().lower() if isinstance(label, str) else label
)
y_sentiment = sentiment_clean.map(sentiment_map).values

# Labels that are missing or not in sentiment_map become NaN. The rows are kept
# so y_sentiment stays aligned row-for-row with df, but they are reported here
# instead of passing silently.
sentiment_unmapped = pd.isna(y_sentiment)
if sentiment_unmapped.any():
    print(
        "WARNING:", int(sentiment_unmapped.sum()),
        "rows have a missing or unrecognized sentiment label:",
        df.loc[sentiment_unmapped, 'sentiment'].unique(),
    )

print("Unique sentiment labels:", np.unique(y_sentiment))

Unique sentiment labels: [ 0.  1.  2. nan]


In [12]:
# Define fixed aspect mapping
aspect_map = {
    "policy":0,
    "governance":1,
    "service":2,
    "economy":3,
    "corruption":4
}

# Normalize surrounding whitespace and letter case before the lookup so that
# values such as " Policy" still match a key; non-string cells (e.g. missing
# values) are passed through unchanged.
aspect_clean = df['aspect'].map(
    lambda label: label.strip().lower() if isinstance(label, str) else label
)
y_aspect = aspect_clean.map(aspect_map).values

# Labels that are missing or not in aspect_map become NaN. The rows are kept so
# y_aspect stays aligned row-for-row with df, but they are reported here
# instead of passing silently.
aspect_unmapped = pd.isna(y_aspect)
if aspect_unmapped.any():
    print(
        "WARNING:", int(aspect_unmapped.sum()),
        "rows have a missing or unrecognized aspect label:",
        df.loc[aspect_unmapped, 'aspect'].unique(),
    )

print("Unique aspect labels:", np.unique(y_aspect))

Unique aspect labels: [ 0.  1.  2.  3.  4. nan]


In [13]:
# Report the shape of the feature tensor and of both label vectors in one pass;
# all three are expected to share the same first dimension (num_samples).
for caption, array in (
    ("Embeddings shape:", X),              # (num_samples, max_len, 300)
    ("Sentiment labels shape:", y_sentiment),  # (num_samples,)
    ("Aspect labels shape:", y_aspect),        # (num_samples,)
):
    print(caption, array.shape)

Embeddings shape: (6125, 50, 300)
Sentiment labels shape: (6125,)
Aspect labels shape: (6125,)
