<a href="https://colab.research.google.com/github/Nirika-Lamichhane/Minor_Project-5-24-25-36-/blob/main/dev_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive automatically
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Imports and dataset loading
import pandas as pd
import numpy as np
!pip install gensim
import gensim
# Path to my dataset in Drive
dataset_path = "/content/drive/MyDrive/data_4000.txt"

# Loading dataset and adding headers
df = pd.read_csv(dataset_path, names=["comment","generalized_target","aspect","sentiment"])

print("Dataset loaded with shape:", df.shape)
print(df.head())

Dataset loaded with shape: (4076, 4)
                                          comment generalized_target  \
0          नागरिक अधिकार अझै सुनिश्चित गरिएको छैन              सरकार   
1          नागरिक अधिकार अझै सुनिश्चित गरिएको छैन      नागरिक अधिकार   
2         सार्वजनिक स्वास्थ्य सेवामा सुधार भएको छ    स्वास्थ्य विभाग   
3  भ्रष्टाचारको सामना गर्न कडा कदम चाल्न आवश्यक छ     भ्रष्ट अधिकारी   
4            अर्थतन्त्रमा सुधारको संकेत देखिएको छ              नेपाल   

       aspect sentiment  
0  Governance   Neutral  
1  Governance   Neutral  
2     Service  Positive  
3  Corruption  Positive  
4     Economy  Positive  


In [3]:
# Character n-gram tokenizer
def char_ngrams(text, n=3):
    """
    Generate character n-grams from a given text.
    Example: "यो भिडियो" with n=3 → ["यो ", "ो भ", " भिड", "िडि", "डियो"]
    """
    text = str(text).strip()
    return [text[i:i+n] for i in range(len(text)-n+1)]

# Quick test
sample_text = "यो भिडियो राम्रो छ"
print("Sample text:", sample_text)
print("Character 3-grams:", char_ngrams(sample_text, n=3))

Sample text: यो भिडियो राम्रो छ
Character 3-grams: ['यो ', 'ो भ', ' भि', 'भिड', 'िडि', 'डिय', 'ियो', 'यो ', 'ो र', ' रा', 'राम', 'ाम्', 'म्र', '्रो', 'रो ', 'ो छ']


In [4]:
# Loading FastText embeddings

# Path to my FastText Nepali embeddings in Drive
fasttext_path = "/content/drive/MyDrive/cc.ne.300.vec.gz"

# Loading the embeddings (Word2Vec format)
ft_model = gensim.models.KeyedVectors.load_word2vec_format(fasttext_path)

# Quick checks
print("Embedding dimension:", ft_model.vector_size)
print("Example vector for 'नेपाल':", ft_model['नेपाल'][:10])  # show first 10 values

Embedding dimension: 300
Example vector for 'नेपाल': [ 0.0624  0.0787 -0.0008 -0.0082  0.0259  0.0453  0.0057 -0.0182 -0.0275
 -0.0363]


In [5]:
# Converting comments into padded embedding matrices

def embed_comment(comment, n=3, max_len=50):
    """
    Convert a single comment into a fixed-length embedding matrix.
    - Tokenize into character n-grams (default n=3).
    - Map each n-gram to a FastText vector (300-dim).
    - Pad or truncate to max_len tokens.
    """
    ngrams = char_ngrams(comment, n)
    vectors = []
    for ng in ngrams:
        if ng in ft_model:
            vectors.append(ft_model[ng])
        else:
            vectors.append(np.zeros(ft_model.vector_size))
    # Pad / truncate
    if len(vectors) < max_len:
        pad = [np.zeros(ft_model.vector_size)] * (max_len - len(vectors))
        vectors.extend(pad)
    else:
        vectors = vectors[:max_len]
    return np.array(vectors)

# Building dataset embeddings
X = np.stack([embed_comment(c) for c in df['comment']])

# Converting sentiment labels to numeric
y = df['sentiment'].map({"positive":0,"neutral":1,"negative":2}).values

print("Embeddings shape:", X.shape)   # (num_samples, max_len, 300)
print("Labels shape:", y.shape)

Embeddings shape: (4076, 50, 300)
Labels shape: (4076,)


In [6]:
# Encoding aspect labels (categorical → numeric)
aspect_map = {a:i for i,a in enumerate(df['aspect'].unique())}
y_aspect = df['aspect'].map(aspect_map).values

print("Aspect categories:", aspect_map)
print("Aspect labels shape:", y_aspect.shape)

Aspect categories: {'Governance': 0, 'Service': 1, 'Corruption': 2, 'Economy': 3, 'Policy': 4, 'Positive': 5, 'governance': 6, 'corruption': 7, 'economy': 8, 'service': 9, 'policy': 10}
Aspect labels shape: (4076,)


In [7]:
# Generating BIO labels for targets

# Defining BIO tag set
bio_tags = {"O":0, "B-TARGET":1, "I-TARGET":2}

def generate_bio(comment, target, n=3, max_len=50):
    """
    Generate BIO labels for each n-gram token in a comment.
    - If token overlaps with target string → B/I-TARGET
    - Else → O
    """
    ngrams = char_ngrams(comment, n)
    labels = []
    for ng in ngrams:
        if target and ng in target:
            # First occurrence → B-TARGET, subsequent → I-TARGET
            if not labels or labels[-1] == bio_tags["O"]:
                labels.append(bio_tags["B-TARGET"])
            else:
                labels.append(bio_tags["I-TARGET"])
        else:
            labels.append(bio_tags["O"])
    # Padding / truncation
    if len(labels) < max_len:
        labels.extend([bio_tags["O"]] * (max_len - len(labels)))
    else:
        labels = labels[:max_len]
    return np.array(labels)

# Building BIO label dataset
y_bio = np.stack([generate_bio(c, t) for c,t in zip(df['comment'], df['generalized_target'])])

print("BIO labels shape:", y_bio.shape)   # (num_samples, max_len)

BIO labels shape: (4076, 50)


In [8]:
# Saving all outputs to Drive for use in another Module

# Save embeddings and labels
np.save("/content/drive/MyDrive/X.npy", X)                # embeddings
np.save("/content/drive/MyDrive/y_sentiment.npy", y)      # sentiment labels
np.save("/content/drive/MyDrive/y_aspect.npy", y_aspect)  # aspect labels
np.save("/content/drive/MyDrive/y_bio.npy", y_bio)        # BIO labels

print("✅ All outputs saved to Drive!")
print("Files created:")
print("- /content/drive/MyDrive/X.npy")
print("- /content/drive/MyDrive/y_sentiment.npy")
print("- /content/drive/MyDrive/y_aspect.npy")
print("- /content/drive/MyDrive/y_bio.npy")

✅ All outputs saved to Drive!
Files created:
- /content/drive/MyDrive/X.npy
- /content/drive/MyDrive/y_sentiment.npy
- /content/drive/MyDrive/y_aspect.npy
- /content/drive/MyDrive/y_bio.npy
