<a href="https://colab.research.google.com/github/Nirika-Lamichhane/Minor_Project-5-24-25-36-/blob/main/training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gensim
# Imports
from google.colab import drive
import pandas as pd
import gensim
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns



In [63]:
# Attach Google Drive to the Colab runtime, then read the labelled comments from it
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/data_6000.txt'

# The file is read without a header row, so the four column names are supplied here
column_names = ["comment", "target", "aspect", "sentiment"]
df = pd.read_csv(dataset_path, names=column_names, header=None)

print("Original Dataset:")
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Original Dataset:
                                             comment         target  \
0  अब एकमात्र अबको प्रधानमन्त्री हर्क सम्पाङ सबै ...  राजनीतिक नेता   
1  सिंहदरबार जलेको दृश्य देख्दा धेरै पीडा हुन्छ अ...  सरकारी संस्था   
2  राजनीतिक दलको इच्छा बुझ्ने प्रयास कहिल्यै भएको...    राजनीतिक दल   
3  दश वर्षे जनयुद्ध देशमा पैसा लुट्न गरिएको जस्तो...    राजनीतिक दल   
4             कति राम्रो कुरा गर्नुभएको गज्जब लाग्यो         मिडिया   

       aspect sentiment  
0  corruption  positive  
1  governance  negative  
2  governance  negative  
3  corruption  negative  
4     service  positive  


In [64]:
# Drop the target column; only comment, aspect and sentiment are kept.
# errors="ignore" keeps this cell re-runnable: df is reassigned in place, so a
# second execution would otherwise raise KeyError because "target" is already gone.
df = df.drop(columns=["target"], errors="ignore")

print("Dataset after dropping target column:")
print(df.head())


Dataset after dropping target column:
                                             comment      aspect sentiment
0  अब एकमात्र अबको प्रधानमन्त्री हर्क सम्पाङ सबै ...  corruption  positive
1  सिंहदरबार जलेको दृश्य देख्दा धेरै पीडा हुन्छ अ...  governance  negative
2  राजनीतिक दलको इच्छा बुझ्ने प्रयास कहिल्यै भएको...  governance  negative
3  दश वर्षे जनयुद्ध देशमा पैसा लुट्न गरिएको जस्तो...  corruption  negative
4             कति राम्रो कुरा गर्नुभएको गज्जब लाग्यो     service  positive


In [65]:
# Character n-gram tokenizer
def char_ngrams(text, n=3):
    """
    Generate overlapping character n-grams from a given text.

    The text is converted with str() and stripped of leading/trailing
    whitespace, then every window of n consecutive characters (Unicode code
    points, so Devanagari vowel signs count as separate characters) is emitted.

    Example: "यो भिडियो" with n=3 →
        ["यो ", "ो भ", " भि", "भिड", "िडि", "डिय", "ियो"]

    Args:
        text: The comment to tokenize. None and float NaN (a missing cell
            read by pandas) are treated as empty text.
        n: Window size in characters; must be >= 1.

    Returns:
        A list of n-character strings; empty when the stripped text is
        shorter than n or the value is missing.

    Raises:
        ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # Without this guard str() would turn a missing value into the literal
    # text "None" / "nan", which would then be tokenized as if it were a comment.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text).strip()
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# Tokenize every comment into character trigrams and store them in a new column
df['char_ngrams'] = df['comment'].apply(char_ngrams, n=3)

print("Tokenized sample:")
preview = df[['comment', 'char_ngrams']].head()
print(preview)

Tokenized sample:
                                             comment  \
0  अब एकमात्र अबको प्रधानमन्त्री हर्क सम्पाङ सबै ...   
1  सिंहदरबार जलेको दृश्य देख्दा धेरै पीडा हुन्छ अ...   
2  राजनीतिक दलको इच्छा बुझ्ने प्रयास कहिल्यै भएको...   
3  दश वर्षे जनयुद्ध देशमा पैसा लुट्न गरिएको जस्तो...   
4             कति राम्रो कुरा गर्नुभएको गज्जब लाग्यो   

                                         char_ngrams  
0  [अब , ब ए,  एक, एकम, कमा, मात, ात्, त्र, ्र , ...  
1  [सिं, िंह, ंहद, हदर, दरब, रबा, बार, ार , र ज, ...  
2  [राज, ाजन, जनी, नीत, ीति, तिक, िक , क द,  दल, ...  
3  [दश , श व,  वर, वर्, र्ष, ्षे, षे , े ज,  जन, ...  
4  [कति, ति , ि र,  रा, राम, ाम्, म्र, ्रो, रो , ...  


In [6]:
# Location of the pretrained FastText vectors (cc.ne.300) on Drive
fasttext_path = '/content/drive/MyDrive/cc.ne.300.vec.gz'

# Read the file in word2vec text format (binary=False is the loader default, spelled out here).
# NOTE(review): a .vec file presumably holds whole-word entries only, while
# embed_comment looks up character trigrams in it — check how many lookups
# actually hit the vocabulary; misses are embedded as all-zero rows.
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format(fasttext_path, binary=False)

vocab_size = len(fasttext_model.key_to_index)
print("FastText model loaded with vocab size:", vocab_size)

FastText model loaded with vocab size: 576768


In [66]:
# Function to embed a single comment
def embed_comment(comment, n=3, max_len=50):
    """
    Convert a single comment into a fixed-length embedding matrix.

    - Tokenize into character n-grams (default n=3) with char_ngrams.
    - Map each n-gram to its vector in the global fasttext_model; n-grams that
      are not in the model's vocabulary become all-zero rows.
    - Pad with zero rows or truncate so exactly max_len rows are returned.

    Args:
        comment: The text to embed.
        n: Character n-gram size passed to char_ngrams.
        max_len: Number of rows (n-gram positions) in the returned matrix.

    Returns:
        A float32 array of shape (max_len, fasttext_model.vector_size).
    """
    dim = fasttext_model.vector_size
    # Start from an all-zero float32 matrix, so out-of-vocabulary rows and
    # padding rows need no further work. Building the matrix from a list that
    # mixes float64 np.zeros rows with model vectors would upcast it to float64
    # and make the dtype depend on the input.
    matrix = np.zeros((max_len, dim), dtype=np.float32)
    # Only the first max_len n-grams can appear in the output, so the rest are
    # never looked up.
    for row, ng in enumerate(char_ngrams(comment, n)[:max_len]):
        if ng in fasttext_model.key_to_index:
            matrix[row] = fasttext_model[ng]
    return matrix

# Embed every comment and stack the per-comment matrices into one 3-D array
comment_matrices = list(map(embed_comment, df['comment']))
X = np.stack(comment_matrices)

print("Embeddings shape:", X.shape)   # (num_samples, max_len, 300)

Embeddings shape: (6125, 50, 300)


In [67]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Label vocabularies accepted for each task
valid_aspects = ["policy", "governance", "service", "economy", "corruption"]
valid_sentiments = ["positive", "neutral", "negative"]

# Keep a row only when both of its labels belong to the vocabularies above
has_valid_aspect = df['aspect'].isin(valid_aspects)
has_valid_sentiment = df['sentiment'].isin(valid_sentiments)
df = df[has_valid_aspect & has_valid_sentiment]

# One encoder per task: fit on the filtered labels, then map them to integer ids
aspect_encoder = LabelEncoder().fit(df['aspect'])
sentiment_encoder = LabelEncoder().fit(df['sentiment'])
y_aspect_int = aspect_encoder.transform(df['aspect'])
y_sentiment_int = sentiment_encoder.transform(df['sentiment'])

# One-hot encode the integer ids; the width is fixed by the vocabulary size
y_aspect = to_categorical(y_aspect_int, num_classes=len(valid_aspects))
y_sentiment = to_categorical(y_sentiment_int, num_classes=len(valid_sentiments))

# Re-embed after filtering so X has exactly one matrix per remaining row
X = np.stack(list(map(embed_comment, df['comment'])))

# Report the learned class order and the target shapes
for caption, value in (
    ("Aspect classes:", aspect_encoder.classes_),
    ("Sentiment classes:", sentiment_encoder.classes_),
    ("Aspect labels shape:", y_aspect.shape),
    ("Sentiment labels shape:", y_sentiment.shape),
):
    print(caption, value)

Aspect classes: ['corruption' 'economy' 'governance' 'policy' 'service']
Sentiment classes: ['negative' 'neutral' 'positive']
Aspect labels shape: (6066, 5)
Sentiment labels shape: (6066, 3)


In [68]:
# Confirm that the inputs and both one-hot target arrays have the same number of rows
for caption, array in (
    ("Embeddings shape:", X),                  # (num_samples, max_len, 300)
    ("Sentiment labels shape:", y_sentiment),  # (num_samples, 3), one-hot
    ("Aspect labels shape:", y_aspect),        # (num_samples, 5), one-hot
):
    print(caption, array.shape)

Embeddings shape: (6066, 50, 300)
Sentiment labels shape: (6066, 3)
Aspect labels shape: (6066, 5)


In [69]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# The three arrays are split in one call so their rows stay aligned.
split_arrays = train_test_split(X, y_sentiment, y_aspect, test_size=0.2, random_state=42)
X_train, X_test, y_sent_train, y_sent_test, y_aspect_train, y_aspect_test = split_arrays

# Report the shape of every resulting array
for caption, array in (
    ("Training set shape:", X_train),
    ("Test set shape:", X_test),
    ("Sentiment train labels shape:", y_sent_train),
    ("Sentiment test labels shape:", y_sent_test),
    ("Aspect train labels shape:", y_aspect_train),
    ("Aspect test labels shape:", y_aspect_test),
):
    print(caption, array.shape)

Training set shape: (4852, 50, 300)
Test set shape: (1214, 50, 300)
Sentiment train labels shape: (4852, 3)
Sentiment test labels shape: (1214, 3)
Aspect train labels shape: (4852, 5)
Aspect test labels shape: (1214, 5)


In [70]:
# CNN-BiLSTM hybrid with a shared trunk and two classification heads

# Input: 50 n-gram positions, each a 300-dim vector
input_layer = Input(shape=(50, 300))

# Convolution over windows of 3 positions, then halve the sequence length
conv = Conv1D(128, 3, activation='relu')(input_layer)
pool = MaxPooling1D(2)(conv)

# Bidirectional LSTM reads the pooled sequence; only its final output is passed on
bilstm = Bidirectional(LSTM(128))(pool)

# Regularize, then project to a shared 64-unit representation
drop = Dropout(0.5)(bilstm)
dense = Dense(64, activation='relu')(drop)

# Task heads: 3 sentiment classes and 5 aspect classes, both fed by the shared layer
sentiment_output = Dense(3, activation='softmax', name="sentiment")(dense)
aspect_output = Dense(5, activation='softmax', name="aspect")(dense)

# Assemble the two-output model
model = Model(input_layer, [sentiment_output, aspect_output])

# Both heads use the same loss and metric, keyed by output-layer name
head_names = ("sentiment", "aspect")
model.compile(
    optimizer=Adam(0.001),
    loss={head: "categorical_crossentropy" for head in head_names},
    metrics={head: "accuracy" for head in head_names},
)

model.summary()

In [72]:
# Train both heads jointly; targets are keyed by output-layer name
train_targets = {"sentiment": y_sent_train, "aspect": y_aspect_train}
val_targets = {"sentiment": y_sent_test, "aspect": y_aspect_test}

# NOTE: the held-out test split is also used as the validation data here
history = model.fit(
    x=X_train,
    y=train_targets,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, val_targets),
)

Epoch 1/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 140ms/step - aspect_accuracy: 0.3416 - aspect_loss: 1.5195 - loss: 2.5593 - sentiment_accuracy: 0.4376 - sentiment_loss: 1.0398 - val_aspect_accuracy: 0.4629 - val_aspect_loss: 1.3572 - val_loss: 2.2771 - val_sentiment_accuracy: 0.5890 - val_sentiment_loss: 0.9198
Epoch 2/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 125ms/step - aspect_accuracy: 0.4603 - aspect_loss: 1.2797 - loss: 2.1455 - sentiment_accuracy: 0.6146 - sentiment_loss: 0.8658 - val_aspect_accuracy: 0.5461 - val_aspect_loss: 1.1150 - val_loss: 1.9785 - val_sentiment_accuracy: 0.6112 - val_sentiment_loss: 0.8638
Epoch 3/10
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 123ms/step - aspect_accuracy: 0.5677 - aspect_loss: 1.0839 - loss: 1.8601 - sentiment_accuracy: 0.6659 - sentiment_loss: 0.7762 - val_aspect_accuracy: 0.5544 - val_aspect_loss: 1.0815 - val_loss: 1.9226 - val_sentiment_accuracy: 0.6178 

In [73]:
# Evaluate on test set
# return_dict=True makes evaluate() return the metrics keyed by name, so every
# value is printed under the label it belongs to. Indexing the list form by
# position is fragile for a two-output model: the progress log reports the
# metrics in a different order (aspect_accuracy, aspect_loss, loss,
# sentiment_accuracy, sentiment_loss) than positional labels would assume, so
# two values can silently end up under each other's label.
# eval_results is therefore a dict (metric name -> value), not a list.
eval_results = model.evaluate(
    X_test,
    {"sentiment": y_sent_test, "aspect": y_aspect_test},
    verbose=1,
    return_dict=True
)

print("\nEvaluation Results:")
print(f"Total Loss: {eval_results['loss']:.4f}")
print(f"Sentiment Loss: {eval_results['sentiment_loss']:.4f}")
print(f"Aspect Loss: {eval_results['aspect_loss']:.4f}")
print(f"Sentiment Accuracy: {eval_results['sentiment_accuracy']:.4f}")
print(f"Aspect Accuracy: {eval_results['aspect_accuracy']:.4f}")

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - aspect_accuracy: 0.6408 - aspect_loss: 1.0763 - loss: 1.8101 - sentiment_accuracy: 0.6998 - sentiment_loss: 0.7338

Evaluation Results:
Total Loss: 1.8596
Sentiment Loss: 0.7809
Aspect Loss: 1.0793
Sentiment Accuracy: 0.6211
Aspect Accuracy: 0.6771
