<a href="https://colab.research.google.com/github/Nirika-Lamichhane/Minor_Project-5-24-25-36-/blob/main/training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Core imports
import re
import numpy as np
import pandas as pd
import gensim

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Deep learning
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, Bidirectional, LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model

In [5]:
from google.colab import drive
drive.mount('/content/drive')

# Load dataset with manual headers (header=None: the file has no header row,
# so the four column names are supplied here).
data_path = "/content/drive/MyDrive/dataset.txt"
df = pd.read_csv(
    data_path,
    header=None,
    names=["comment", "target", "aspect", "sentiment"],
    dtype=str,  # every column is text; prevents numeric-looking fields being parsed as numbers
)

# A row with an empty field is loaded as NaN (a float). That would crash the
# regex-based text cleaning and the label encoders further down, so incomplete
# rows are removed here and the number removed is reported.
n_rows_read = len(df)
df = df.dropna().reset_index(drop=True)
if len(df) < n_rows_read:
    print(f"Dropped {n_rows_read - len(df)} incomplete row(s) out of {n_rows_read}")

print(df.head())

Mounted at /content/drive
                                comment          target      aspect sentiment
0  नेपाल आमा जय सनातन हिन्दू राष्ट्र जय  हिन्दू राष्ट्र      policy  positive
1                 दुर्गा प्रसाईं चोर हो   राजनीतिक नेता  corruption  negative
2                   राजतन्त्र जिन्दाबाद       राजतन्त्र  governance  positive
3           राजा ल्याउन जनजागरण भएको हो       राजतन्त्र  governance  positive
4   आन्दोलन जारी छ नालायक सरकार चाहिदैन           सरकार  governance  negative


In [6]:
# Clean Nepali text
def clean_text(text):
    """Strip everything except Devanagari characters and whitespace.

    Args:
        text: Raw comment. Non-string values (``None``, or the float NaN that
            pandas produces for an empty CSV field) are treated as an empty
            comment.

    Returns:
        str: ``text`` with every character outside U+0900-U+097F and
        whitespace removed, and leading/trailing whitespace trimmed.
        An empty string for non-string input.
    """
    # re.sub raises TypeError on non-strings, which would abort an entire
    # Series.apply over the comment column because of one bad row.
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # keep Nepali chars + spaces
    return text.strip()

# Replace each raw comment with its cleaned form (see clean_text).
df['comment'] = df['comment'].map(clean_text)

# Character-level tokenizer; lower=False leaves the text untouched by case folding.
tokenizer = Tokenizer(lower=False, char_level=True)
tokenizer.fit_on_texts(df['comment'])

# Turn every comment into a list of character ids, then pad at the end
# ('post') so all rows are as long as the longest comment.
sequences = tokenizer.texts_to_sequences(df['comment'])
max_len = max(map(len, sequences))
X = pad_sequences(sequences, padding='post', maxlen=max_len)

vocabulary_count = len(tokenizer.word_index)
print("Vocabulary size:", vocabulary_count)
print("Max sequence length:", max_len)

Vocabulary size: 67
Max sequence length: 129


In [7]:
#Load FastText Embeddings
fasttext_path = "/content/drive/MyDrive/cc.ne.300.vec.gz"
ft_model = gensim.models.KeyedVectors.load_word2vec_format(fasttext_path)

# Take the dimensionality from the loaded vectors instead of hard-coding it,
# so the matrix always matches the file that was actually loaded.
embedding_dim = ft_model.vector_size
# +1 because tokenizer ids start at 1; row 0 is left for the padding id.
vocab_size = len(tokenizer.word_index) + 1

# Row 0 (padding) stays all-zero. Characters that FastText does not know
# previously also stayed all-zero, which made them indistinguishable from
# padding once the embedding is used as fixed (non-trainable) weights.
# They now get a small random vector from a seeded generator instead, so each
# character keeps a distinct, reproducible representation.
oov_rng = np.random.default_rng(42)
missing_chars = []

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for char, i in tokenizer.word_index.items():
    if char in ft_model:
        embedding_matrix[i] = ft_model[char]
    else:
        embedding_matrix[i] = oov_rng.normal(scale=0.1, size=embedding_dim)
        missing_chars.append(char)

n_chars = vocab_size - 1
print(f"FastText covers {n_chars - len(missing_chars)}/{n_chars} characters")

In [8]:
#Encode Labels (Target, Aspect, Sentiment)
# One encoder per label column; each is fitted on its own column below.
target_encoder, aspect_encoder, sentiment_encoder = (
    LabelEncoder(),
    LabelEncoder(),
    LabelEncoder(),
)

# Step 1: map label strings to integer class ids.
target_ids = target_encoder.fit_transform(df['target'])
aspect_ids = aspect_encoder.fit_transform(df['aspect'])
sentiment_ids = sentiment_encoder.fit_transform(df['sentiment'])

# Step 2: expand the integer ids into one-hot rows.
y_target = to_categorical(target_ids)
y_aspect = to_categorical(aspect_ids)
y_sentiment = to_categorical(sentiment_ids)

# Show the class list learned by each encoder.
for heading, fitted_encoder in (
    ("Targets:", target_encoder),
    ("Aspects:", aspect_encoder),
    ("Sentiments:", sentiment_encoder),
):
    print(heading, fitted_encoder.classes_)

Targets: ['खुला सिमाना' 'गणतन्त्र' 'ठेकेदार' 'धार्मिक संस्था' 'न्यायालय' 'प्रहरी'
 'बाह्र बुँदे समझदारी' 'मधेशी समुदाय' 'माओवादी जनयुद्ध' 'मिडिया संस्था'
 'राजतन्त्र' 'राजनीतिक दल' 'राजनीतिक नेता' 'राजनीतिक प्रणाली'
 'राजनीतिक विचार' 'राजनीतिक विश्लेषण' 'विदेशी शक्ति' 'शिक्षा निकाय'
 'संविधान २०४७' 'सरकार' 'सरकारी निकाय' 'सुरक्षा निकाय' 'हिन्दू राष्ट्र'
 '२०६२ २०६३ आन्दोलन']
Aspects: ['corruption' 'economy' 'governance' 'policy' 'service']
Sentiments: ['negative' 'neutral' 'positive']


In [10]:
# Build CNN–BiLSTM Hybrid Model (with padding='same')
#
# Pipeline: character ids -> fixed FastText embedding -> three parallel Conv1D
# branches (kernel sizes 2, 3, 4) -> max pooling -> concatenation -> dropout
# -> BiLSTM -> shared dense layer -> three softmax heads.

inp = Input(shape=(max_len,))

# Embedding layer (FastText, frozen).
# The pretrained matrix is supplied through embeddings_initializer, the
# documented way to pass initial values, rather than the legacy `weights=`
# keyword, which is not accepted by every Keras version.
emb = Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(inp)

# CNN branches with different kernel sizes, using padding='same' so that all
# three branches keep the same sequence length and can be concatenated.
conv2 = Conv1D(filters=100, kernel_size=2, activation='relu', padding='same')(emb)
conv3 = Conv1D(filters=100, kernel_size=3, activation='relu', padding='same')(emb)
conv4 = Conv1D(filters=100, kernel_size=4, activation='relu', padding='same')(emb)

# Max pooling for each branch
pool2 = MaxPooling1D(pool_size=2)(conv2)
pool3 = MaxPooling1D(pool_size=2)(conv3)
pool4 = MaxPooling1D(pool_size=2)(conv4)

# Concatenate pooled feature maps
cnn_out = Concatenate()([pool2, pool3, pool4])
cnn_out = Dropout(0.4)(cnn_out)

# BiLSTM layer to capture sequential dependencies
bilstm = Bidirectional(LSTM(128))(cnn_out)

# Dense shared representation
dense = Dense(128, activation='relu')(bilstm)

# Multi-output heads for target, aspect, sentiment.
# Each head is sized from the number of one-hot columns of its label array.
out_target = Dense(y_target.shape[1], activation='softmax', name="target")(dense)
out_aspect = Dense(y_aspect.shape[1], activation='softmax', name="aspect")(dense)
out_sentiment = Dense(y_sentiment.shape[1], activation='softmax', name="sentiment")(dense)

# Build and compile model.
# Losses and metrics are keyed by output-layer name: a single flat
# metrics=['accuracy'] list is rejected by Keras 3 for multi-output models
# ("should have as many entries as the model has outputs").
model = Model(inputs=inp, outputs=[out_target, out_aspect, out_sentiment])
output_names = ("target", "aspect", "sentiment")
model.compile(
    optimizer='adam',
    loss={name: 'categorical_crossentropy' for name in output_names},
    metrics={name: ['accuracy'] for name in output_names},
)

# Show summary
model.summary()

In [12]:
# Compile model with explicit dict for losses and metrics
# (keys are the names of the three output layers).
model.compile(
    optimizer='adam',
    loss={
        "target": "categorical_crossentropy",
        "aspect": "categorical_crossentropy",
        "sentiment": "categorical_crossentropy"
    },
    metrics={
        "target": ["accuracy"],
        "aspect": ["accuracy"],
        "sentiment": ["accuracy"]
    }
)

# validation_split holds out the LAST 10% of the rows exactly as passed in
# (Keras selects them before its own per-epoch shuffling). Permute the rows
# once with a fixed seed so the held-out set does not depend on file order
# and is the same on every run.
shuffle_idx = np.random.default_rng(42).permutation(len(X))

# Train the model
history = model.fit(
    X[shuffle_idx],
    {
        "target": y_target[shuffle_idx],
        "aspect": y_aspect[shuffle_idx],
        "sentiment": y_sentiment[shuffle_idx]
    },
    epochs=10,
    batch_size=64,
    validation_split=0.1
)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 966ms/step - aspect_accuracy: 0.3171 - aspect_loss: 1.5671 - loss: 5.7932 - sentiment_accuracy: 0.3993 - sentiment_loss: 1.0804 - target_accuracy: 0.0899 - target_loss: 3.1381 - val_aspect_accuracy: 0.2500 - val_aspect_loss: 1.6076 - val_loss: 5.4253 - val_sentiment_accuracy: 0.3500 - val_sentiment_loss: 1.0471 - val_target_accuracy: 0.3500 - val_target_loss: 2.7706
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 531ms/step - aspect_accuracy: 0.3763 - aspect_loss: 1.4961 - loss: 5.3484 - sentiment_accuracy: 0.4868 - sentiment_loss: 1.0389 - target_accuracy: 0.2823 - target_loss: 2.8007 - val_aspect_accuracy: 0.2500 - val_aspect_loss: 1.5785 - val_loss: 4.9997 - val_sentiment_accuracy: 0.3500 - val_sentiment_loss: 1.0583 - val_target_accuracy: 0.3500 - val_target_loss: 2.3629
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - aspect_accuracy: 0.3763 - aspect_

In [13]:
def predict_comment(comment):
    """Predict the target, aspect and sentiment labels for one raw comment.

    The comment is cleaned with clean_text, converted to character ids by the
    fitted tokenizer and post-padded to max_len before being given to the
    model.

    Args:
        comment: Raw comment text.

    Returns:
        dict: The decoded label strings under the keys "Target", "Aspect"
        and "Sentiment".
    """
    cleaned = clean_text(comment)
    encoded = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(encoded, maxlen=max_len, padding='post')

    # The model returns one array per output head, in the order the outputs
    # were listed when the model was built.
    target_probs, aspect_probs, sentiment_probs = model.predict(padded)

    heads = (
        ("Target", target_encoder, target_probs),
        ("Aspect", aspect_encoder, aspect_probs),
        ("Sentiment", sentiment_encoder, sentiment_probs),
    )

    result = {}
    for key, encoder, probs in heads:
        best_class = probs.argmax(axis=1)[0]  # batch holds a single comment
        result[key] = encoder.inverse_transform([best_class])[0]
    return result

# Example
example_comment = "सरकारले शिक्षा क्षेत्रमा सुधार ल्याउनु पर्छ"
print(predict_comment(example_comment))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 706ms/step
{'Target': 'राजनीतिक दल', 'Aspect': 'policy', 'Sentiment': 'negative'}
