<a href="https://colab.research.google.com/github/Nirika-Lamichhane/Minor_Project-5-24-25-36-/blob/main/generic_noun_trained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, MaxPooling1D,
    Bidirectional, LSTM, Dense, Dropout, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical


In [2]:
df = pd.read_csv("/content/dummy_canonical.csv", header=None)
df.columns = ["sentence", "target", "aspect", "sentiment"]


In [3]:
df.head()

Unnamed: 0,sentence,target,aspect,sentiment
0,अब युवाहरु कृषि मा अघि बढनु पर्छ,कृषि,policy,positive
1,सरकारले कृषकको समस्या समाधान गर्न सकेको छैन,सरकार,governance,negative
2,देशको अवस्था देखेर रुन मन लाग्छ,राज्य,governance,negative
3,बिचौलियाले किसान र उपभोक्ता दुवैलाई लुटिरहेका छन्,बजार मध्यस्थ,corruption,negative
4,कृषिमा उचित नीति नभएकाले किसान मर्कामा छन्,कृषि,policy,negative


In [4]:
df.shape

(713, 4)

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    return text.strip()

df["sentence"] = df["sentence"].apply(clean_text)
df["target"] = df["target"].apply(clean_text)

In [6]:
aspect_encoder = LabelEncoder()
sentiment_encoder = LabelEncoder()

df["aspect_enc"] = aspect_encoder.fit_transform(df["aspect"])
df["sentiment_enc"] = sentiment_encoder.fit_transform(df["sentiment"])

num_aspects = len(aspect_encoder.classes_)
num_sentiments = len(sentiment_encoder.classes_)

In [7]:
MAX_VOCAB = 20000
MAX_LEN_SENT = 50
MAX_LEN_TGT = 5

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(
    df["sentence"].tolist() + df["target"].tolist()
)

X_sent = tokenizer.texts_to_sequences(df["sentence"])
X_tgt = tokenizer.texts_to_sequences(df["target"])

X_sent = pad_sequences(X_sent, maxlen=MAX_LEN_SENT, padding="post")
X_tgt = pad_sequences(X_tgt, maxlen=MAX_LEN_TGT, padding="post")


In [8]:
y_aspect = to_categorical(df["aspect_enc"], num_aspects)
y_sentiment = to_categorical(df["sentiment_enc"], num_sentiments)

In [9]:
X_sent_train, X_sent_test, \
X_tgt_train, X_tgt_test, \
y_aspect_train, y_aspect_test, \
y_sent_train, y_sent_test = train_test_split(
    X_sent, X_tgt, y_aspect, y_sentiment,
    test_size=0.2,
    random_state=42
)

In [10]:
VOCAB_SIZE = min(MAX_VOCAB, len(tokenizer.word_index) + 1)
EMB_DIM = 100


In [11]:
sent_input = Input(shape=(MAX_LEN_SENT,), name="sentence_input")
tgt_input = Input(shape=(MAX_LEN_TGT,), name="target_input")


In [12]:
embedding = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMB_DIM,
    mask_zero=True
)

sent_emb = embedding(sent_input)
tgt_emb = embedding(tgt_input)

In [13]:
cnn = Conv1D(filters=128, kernel_size=3, activation="relu")(sent_emb)
cnn = MaxPooling1D(pool_size=2)(cnn)




In [14]:
sent_lstm = Bidirectional(LSTM(64))(cnn)
tgt_lstm = Bidirectional(LSTM(32))(tgt_emb)

In [15]:
merged = Concatenate()([sent_lstm, tgt_lstm])
merged = Dropout(0.5)(merged)

In [16]:
aspect_out = Dense(64, activation="relu")(merged)
aspect_out = Dense(num_aspects, activation="softmax", name="aspect_output")(aspect_out)


In [17]:
sent_out = Dense(64, activation="relu")(merged)
sent_out = Dense(num_sentiments, activation="softmax", name="sentiment_output")(sent_out)


In [18]:
model = Model(
    inputs=[sent_input, tgt_input],
    outputs=[aspect_out, sent_out]
)

model.compile(
    optimizer="adam",
    loss={
        "aspect_output": "categorical_crossentropy",
        "sentiment_output": "categorical_crossentropy"
    },
    metrics={
        "aspect_output": "accuracy",
        "sentiment_output": "accuracy"
    }
)


In [19]:
history = model.fit(
    [X_sent_train, X_tgt_train],
    [y_aspect_train, y_sent_train],
    validation_data=(
        [X_sent_test, X_tgt_test],
        [y_aspect_test, y_sent_test]
    ),
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 155ms/step - aspect_output_accuracy: 0.2657 - aspect_output_loss: 1.5833 - loss: 2.6046 - sentiment_output_accuracy: 0.5519 - sentiment_output_loss: 1.0211 - val_aspect_output_accuracy: 0.3217 - val_aspect_output_loss: 1.5369 - val_loss: 2.4254 - val_sentiment_output_accuracy: 0.6084 - val_sentiment_output_loss: 0.8749
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - aspect_output_accuracy: 0.3341 - aspect_output_loss: 1.5009 - loss: 2.4031 - sentiment_output_accuracy: 0.6122 - sentiment_output_loss: 0.9024 - val_aspect_output_accuracy: 0.3217 - val_aspect_output_loss: 1.5051 - val_loss: 2.3850 - val_sentiment_output_accuracy: 0.6084 - val_sentiment_output_loss: 0.8820
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - aspect_output_accuracy: 0.3151 - aspect_output_loss: 1.4703 - loss: 2.3483 - sentiment_output_accuracy: 0.6184 - sentimen

In [22]:
sample_sentence = "जय नेपाल"
sample_target = "राज्य"
sample_sentence = clean_text(sample_sentence)
sample_target = clean_text(sample_target)

seq_sent = tokenizer.texts_to_sequences([sample_sentence])
seq_tgt = tokenizer.texts_to_sequences([sample_target])

seq_sent = pad_sequences(seq_sent, maxlen=MAX_LEN_SENT, padding="post")
seq_tgt = pad_sequences(seq_tgt, maxlen=MAX_LEN_TGT, padding="post")

pred_aspect, pred_sentiment = model.predict([seq_sent, seq_tgt])

print("Predicted Aspect:",
      aspect_encoder.inverse_transform([np.argmax(pred_aspect)]))

print("Predicted Sentiment:",
      sentiment_encoder.inverse_transform([np.argmax(pred_sentiment)]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Predicted Aspect: ['governance']
Predicted Sentiment: ['positive']
