In [None]:

# Imports
import os

import pandas as pd
import gensim
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# dataset_path = '/content/training_dataset.csv'
dataset_path = '/content/training_dataset (1).csv'

# The CSV is read without a header row, so the column names are supplied here.
column_names = ["comment", "target", "aspect", "sentiment"]
df = pd.read_csv(dataset_path, header=None, names=column_names)

# Normalise both label columns to lowercase strings so later membership
# checks against the valid-label lists are case-insensitive.
for label_col in ('aspect', 'sentiment'):
    df[label_col] = df[label_col].astype(str).str.lower()

# "target" is not used anywhere downstream, so discard it.
df = df.drop(columns=["target"])

print("Dataset sample:")
print(df.head())

In [None]:
valid_aspects = ["policy", "governance", "service", "economy", "corruption"]
valid_sentiments = ["positive", "neutral", "negative"]

# Build one boolean mask per criterion and apply them together:
#   - aspect must be one of the known aspects
#   - sentiment must be one of the known sentiments
#   - comment must contain between 3 and 11 whitespace-separated tokens (inclusive)
aspect_ok = df['aspect'].isin(valid_aspects)
sentiment_ok = df['sentiment'].isin(valid_sentiments)
word_counts = df['comment'].apply(lambda text: len(str(text).split()))
length_ok = (word_counts >= 3) & (word_counts <= 11)

df = df[aspect_ok & sentiment_ok & length_ok]

print("Filtered dataset shape:", df.shape)

In [None]:
fasttext_path = '/content/drive/MyDrive/cc.ne.300.vec.gz'

# Fail fast with an actionable message: the file lives on Google Drive, and the
# drive.mount(...) call in the data-loading cell is commented out, so on a fresh
# kernel the path does not exist until Drive is mounted.
if not os.path.exists(fasttext_path):
    raise FileNotFoundError(
        f"FastText vectors not found at {fasttext_path!r}. "
        "Mount Google Drive (drive.mount('/content/drive')) or point "
        "fasttext_path at a local copy of the vectors."
    )

# load_word2vec_format defaults to binary=False, i.e. the text (.vec) format.
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format(fasttext_path)

print("FastText vocab size:", len(fasttext_model.key_to_index))

In [None]:
def embed_comment(comment, max_len=50, model=None):
    """
    Convert a comment into a fixed-length embedding matrix.

    The comment is split on whitespace; each of the first ``max_len`` tokens is
    mapped to its word vector. Tokens missing from the vocabulary, and padding
    positions after the last token, are all-zero rows.

    Parameters
    ----------
    comment : any
        The text to embed. It is passed through ``str()`` first, so non-string
        values (e.g. NaN) are embedded as their string form.
    max_len : int, default 50
        Number of rows in the output. Longer comments are truncated, shorter
        ones are zero-padded at the end.
    model : optional
        Word-vector lookup exposing ``key_to_index``, ``vector_size`` and
        ``model[token]``. Defaults to the notebook-level ``fasttext_model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, model.vector_size)`` and dtype ``float32``.
    """
    if model is None:
        # Resolved at call time so the function can be defined before the
        # embeddings cell has been run.
        model = fasttext_model

    # Preallocate in float32: mixing float64 zero-vectors with float32 word
    # vectors would upcast the whole matrix to float64 and double its memory.
    matrix = np.zeros((max_len, model.vector_size), dtype=np.float32)

    # Truncate before the lookup so tokens beyond max_len are never fetched.
    tokens = str(comment).strip().split()[:max_len]
    for row, tok in enumerate(tokens):
        if tok in model.key_to_index:
            matrix[row] = model[tok]
        # Out-of-vocabulary tokens keep their zero row.
    return matrix

# Embed every comment, then stack the per-comment matrices into one 3-D array.
comment_matrices = [embed_comment(text) for text in df['comment']]
X = np.stack(comment_matrices)
print("Embeddings shape:", X.shape)   # (num_samples, max_len, 300)

## Label Encoding of Output Variables

- `LabelEncoder` is a class provided by the **`sklearn.preprocessing`** module.
- Two independent `LabelEncoder` objects are created to separately encode **aspect** and **sentiment** labels.
- Each encoder learns and stores the unique class labels in the `classes_` attribute.
- The categorical labels are transformed into integer values, as machine learning models require numerical inputs.
- Label encoding is performed **before one-hot encoding**, since one-hot encoding operates on integer-encoded labels.

### Working of `fit_transform()`

- `fit_transform()` performs the **fitting** and **transformation** steps in a single operation.

#### `fit()`
- Identifies all unique class labels in the data
- Sorts the labels lexicographically
- Stores the sorted labels in `encoder.classes_`
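- Learns this mapping from the training data only
- Do **not** call `fit()` again on validation/test data: refitting can change the label-to-integer mapping, which is a **serious bug**

1. If `fit()` has not been run, `classes_` does not exist and `transform()` raises an error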

#### `transform()`
- Replaces each categorical label with its corresponding integer index
- It does not learn anything; it only replaces each label with its integer index, so it is the method to use on validation/test data once the encoder has already been fitted

### Working of `to_categorical()`
- It comes from `tensorflow.keras.utils`
- It performs **one-hot encoding**
- It converts integer class labels into one-hot target vectors (1 for the true class, 0 elsewhere) that a neural network can learn from.

#### The `to_categorical()` line of code
- It takes the integer labels just obtained from `fit_transform()`, together with the total number of aspect and sentiment classes, and applies one-hot encoding
1. e.g. if y_aspect_int = [2, 0, 1] and num_classes = 4 then

y_aspect =
[
 [0, 0, 1, 0],  # governance
 [1, 0, 0, 0],  # corruption
 [0, 1, 0, 0]   # economy
]
 This is the result after the line of code runs.

- This is done because the `categorical_crossentropy` loss expects `y` to be a one-hot vector, where `y` is the target, i.e. aspect and sentiment are our targets.
 



In [None]:
aspect_encoder = LabelEncoder()
sentiment_encoder = LabelEncoder()

# Fit on the full label vocabularies rather than on the labels that happen to
# survive filtering. If a class were absent from df, fitting on df would give
# fewer entries in classes_ than the one-hot width / model output size, and the
# integer-to-name mapping used in the reports would no longer line up.
aspect_encoder.fit(valid_aspects)
sentiment_encoder.fit(valid_sentiments)

# df was already restricted to these vocabularies, so transform() sees no
# unknown labels.
y_aspect_int = aspect_encoder.transform(df['aspect'])
y_sentiment_int = sentiment_encoder.transform(df['sentiment'])

# One-hot targets; the width is taken from the encoder so both always agree.
y_aspect = to_categorical(y_aspect_int, num_classes=len(aspect_encoder.classes_))
y_sentiment = to_categorical(y_sentiment_int, num_classes=len(sentiment_encoder.classes_))

print("Aspect classes:", aspect_encoder.classes_)
print("Sentiment classes:", sentiment_encoder.classes_)

## StratifiedShuffleSplit

- This splits the dataset while preserving the class distribution of the targets.
- It is a splitter object, not the split itself.
- In this project the dataset is imbalanced: some aspect or sentiment classes have many more samples than others. We use this splitter to maintain the **same proportion** of each class in both the train and test sets. Note that the code below stratifies on the aspect labels only.

### Stratify
1. It means splitting the dataset so that each subset has the same proportion of classes as the original dataset.
2. It is essential in multi-class classification.

### Random_state=42
- Fixing the seed makes the randomness reproducible: every run produces the same combination of samples and classes in the training and test splits.

### sss.split()
- It does not need the number of classes directly. It uses the class labels and returns the indices of the samples in `X`, where `X` is the array of embedded comments (the `target` column was dropped earlier).
- The indices are row numbers of `X`.




In [None]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
# Stratification uses the integer aspect labels.
train_idx, test_idx = next(sss.split(X, y_aspect_int))

# Apply the same row indices to the inputs and to both sets of targets.
X_train = X[train_idx]
X_test = X[test_idx]
y_aspect_train = y_aspect[train_idx]
y_aspect_test = y_aspect[test_idx]
y_sent_train = y_sentiment[train_idx]
y_sent_test = y_sentiment[test_idx]

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

In [None]:
# Take the input and output sizes from the prepared arrays instead of repeating
# literals, so the architecture stays in sync if max_len, the embedding size or
# the label sets are changed in earlier cells.
sequence_shape = X_train.shape[1:]            # (max_len, embedding_dim)
n_sentiment_classes = y_sent_train.shape[1]   # width of the one-hot sentiment targets
n_aspect_classes = y_aspect_train.shape[1]    # width of the one-hot aspect targets

input_layer = Input(shape=sequence_shape)

# CNN branch: 1-D convolution over the token axis, then max over all positions
conv = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
pool = GlobalMaxPooling1D()(conv)

# BiLSTM branch, fed from the same input as the CNN branch
bilstm = Bidirectional(LSTM(128))(input_layer)

# Concatenate CNN + BiLSTM features
merged = tf.keras.layers.concatenate([pool, bilstm])

# Dropout shared by both task heads
drop = Dropout(0.5)(merged)

# Task-specific heads; the layer names are the keys used for the loss, the
# metrics and the target dictionaries passed to fit()
sentiment_dense = Dense(64, activation='relu')(drop)
sentiment_output = Dense(n_sentiment_classes, activation='softmax', name="sentiment")(sentiment_dense)

aspect_dense = Dense(64, activation='relu')(drop)
aspect_output = Dense(n_aspect_classes, activation='softmax', name="aspect")(aspect_dense)

# Build model: output order is [sentiment, aspect]
model = Model(inputs=input_layer, outputs=[sentiment_output, aspect_output])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss={"sentiment": "categorical_crossentropy", "aspect": "categorical_crossentropy"},
    metrics={"sentiment": ["accuracy"], "aspect": ["accuracy"]}
)

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once the combined validation loss has not improved for 3 epochs and roll
# back to the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

# Validate on a slice of the TRAINING data, not on the test set. Early stopping
# with restore_best_weights selects the model using the validation data; using
# X_test for that would leak the test set into model selection and make the
# test metrics reported later optimistically biased.
# Keras takes the validation slice from the last rows of the arrays, before
# shuffling.
# NOTE(review): assumes X_train rows are already in shuffled order (they are
# indexed by the StratifiedShuffleSplit train indices) - confirm.
history = model.fit(
    X_train,
    {"sentiment": y_sent_train, "aspect": y_aspect_train},
    validation_split=0.1,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

In [None]:
# One figure per entry:
# (train key, validation key, train label, validation label, title, y-axis label, output file)
history_plots = [
    (
        'loss', 'val_loss',
        'Training Loss', 'Validation Loss',
        'Training vs Validation Loss', 'Loss',
        "training_validation_loss.png",
    ),
    (
        'sentiment_accuracy', 'val_sentiment_accuracy',
        'Training Sentiment Accuracy', 'Validation Sentiment Accuracy',
        'Training vs Validation Accuracy (Sentiment)', 'Accuracy',
        "training_validation_accuracy.png",
    ),
]

for train_key, val_key, train_label, val_label, plot_title, y_label, out_file in history_plots:
    plt.figure(figsize=(6, 5))
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(plot_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    # Save before show(), while the figure is still the current one.
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.show()



In [None]:
# The model outputs are ordered [sentiment, aspect], matching Model(outputs=...).
sentiment_probs, aspect_probs = model.predict(X_test)

# Predicted class = index of the highest probability
y_sent_pred = np.argmax(sentiment_probs, axis=1)
y_asp_pred = np.argmax(aspect_probs, axis=1)

# Recover integer labels from the one-hot test targets
y_sent_true = np.argmax(y_sent_test, axis=1)
y_asp_true = np.argmax(y_aspect_test, axis=1)

# Pass the full label set explicitly. Without `labels`, scikit-learn infers the
# classes from y_true/y_pred; if a class is missing from both, the report
# raises on the target_names length mismatch and the confusion matrix no longer
# lines up with the tick labels.
sent_labels = np.arange(len(sentiment_encoder.classes_))
asp_labels = np.arange(len(aspect_encoder.classes_))

print("=== Sentiment Classification Report ===")
print(classification_report(y_sent_true, y_sent_pred,
                            labels=sent_labels,
                            target_names=sentiment_encoder.classes_))

print("=== Aspect Classification Report ===")
print(classification_report(y_asp_true, y_asp_pred,
                            labels=asp_labels,
                            target_names=aspect_encoder.classes_))

# Confusion Matrices (rows = true class, columns = predicted class)
cm_sent = confusion_matrix(y_sent_true, y_sent_pred, labels=sent_labels)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_sent, annot=True, fmt="d", cmap="Blues",
            xticklabels=sentiment_encoder.classes_,
            yticklabels=sentiment_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Sentiment Confusion Matrix")
plt.show()

cm_asp = confusion_matrix(y_asp_true, y_asp_pred, labels=asp_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_asp, annot=True, fmt="d", cmap="Greens",
            xticklabels=aspect_encoder.classes_,
            yticklabels=aspect_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Aspect Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# One figure per task:
# (title, class names, integer ground truth, predicted probabilities, output file)
pr_tasks = [
    ("Precision-Recall Curve (Sentiment)", sentiment_encoder.classes_,
     y_sent_true, sentiment_probs, "precision_recall_sentiment.png"),
    ("Precision-Recall Curve (Aspect)", aspect_encoder.classes_,
     y_asp_true, aspect_probs, "precision_recall_aspect.png"),
]

for pr_title, class_names, true_ints, probs, out_file in pr_tasks:
    plt.figure(figsize=(8, 6))
    for i, class_name in enumerate(class_names):
        # One-vs-rest: the current class is the positive label, all others negative.
        y_true_bin = (true_ints == i).astype(int)
        y_score = probs[:, i]
        precision, recall, _ = precision_recall_curve(y_true_bin, y_score)
        ap = average_precision_score(y_true_bin, y_score)
        plt.plot(recall, precision, label=f"{class_name} (AP={ap:.2f})")

    plt.title(pr_title)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.grid(True)
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.show()

In [None]:
from sklearn.calibration import calibration_curve

# One figure per task: (title, class names, integer ground truth, predicted probabilities)
calibration_tasks = [
    ("Calibration Curve (Sentiment)", sentiment_encoder.classes_, y_sent_true, sentiment_probs),
    ("Calibration Curve (Aspect)", aspect_encoder.classes_, y_asp_true, aspect_probs),
]

for cal_title, class_names, true_ints, probs in calibration_tasks:
    plt.figure(figsize=(8, 6))
    for i, class_name in enumerate(class_names):
        # One-vs-rest: compare the predicted probability of this class with how
        # often it is actually the true class, over 10 probability bins.
        y_true_bin = (true_ints == i).astype(int)
        y_score = probs[:, i]
        prob_true, prob_pred = calibration_curve(y_true_bin, y_score, n_bins=10)
        plt.plot(prob_pred, prob_true, marker='o', label=f"{class_name}")

    # Diagonal reference: perfect calibration
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.title(cal_title)
    plt.xlabel("Mean Predicted Probability")
    plt.ylabel("Fraction of Positives")
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# --- ROC Curves for Sentiment ---
# Turn the integer labels into one indicator column per class so each class
# gets its own one-vs-rest ROC curve.
sentiment_class_ids = range(len(sentiment_encoder.classes_))
y_sent_true_bin = label_binarize(y_sent_true, classes=sentiment_class_ids)

plt.figure(figsize=(8, 6))
for i, class_name in zip(sentiment_class_ids, sentiment_encoder.classes_):
    class_truth = y_sent_true_bin[:, i]
    class_scores = sentiment_probs[:, i]
    fpr, tpr, _ = roc_curve(class_truth, class_scores)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{class_name} (AUC={roc_auc:.2f})")

# Diagonal reference line
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title("ROC Curve (Sentiment)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid(True)
plt.savefig("roc_curve_sentiment.png", dpi=300, bbox_inches="tight")
plt.show()