<a href="https://colab.research.google.com/github/PunitRaveendran/ACM/blob/main/MAIN_ACM_TASK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FINAL TASK

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import classification_report

# Phase 1


# Phase 3

In [20]:
import numpy as np
import pandas as pd
import re
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
def load_txt_file(file_path, num_lines_to_print=5):
    """
    Reads a text file and attempts to parse it into a pandas DataFrame
    with 'label' and 'text' columns. Prints the first few lines to help
    understand the file format.
    """
    data = []
    print(f"Reading file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < num_lines_to_print:
                print(f"Line {i+1}: {line.strip()}")

            # Attempt to parse the line - split by semicolon
            parts = line.strip().split(';', maxsplit=1)
            if len(parts) == 2:
                text, label = parts  # Correct order: text then label
                data.append({'label': label, 'text': text})
            # Optionally handle lines that don't match the expected format
            # else:
            #     print(f"Skipping malformed line: {line.strip()}")

    if not data:
        print(f"No data parsed from {file_path}. Check file path and format.")
        return pd.DataFrame(columns=['label', 'text']) # Return empty df with columns
    return pd.DataFrame(data)
train_df = load_txt_file('/content/train.txt')
val_df = load_txt_file('/content/val.txt')
test_df = load_txt_file('/content/test.txt')

# Preprocessing - Clean labels in each dataframe before combining or fitting LabelEncoder
train_df['label'] = train_df['label'].str.lower().str.strip()
val_df['label'] = val_df['label'].str.lower().str.strip()
test_df['label'] = test_df['label'].str.lower().str.strip()

# Print unique labels before filtering
print("Unique labels in train_df before filtering:", train_df['label'].unique())
print("Unique labels in val_df before filtering:", val_df['label'].unique())
print("Unique labels in test_df before filtering:", test_df['label'].unique())


# Filter out unwanted labels from each dataframe
train_df = train_df[train_df['label'].isin(['sadness', 'anger', 'love', 'joy'])]
val_df = val_df[val_df['label'].isin(['sadness', 'anger', 'love', 'joy'])]
test_df = test_df[test_df['label'].isin(['sadness', 'anger', 'love', 'joy'])]


all_df = pd.concat([train_df, val_df, test_df], ignore_index=True)


def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\\d+',' ',text)
    return text.strip()

all_df['text_clean'] = all_df['text'].apply(clean_text)

le = LabelEncoder()
# Fit LabelEncoder on the labels that will be used for training, validation, and testing
le.fit(all_df['label'])

max_vocab = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(all_df['text_clean'])

X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['text'].apply(clean_text)), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['text'].apply(clean_text)), maxlen=max_len)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_df['text'].apply(clean_text)),maxlen=max_len)

# Transform the labels in each dataframe
y_train = le.transform(train_df['label'])
y_test = le.transform(test_df['label'])
y_val = le.transform(val_df['label'])

# ---------- LSTM -----------

model = Sequential([
    Embedding(max_vocab, 64, input_length=max_len),
    LSTM(64),
    Dense(4, activation='softmax') # Changed from 3 to 4
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)

# Evaluate
y_pred = np.argmax(model.predict(X_test), axis=1)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Sample prediction
def predict_text(text):
    seq = pad_sequences(tokenizer.texts_to_sequences([clean_text(text)]), maxlen=max_len)
    pred = np.argmax(model.predict(seq), axis=1)[0]
    return le.classes_[pred]

print(predict_text("I am feeling great about my progress!"))
print(predict_text("I don't care about this."))
print(predict_text("This is terrible and I give up."))

Reading file: /content/train.txt
Line 1: i didnt feel humiliated;sadness
Line 2: i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake;sadness
Line 3: im grabbing a minute to post i feel greedy wrong;anger
Line 4: i am ever feeling nostalgic about the fireplace i will know that it is still on the property;love
Line 5: i am feeling grouchy;anger
Reading file: /content/val.txt
Line 1: im feeling quite sad and sorry for myself but ill snap out of it soon;sadness
Line 2: i feel like i am still looking at a blank canvas blank pieces of paper;sadness
Line 3: i feel like a faithful servant;love
Line 4: i am just feeling cranky and blue;anger
Line 5: i can have for a treat or if i am feeling festive;joy
Reading file: /content/test.txt
Line 1: im feeling rather rotten so im not very ambitious right now;sadness
Line 2: im updating my blog because i feel shitty;sadness
Line 3: i never make her separate from me because i don t ever want her to



[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 55ms/step - accuracy: 0.4762 - loss: 1.1674 - val_accuracy: 0.8278 - val_loss: 0.4268
Epoch 2/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 56ms/step - accuracy: 0.9085 - loss: 0.2584 - val_accuracy: 0.9373 - val_loss: 0.1855
Epoch 3/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 58ms/step - accuracy: 0.9749 - loss: 0.0774 - val_accuracy: 0.9438 - val_loss: 0.1888
Epoch 4/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 56ms/step - accuracy: 0.9833 - loss: 0.0507 - val_accuracy: 0.9420 - val_loss: 0.1950
Epoch 5/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 57ms/step - accuracy: 0.9893 - loss: 0.0336 - val_accuracy: 0.9443 - val_loss: 0.1979
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
Accuracy: 0.9304093567251462
F1 Score: 0.9080246818187756
              precision    recall  f1-score   support

       

In [17]:
import pandas as pd

def load_txt_file(file_path, num_lines_to_print=5):
    """
    Reads a text file and attempts to parse it into a pandas DataFrame
    with 'label' and 'text' columns. Prints the first few lines to help
    understand the file format.
    """
    data = []
    print(f"Reading file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < num_lines_to_print:
                print(f"Line {i+1}: {line.strip()}")

            # Attempt to parse the line - split by semicolon
            parts = line.strip().split(';', maxsplit=1)
            if len(parts) == 2:
                text, label = parts  # Correct order: text then label
                data.append({'label': label, 'text': text})
            # Optionally handle lines that don't match the expected format
            # else:
            #     print(f"Skipping malformed line: {line.strip()}")

    if not data:
        print(f"No data parsed from {file_path}. Check file path and format.")
        return pd.DataFrame(columns=['label', 'text']) # Return empty df with columns
    return pd.DataFrame(data)