<a href="https://colab.research.google.com/github/VicDc/VIC_/blob/main/FIN_8003_A3_NLP_VDC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup and Imports**

In [1]:
# Import libraries and set up environment
import os
import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLP & Text Processing
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# Hugging Face Transformers for BERT
from transformers import BertTokenizer, BertModel

# Computer Vision & Deep Learning
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image

# Machine Learning utilities
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Suppress warnings (optional)
import warnings
warnings.filterwarnings("ignore")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **2: Load Dataset and Suggest Corrections**

In [2]:
# Define file path for the Excel file (text data)
csv_path = "/content/drive/MyDrive/twitter_Kaggle/LabeledText(ORIG).xlsx"

# Load the Excel file into a DataFrame
df = pd.read_excel(csv_path)
print("First 5 rows of the original dataset:")
print(df.head())

# Check the original column names
print("\nOriginal columns:", df.columns.tolist())

# Suggest corrections: Strip whitespace, convert to lower case and replace spaces with underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print("\nCorrected columns:", df.columns.tolist())

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())


First 5 rows of the original dataset:
  File Name                                            Caption Sentiment
0     1.txt      How I feel today #legday #jelly #aching #gym   negative
1    10.txt  @ArrivaTW absolute disgrace two carriages from...  negative
2   100.txt  This is my Valentine's from 1 of my nephews. I...  positive
3  1000.txt  betterfeelingfilms: RT via Instagram: First da...   neutral
4  1001.txt         Zoe's first love #Rattled @JohnnyHarper15   positive

Original columns: ['File Name', 'Caption', 'Sentiment']

Corrected columns: ['file_name', 'caption', 'sentiment']

Missing values in each column:
file_name    0
caption      0
sentiment    0
dtype: int64


# **3: Align Sentiment Labels**

In [3]:
# Assuming the sentiment column holds the labels, ensure they match folder names
# e.g., if the folder names are "Negative", "Neutral", "Positive", we convert the labels accordingly.
if 'sentiment' in df.columns:
    df['sentiment'] = df['sentiment'].astype(str).str.strip().str.capitalize()
    print("Unique sentiment labels after correction:", df['sentiment'].unique())
else:
    print("Error: 'sentiment' column not found. Please check the dataset.")


Unique sentiment labels after correction: ['Negative' 'Positive' 'Neutral']


# **4: Text Preprocessing & Feature Extraction (NLP Component)**

In [4]:
# Check available columns in the dataset
print("Available columns in dataset:", df.columns.tolist())

# Define the expected column for text data (update if needed)
expected_text_column = 'caption'  # Change this to the correct column name if it differs (e.g., 'tweet' or 'content')

# Verify if the expected column exists; if not, prompt an error with available columns
if expected_text_column not in df.columns:
    raise KeyError(
        f"Error: Column '{expected_text_column}' not found. "
        f"Please check the dataset and update the expected_text_column variable. "
        f"Available columns are: {df.columns.tolist()}"
    )

# Define a text preprocessing function: lowercasing, cleaning, and stopword removal
def preprocess_text(text):
    text = text.lower()
    # Remove URLs, mentions, hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

# Apply preprocessing to the text column
df['clean_text'] = df[expected_text_column].apply(preprocess_text)
print("Sample original and cleaned text:")
print(df[[expected_text_column, 'clean_text']].head())

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract BERT embedding (using the CLS token)
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding='max_length')
    outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].detach().numpy().flatten()
    return cls_embedding

# Compute and store BERT embeddings for each cleaned text
df['bert_embedding'] = df['clean_text'].apply(lambda x: get_bert_embedding(x))
print("\nBERT embeddings computed for sample rows:")
print(df[['clean_text', 'bert_embedding']].head())


Available columns in dataset: ['file_name', 'caption', 'sentiment']
Sample original and cleaned text:
                                             caption  \
0      How I feel today #legday #jelly #aching #gym    
1  @ArrivaTW absolute disgrace two carriages from...   
2  This is my Valentine's from 1 of my nephews. I...   
3  betterfeelingfilms: RT via Instagram: First da...   
4         Zoe's first love #Rattled @JohnnyHarper15    

                                          clean_text  
0                 feel today legday jelly aching gym  
1  absolute disgrace two carriages bangor half wa...  
2  valentine nephews elated sometimes little thin...  
3  betterfeelingfilms rt via instagram first day ...  
4                             zoe first love rattled  


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


BERT embeddings computed for sample rows:
                                          clean_text  \
0                 feel today legday jelly aching gym   
1  absolute disgrace two carriages bangor half wa...   
2  valentine nephews elated sometimes little thin...   
3  betterfeelingfilms rt via instagram first day ...   
4                             zoe first love rattled   

                                      bert_embedding  
0  [0.039133992, 0.080824226, 0.06509155, -0.0491...  
1  [-0.12805556, 0.20985381, 0.016016781, -0.1370...  
2  [-0.40478364, 0.22710635, -0.096689515, 0.0115...  
3  [0.08460065, 0.116298795, 0.10878938, -0.14982...  
4  [-0.24930638, 0.17938097, -0.09484068, -0.0503...  


# **5: Build a Text Sentiment Classifier**

In [5]:
# Create feature matrix and target from text data
X_text = np.vstack(df['bert_embedding'].values)
y_text = df['sentiment']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_text, y_text, test_size=0.2, random_state=42)

# Train a logistic regression classifier on text features
clf_text = LogisticRegression(max_iter=1000)
clf_text.fit(X_train, y_train)

# Evaluate the classifier
y_pred = clf_text.predict(X_test)
print("Text Sentiment Classification Report:")
print(classification_report(y_test, y_pred))


Text Sentiment Classification Report:
              precision    recall  f1-score   support

    Negative       0.57      0.61      0.59       284
     Neutral       0.60      0.55      0.57       367
    Positive       0.66      0.68      0.67       323

    accuracy                           0.61       974
   macro avg       0.61      0.61      0.61       974
weighted avg       0.61      0.61      0.61       974



# **6: Image Data Preparation (Computer Vision Component)**

In [6]:
image_folder_path = "/content/drive/MyDrive/twitter_Kaggle/image"

# List to hold image file paths and corresponding sentiment labels
image_paths = []
image_labels = []

# The dataset structure assumes three folders: Negative, Neutral, Positive
for sentiment in ['Negative', 'Neutral', 'Positive']:
    folder_path = os.path.join(image_folder_path, sentiment)
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_paths.append(os.path.join(folder_path, file_name))
            image_labels.append(sentiment)

print("Total images loaded:", len(image_paths))

Total images loaded: 4869


# **7: Extract Image Features Using a Pre-trained CNN**

In [7]:
# Function to load and preprocess an image for ResNet50
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    return img_array

# Load a pre-trained ResNet50 model for feature extraction (without the top classification layers)
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Function to extract features from an image file
def extract_image_features(img_path):
    img_array = load_and_preprocess_image(img_path)
    features = resnet_model.predict(img_array)
    return features.flatten()

# Extract features for each image (for demonstration, this may be slow on large datasets)
image_features = [extract_image_features(path) for path in image_paths]
X_image = np.vstack(image_features)
y_image = np.array(image_labels)

print("Shape of extracted image features:", X_image.shape)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

# **8: Build an Image Sentiment Classifier**

In [8]:
# Split image features into training and testing sets
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(X_image, y_image, test_size=0.2, random_state=42)

# Train a logistic regression classifier on image features
clf_image = LogisticRegression(max_iter=1000)
clf_image.fit(X_train_img, y_train_img)

# Evaluate the image classifier
y_pred_img = clf_image.predict(X_test_img)
print("Image Sentiment Classification Report:")
print(classification_report(y_test_img, y_pred_img))


Image Sentiment Classification Report:
              precision    recall  f1-score   support

    Negative       0.38      0.36      0.37       307
     Neutral       0.39      0.40      0.39       347
    Positive       0.38      0.39      0.39       320

    accuracy                           0.38       974
   macro avg       0.38      0.38      0.38       974
weighted avg       0.38      0.38      0.38       974



# **9: Fusion of Text and Image Features & Final Classification**

In [None]:
# For fusion, we assume that each row in the text dataset has a corresponding image.
# Here, for each text entry, we randomly select one image from the folder that matches its sentiment.

def get_random_image_feature(sentiment):
    folder = os.path.join(image_folder_path, sentiment)
    files = [os.path.join(folder, f) for f in os.listdir(folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    if files:
        random_file = random.choice(files)
        return extract_image_features(random_file)
    else:
        # If no image is found, return a zero vector matching the expected dimension (ResNet50 with pooling avg gives 2048)
        return np.zeros((2048,))

# Combine BERT (text) embeddings and ResNet (image) features for each entry in the DataFrame
combined_features = []
for idx, row in df.iterrows():
    text_feat = row['bert_embedding']
    # Using the sentiment label to select an image from the corresponding folder
    sentiment_label = row['sentiment']
    image_feat = get_random_image_feature(sentiment_label)
    combined = np.concatenate([text_feat, image_feat])
    combined_features.append(combined)

combined_features = np.vstack(combined_features)
y_combined = df['sentiment']

print("Shape of combined (fusion) features:", combined_features.shape)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43

# **10: Train and Evaluate the Fusion Model**

In [12]:
# Split the combined features into training and testing sets
X_train_comb, X_test_comb, y_train_comb, y_test_comb = train_test_split(combined_features, y_combined, test_size=0.2, random_state=42)

# Train a logistic regression classifier on the combined features
clf_fusion = LogisticRegression(max_iter=1000)
clf_fusion.fit(X_train_comb, y_train_comb)

# Evaluate the fusion classifier
y_pred_comb = clf_fusion.predict(X_test_comb)
print("Fusion Model (Text + Image) Classification Report:")
print(classification_report(y_test_comb, y_pred_comb))


Fusion Model (Text + Image) Classification Report:
              precision    recall  f1-score   support

    Negative       0.68      0.69      0.68       284
     Neutral       0.76      0.72      0.74       367
    Positive       0.73      0.75      0.74       323

    accuracy                           0.72       974
   macro avg       0.72      0.72      0.72       974
weighted avg       0.72      0.72      0.72       974

