Reading the File

In [None]:
import numpy as np
import pandas as pd

# Load the review dataset into `df`. The path is relative to the notebook's
# working directory. Later cells read the 'review' and 'label' columns.
df = pd.read_csv('Datasets/ottdata.csv') #Change Path to test code on rest of the datasets
df.head()

In [None]:
# Collect rows whose review is missing, empty, or whitespace-only.
# Missing cells are mapped to '' first: pandas reads them as NaN (a float),
# which has no .strip(), so a per-row `x.strip()` would raise AttributeError.
empty_texts = df[df['review'].fillna('').astype(str).str.strip().eq('')]

print("Number of empty text rows:", len(empty_texts))
if not empty_texts.empty:
    print("Empty text rows:")
    print(empty_texts)

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

Loading SenticNet7

In [None]:
import pandas as pd

def load_sentic_word_from_excel(filename):
    """Load SenticNet polarity scores from an Excel file into a dictionary.

    Args:
        filename: Path to a workbook (read with pandas' default options) that
            has a ``CONCEPT`` column and a ``POLARITY INTENSITY`` column.

    Returns:
        dict mapping each ``CONCEPT`` value to its ``POLARITY INTENSITY``.
        If a concept appears on several rows, the last row wins.

    Raises:
        KeyError: if either required column is missing.
    """
    sentic_df = pd.read_excel(filename)
    # Build the mapping column-wise. Iterating with DataFrame.iterrows creates
    # a Series object per row, which is very slow on a lexicon-sized sheet.
    return dict(zip(sentic_df['CONCEPT'], sentic_df['POLARITY INTENSITY']))

# Load SenticNet from an Excel file into the `senticNet` concept -> polarity dict.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of senticnet7.xlsx.
senticNet = load_sentic_word_from_excel('/Users/prathanaphukon/Desktop/Complete Final Code/senticnet7.xlsx')

Computing the weighted Adjacency Matrices for each review

In [None]:
import spacy

# Load spaCy's small English pipeline once; it is reused below for dependency
# parsing (adjacency matrices) and for tokenizing the reviews.
nlp = spacy.load('en_core_web_sm')

def dependency_adj_matrix(review, nlp, senticNet):
    """Build a symmetric, sentiment-weighted dependency adjacency matrix.

    The review is parsed with ``nlp``; punctuation and whitespace tokens are
    dropped and the remaining tokens become the rows/columns, in document
    order. For every head -> child dependency between two kept tokens the
    entry is ``dij * (sij + tij + 1)`` where

    * ``dij`` is 1 (a direct dependency exists),
    * ``sij`` is the sum of the two tokens' ``senticNet`` scores, looked up by
      lower-cased lemma (0 when the lemma is not in the dictionary),
    * ``tij`` is 1 if either token has the POS tag ``NOUN``, else 0.

    The value is written to both ``[i, j]`` and ``[j, i]`` and the diagonal is
    set to 1 afterwards.

    Args:
        review: Text of one review.
        nlp: Callable returning an iterable of spaCy-like tokens (needs
            ``is_punct``, ``is_space``, ``lemma_``, ``pos_``, ``children``, ``i``).
        senticNet: dict mapping a concept string to a numeric polarity.

    Returns:
        ``float32`` numpy array of shape ``(n_tokens, n_tokens)``.
    """
    doc = nlp(review)
    tokens = [token for token in doc if not token.is_punct and not token.is_space]
    seq_len = len(tokens)
    matrix = np.zeros((seq_len, seq_len), dtype='float32')

    # Document index -> matrix position for the kept tokens. This replaces the
    # `child in tokens` / `tokens.index(child)` list scans, which made every
    # edge lookup linear in the review length.
    position_of = {token.i: position for position, token in enumerate(tokens)}

    for i, token in enumerate(tokens):
        token_sentic = senticNet.get(token.lemma_.lower(), 0)
        token_is_aspect = token.pos_ == 'NOUN'

        for child in token.children:
            j = position_of.get(child.i)
            if j is None:
                # The child was filtered out (punctuation / whitespace).
                continue

            child_sentic = senticNet.get(child.lemma_.lower(), 0)
            child_is_aspect = child.pos_ == 'NOUN'

            dij = 1  # There's a direct dependency
            sij = token_sentic + child_sentic
            tij = 1 if (token_is_aspect or child_is_aspect) else 0

            # NOTE: the result is 0 when sij + tij == -1, so such a pair is
            # indistinguishable from "no edge" for consumers that only look
            # at non-zero entries.
            adjacency_value = dij * (sij + tij + 1)

            matrix[i, j] = adjacency_value
            matrix[j, i] = adjacency_value

    np.fill_diagonal(matrix, 1)
    return matrix

# Build one adjacency matrix per row of df['review'] (the column name is lower-case)
# and store the matrices in a new 'adjacency_matrix' column.
df['adjacency_matrix'] = df['review'].apply(lambda review: dependency_adj_matrix(review, nlp, senticNet))

Saving the adjacency matrices

In [None]:
import pickle

# Save the adjacency-matrix column (a pandas Series of numpy arrays) to a pickle file
with open('adjacency_matrices_ott.pkl', 'wb') as f:
    pickle.dump(df['adjacency_matrix'], f)

In [None]:
import pickle

# Load the adjacency matrices back from the pickle file written above.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('adjacency_matrices_ott.pkl', 'rb') as f:
    adjacency_matrices = pickle.load(f)

In [None]:
# Check the first few adjacency matrices to understand their structure:
# print the array shape of the first entries of the Series.
print("Sample adjacency matrix shapes:")
print(adjacency_matrices.apply(lambda x: np.array(x).shape).head())

In [None]:
# Number of matrices in the Series, then the entry stored under index label 0.
print(adjacency_matrices.shape)
print(adjacency_matrices[0])

In [None]:
import numpy as np

# `adjacency_matrices` is the pandas Series loaded from the pickle: one matrix per review.
print(adjacency_matrices.shape)

# Process at most the first 5 adjacency matrices (fewer when the dataset is smaller,
# instead of failing on a missing entry).
for idx in range(min(5, len(adjacency_matrices))):  # Adjust here to select how many matrices to process
    # .iloc selects by position, so this works even if the Series index is not 0..n-1
    adj_matrix = adjacency_matrices.iloc[idx]

    # Extract the first 5 rows
    first_5_rows = adj_matrix[:5, :]

    if first_5_rows.size == 0:
        # np.max / np.min raise ValueError on an empty array (review with no tokens)
        print(f"Matrix {idx + 1}: empty matrix, no values to summarise")
        continue

    # Find the maximum and minimum values in these rows
    max_value = np.max(first_5_rows)
    min_value = np.min(first_5_rows)

    print(f"Matrix {idx + 1}: Max value in the first 5 rows is {max_value}")
    print(f"Matrix {idx + 1}: Min value in the first 5 rows is {min_value}")

GloVe Embeddings for Review Tokens

In [None]:
#Load Glove Embeddings
def load_glove_embeddings(path):
    """Read a GloVe text file into a ``word -> vector`` dictionary.

    Each line is expected to hold a word followed by its whitespace-separated
    coefficients.

    Args:
        path: Path to the UTF-8 encoded embedding file.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.

    Raises:
        ValueError: if a coefficient cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line (e.g. a trailing newline) or a word without a
                # vector: skip it instead of failing on values[0] or storing
                # an empty vector.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Word -> vector dictionary built from the 300-dimensional GloVe 6B file
# (expected in the notebook's working directory).
glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')

In [None]:
def tokenize_reviews(reviews):
    """Tokenize reviews with the notebook-level spaCy pipeline ``nlp``.

    Punctuation and whitespace tokens are dropped. This is the same filter
    ``dependency_adj_matrix`` applies, so each token list lines up with the
    rows of that review's adjacency matrix.

    Args:
        reviews: Iterable of review strings.

    Returns:
        list with one list of token texts per review, in input order.
    """
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the docs in input order; it is much faster than one nlp() call per review.
    return [
        [token.text for token in doc if not token.is_punct and not token.is_space]
        for doc in nlp.pipe(reviews)
    ]

# One list of token texts per review, stored next to the raw text.
df['tokenized_reviews'] = tokenize_reviews(df['review'])

In [None]:
import numpy as np

def get_embeddings(tokens, embeddings_index):
    """Convert token lists into per-review embedding matrices.

    Args:
        tokens: Iterable of token lists (one list of strings per review).
        embeddings_index: dict mapping a word to its 1-D embedding vector.

    Returns:
        list with one numpy array per review, of shape
        ``(n_tokens, embedding_dim)``. Tokens that are not in the dictionary
        (neither as written nor lower-cased) get an all-zero row; a review
        with no tokens yields an array of shape ``(0, embedding_dim)``.
    """
    # Take the dimensionality from the table itself rather than assuming 300;
    # 300 is only the fallback for an empty dictionary.
    sample_vector = next(iter(embeddings_index.values()), None)
    embedding_dim = len(sample_vector) if sample_vector is not None else 300
    # float32 like the loaded vectors, so a single missing word does not
    # silently upcast the whole review matrix to float64.
    missing_vector = np.zeros(embedding_dim, dtype='float32')

    embedded_docs = []
    for doc in tokens:
        doc_embeddings = []
        for word in doc:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is None:
                # The glove.6B vocabulary is lower-cased while the tokens keep
                # their original casing, so retry with the lower-cased form.
                embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                embedding_vector = missing_vector  # Fill with zeros for missing embeddings
            doc_embeddings.append(embedding_vector)

        if doc_embeddings:
            embedded_docs.append(np.array(doc_embeddings))
        else:
            # Keep the result 2-D even when a review has no tokens.
            embedded_docs.append(np.zeros((0, embedding_dim), dtype='float32'))
    return embedded_docs

# Get embeddings for each token in the reviews: one (n_tokens, dim) array per row.
df['glove_embedding'] = get_embeddings(df['tokenized_reviews'], glove_embeddings)

In [None]:
# Check the first few GloVe embedding arrays to understand their structure
print("Sample glove embedding shapes:")
print(df['glove_embedding'].apply(lambda x: np.array(x).shape).head())

In [None]:
# Number of reviews, then the embedding array stored under index label 0.
print(df['glove_embedding'].shape)
print(df['glove_embedding'][0])

In [None]:
import pickle

# Save the per-review GloVe embedding column (a pandas Series) to a pickle file
with open('glove_ott.pkl', 'wb') as f:
    pickle.dump(df['glove_embedding'], f)

In [None]:
import pickle

# Load the per-review embedding Series back from the pickle file.
# NOTE: this rebinds `glove_embeddings`, which until here held the word -> vector
# dictionary returned by load_glove_embeddings. Re-running the get_embeddings
# cell after this one would look tokens up in the Series, not the dictionary.
with open('glove_ott.pkl', 'rb') as f:
    glove_embeddings = pickle.load(f)

In [None]:
# Sanity-check the reloaded Series: its length, then the shape and contents of
# the entry under index label 1, then the Series dtype.
print(glove_embeddings.shape)
print(glove_embeddings[1].shape)
print(glove_embeddings[1])
print(glove_embeddings.dtype)

In [None]:
print(glove_embeddings.shape)  # Shape of the reloaded pandas Series (one entry per review)
print(glove_embeddings.head())  # To see the top rows

LSTM Node Features

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

class ReviewDataset(Dataset):
    """Dataset pairing each review's token-embedding matrix with its label."""

    def __init__(self, embeddings, labels):
        """
        Args:
            embeddings (iterable): One array-like or torch.Tensor per review.
                Anything that is not already a tensor is converted to a
                float32 tensor; existing tensors are stored as they are.
            labels (iterable): Corresponding labels, in the same order as
                ``embeddings``.
        """
        self.embeddings = [
            e if isinstance(e, torch.Tensor) else torch.tensor(e, dtype=torch.float32)
            for e in embeddings
        ]
        # Materialise the labels as a list so __getitem__ is purely positional.
        # Indexing a pandas Series with an int is label-based and breaks (or
        # returns the wrong row) when the index is not 0..n-1, e.g. after
        # filtering rows out of the DataFrame.
        self.labels = list(labels)

    def __len__(self):
        """Number of reviews."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return ``(embedding_tensor, label)`` for position ``idx``."""
        return self.embeddings[idx], self.labels[idx]

In [None]:
def collate_fn(batch):
    """Collate ``(embedding, label)`` pairs into one padded batch.

    Returns:
        tuple ``(padded, labels, lengths)``: the sequences zero-padded to the
        longest one in the batch (batch dimension first), the labels as a
        long tensor, and each sequence's original length as a long tensor.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
    seq_lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long)
    return padded, torch.tensor(targets, dtype=torch.long), seq_lengths

In [None]:
# Class labels as a pandas Series aligned with the rows of `df`.
labels = df['label']

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class BiLSTM(nn.Module):
    """Bidirectional LSTM that maps each token embedding to a same-sized,
    tanh-bounded feature vector."""

    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        """
        Args:
            input_dim: Size of each input (and output) token vector.
            hidden_dim: Hidden size of the LSTM, per direction.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout probability passed to ``nn.LSTM``.
        """
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Bidirectional LSTM
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout, bidirectional=True)

        # Fully connected layer to transform back to input feature size
        self.fc = nn.Linear(hidden_dim * 2, input_dim)  # x2 for bidirection

    def forward(self, x, lengths):
        """
        Args:
            x: Padded batch of shape ``(batch, seq_len, input_dim)``.
            lengths: True length of every sequence (tensor or list of ints).

        Returns:
            Tensor of shape ``(batch, seq_len, input_dim)`` with values in
            ``[-1, 1]``. Rows beyond a sequence's length carry no information
            (they are ``tanh`` of the linear layer's bias).
        """
        # pack_padded_sequence requires the lengths to live on the CPU.
        lengths = torch.as_tensor(lengths).cpu()

        # Pack the padded sequence so the LSTM skips the padding
        x_packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        # Process with LSTM (final hidden/cell states are not needed)
        packed_output, _ = self.lstm(x_packed)

        # Unpack to exactly the input's padded length. Without total_length
        # the result is only as long as the longest sequence in `lengths`,
        # which no longer matches `x` when the batch was padded further.
        output, _ = pad_packed_sequence(packed_output, batch_first=True,
                                        total_length=x.size(1))

        # Pass the output through a linear layer and use tanh to keep the output bounded
        return torch.tanh(self.fc(output))

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Setup DataLoader and Model
# --------------------------
# `glove_embeddings` is the per-review embedding Series reloaded from disk and
# `labels` the label column. shuffle=False keeps the batches in dataset order.
dataset = ReviewDataset(glove_embeddings, labels)  # assuming glove_embeddings and labels are defined
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)

# Define the Bi-LSTM Model
model = BiLSTM(input_dim=300, hidden_dim=150, num_layers=2, dropout=0.5)
if torch.cuda.is_available():
    model.cuda()  # Move model to GPU if available

# Loss Function and Optimizer
# ---------------------------
# The model is trained to reconstruct its own input embeddings (the labels are
# not used by the loss).
# NOTE(review): the model output is tanh-bounded to [-1, 1]; confirm the target
# embeddings stay in that range.
criterion = torch.nn.MSELoss()  # This should be chosen based on your specific task
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
# -------------
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    batch_count = 0

    for embeddings_batch, labels_batch, lengths in dataloader:
        if torch.cuda.is_available():
            embeddings_batch = embeddings_batch.cuda()  # Move data to GPU if available
            labels_batch = labels_batch.cuda()

        optimizer.zero_grad()

        outputs = model(embeddings_batch, lengths)

        # Compute the loss on real tokens only. Padded rows of the target are
        # all zeros; scoring the model on them would reward predicting zeros
        # and would weight a batch by how much padding it happens to contain.
        positions = torch.arange(embeddings_batch.size(1), device=embeddings_batch.device)
        is_real_token = positions.unsqueeze(0) < lengths.to(embeddings_batch.device).unsqueeze(1)
        loss = criterion(outputs[is_real_token], embeddings_batch[is_real_token])

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batch_count += 1

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty
    average_loss = total_loss / max(batch_count, 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}")

In [None]:
import torch
import pickle

# Ensure the model is in evaluation mode
model.eval()

# Collect embeddings without altering their shapes
unbatched_embeddings = []

# Disable gradient computation for inference
with torch.no_grad():
    # The dataloader was created with shuffle=False, so the outputs come back
    # in dataset order and line up with the adjacency matrices and labels.
    for embeddings_batch, labels_batch, lengths in dataloader:
        if torch.cuda.is_available():
            embeddings_batch = embeddings_batch.cuda()

        # Compute embeddings
        outputs = model(embeddings_batch, lengths)

        if torch.cuda.is_available():
            outputs = outputs.cpu()  # If using GPU, transfer outputs back to CPU

        # Append the outputs to the list, respecting the original batch separation
        for i in range(len(outputs)):
            # Only keep rows up to the original length of each sequence.
            # .clone() detaches the slice from the padded batch tensor: a plain
            # slice is a view that keeps the whole batch alive in memory and is
            # serialised together with that full storage when pickled.
            unbatched_embeddings.append(outputs[i][:lengths[i]].clone())

# Now, `unbatched_embeddings` is a list of tensors where each tensor corresponds to an input sequence

In [None]:
# Print the shape of every per-review output tensor (numbered from 1) to verify
i = -1  # placeholder so the loop variables keep their original names
for i, emb in enumerate(unbatched_embeddings):
    position = i + 1
    print(f"Shape of embedding {position}: {emb.shape}")

Saving the LSTM Node Features

In [None]:
# Save the unbatched embeddings (list of per-review tensors) to a pickle file
with open('unbatched_embeddings.pkl', 'wb') as f:
    pickle.dump(unbatched_embeddings, f)

print("Unbatched embeddings have been saved to 'unbatched_embeddings.pkl'.")

In [None]:
import pickle

# Load the BiLSTM node features back from the pickle file into `lstm_embeddings`.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('unbatched_embeddings.pkl', 'rb') as f:
    lstm_embeddings = pickle.load(f)

In [None]:
#Save as a torch object if file is of a larger size

# import torch

# # Save the embeddings to a file 
# torch.save(unbatched_embeddings, '40lstm_embeddings.pt')

In [None]:
# import torch

# # Load the embeddings from a file
# lstm_embeddings = torch.load('40lstm_embeddings.pt')

Preparing Data

In [None]:
from torch_geometric.data import Data
import torch
import numpy as np

def create_data_objects(adjacency_matrices, loaded_embeddings, labels):
    """Build one graph ``Data`` object per review.

    Args:
        adjacency_matrices: Iterable of square adjacency matrices.
        loaded_embeddings: Iterable of node-feature matrices (tensor or
            array-like), one per review, aligned with ``adjacency_matrices``.
        labels: Iterable of scalar labels, aligned with the other two.

    Returns:
        list of ``Data`` objects with ``x`` (float node features),
        ``edge_index`` (long tensor of shape ``(2, n_edges)`` listing every
        non-zero adjacency entry, diagonal included) and ``y`` (float tensor
        holding the single label). The adjacency values themselves are not
        stored, only which entries are non-zero.
    """
    data_list = []

    # Iterate over the `loaded_embeddings` argument. Reading the notebook-level
    # `lstm_embeddings` here would silently ignore whatever the caller passed
    # (and fail with NameError when that global does not exist).
    for adj_matrix, node_features, label in zip(adjacency_matrices, loaded_embeddings, labels):
        rows, cols = np.nonzero(np.asarray(adj_matrix))
        edge_index = torch.tensor(np.stack([rows, cols]).astype(np.int64), dtype=torch.long)

        if isinstance(node_features, torch.Tensor):
            # Copy without re-wrapping an existing tensor in torch.tensor(),
            # which emits a UserWarning for every graph.
            x = node_features.detach().clone().to(torch.float)
        else:
            x = torch.tensor(node_features, dtype=torch.float)

        y = torch.tensor([float(label)], dtype=torch.float)  # Make sure label is a tensor

        data = Data(x=x, edge_index=edge_index, y=y)
        data_list.append(data)

    return data_list

In [None]:
# Rebind `labels` from the pandas Series to a float tensor of the same values.
labels = torch.tensor(df['label'].values, dtype=torch.float)

# Combine adjacency matrices, BiLSTM node features and labels into graph objects.
data_list = create_data_objects(adjacency_matrices, lstm_embeddings, labels)

In [None]:
def verify_data_list(data_list):
    """Print a short summary of every graph in ``data_list``.

    For each graph: the shapes of ``x`` and ``edge_index``, the node and edge
    counts, and the label (as a plain value when ``y`` has one element,
    otherwise the whole tensor), followed by a blank line.
    """
    for graph_id, graph in enumerate(data_list):
        print(f"Graph {graph_id}:")
        # Shapes of the node feature matrix and of the edge index matrix
        print(f"  - Node features shape (x): {graph.x.shape}")
        print(f"  - Edge index shape (edge_index): {graph.edge_index.shape}")
        # Total number of nodes and edges
        print(f"  - Number of nodes: {graph.num_nodes}")
        print(f"  - Number of edges: {graph.num_edges}")
        single_label = graph.y.numel() == 1
        if not single_label:
            print(f"  - Labels: {graph.y}")
        else:
            print(f"  - Label Value: {graph.y.item()}")
        # Blank line separating consecutive graphs
        print()

# Call the function to print the data shapes and counts.
# This prints several lines for every graph in `data_list`.
verify_data_list(data_list)

Save the graphs

In [None]:
# Persist the list of graph objects so the GCN cells can be run without rebuilding it.
with open('data_list_ott.pkl', 'wb') as f:
    pickle.dump(data_list, f)

In [None]:
import pickle

# Load the list of graph objects back from the pickle file.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('data_list_ott.pkl', 'rb') as f:
    data_list = pickle.load(f)

Preliminary Analysis

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

def plot_graph_statistics(data_list):
    """Plot histograms of basic structural statistics over all graphs.

    Six panels are drawn on a 3x2 grid: node counts and edge counts per graph,
    and, pooled over the nodes of all graphs, degree, betweenness centrality,
    closeness centrality and degree centrality.

    Args:
        data_list: Iterable of graph objects exposing ``num_nodes``,
            ``num_edges`` and an ``edge_index`` tensor of shape (2, n_edges).
    """
    node_counts = [data.num_nodes for data in data_list]
    edge_counts = [data.num_edges for data in data_list]

    # Per-node statistics pooled over every graph
    all_degrees = []
    all_betweenness = []
    all_closeness = []
    all_degree_centrality = []

    for data in data_list:
        G = nx.Graph()  # Assuming undirected graphs; use nx.DiGraph() for directed graphs
        # Add all (src, dst) pairs in one call instead of one .item() round
        # trip per edge endpoint.
        G.add_edges_from(data.edge_index.t().tolist())

        # NOTE(review): if edge_index contains self-loops (the adjacency
        # matrices built in this notebook have a non-zero diagonal), networkx
        # counts each self-loop twice in a node's degree.
        all_degrees.extend(degree for _, degree in G.degree())
        all_betweenness.extend(nx.betweenness_centrality(G).values())
        all_closeness.extend(nx.closeness_centrality(G).values())
        all_degree_centrality.extend(nx.degree_centrality(G).values())
        # Harmonic centrality is intentionally not computed: it was calculated
        # for every graph but never plotted or returned.

    # (values, colour, title, x-axis label) for each panel, in grid order
    panels = [
        (node_counts, 'skyblue', 'Distribution of Node Counts', 'Number of Nodes'),
        (edge_counts, 'salmon', 'Distribution of Edge Counts', 'Number of Edges'),
        (all_degrees, 'green', 'Distribution of Node Degrees', 'Degree'),
        (all_betweenness, 'purple', 'Betweenness Centrality', 'Centrality'),
        (all_closeness, 'orange', 'Closeness Centrality', 'Centrality'),
        (all_degree_centrality, 'blue', 'Degree Centrality', 'Centrality'),
    ]

    # Plot the graphs
    plt.figure(figsize=(12, 12))
    for panel_number, (values, color, title, xlabel) in enumerate(panels, start=1):
        plt.subplot(3, 2, panel_number)
        plt.hist(values, bins=30, color=color)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

plot_graph_statistics(data_list)

Splitting the Data

In [None]:
from torch_geometric.data import DataLoader
import numpy as np

def train_val_test_split(data_list, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Randomly split ``data_list`` into train, validation and test lists.

    Args:
        data_list: Sequence of items to split.
        train_ratio: Fraction of items for training (rounded down).
        val_ratio: Fraction of items for validation (rounded down).
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the shuffle uses numpy's global random state, as before.

    Returns:
        tuple ``(train_dataset, val_dataset, test_dataset)`` of lists; the
        test list receives every item not assigned to the other two.

    Raises:
        ValueError: if a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio and val_ratio must be non-negative and sum to at most 1")

    total_size = len(data_list)
    train_end = int(total_size * train_ratio)
    val_end = train_end + int(total_size * val_ratio)

    # Shuffle data indices
    if seed is None:
        indices = np.random.permutation(total_size)
    else:
        indices = np.random.default_rng(seed).permutation(total_size)

    train_dataset = [data_list[i] for i in indices[:train_end]]
    val_dataset = [data_list[i] for i in indices[train_end:val_end]]
    test_dataset = [data_list[i] for i in indices[val_end:]]

    return train_dataset, val_dataset, test_dataset

# Split the graphs 80 / 10 / 10 (the function's default ratios).
# The shuffle is random, so the split differs between runs.
train_dataset, val_dataset, test_dataset = train_val_test_split(data_list)

In [None]:
# Define the batch size
batch_size = 64

# Create DataLoaders for each dataset. Only the training loader shuffles.
# `DataLoader` here is the torch_geometric class imported in the previous cell;
# it rebinds the name imported earlier from torch.utils.data.
# NOTE(review): newer torch_geometric releases expose this class as
# torch_geometric.loader.DataLoader; confirm against the installed version.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

GCN Model Training

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GCN(torch.nn.Module):
    """Three-layer graph convolutional classifier for whole graphs.

    Node features pass through GCNConv layers of width 64, 128 and 256 (ReLU
    after each, dropout of 0.5 after the first two), are mean-pooled per
    graph, and are mapped by a linear layer to ``num_classes`` log-probabilities.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        widths = (64, 128, 256)
        # Layers are created in the same order and under the same attribute
        # names as before, so saved state_dicts stay compatible.
        self.conv1 = GCNConv(num_features, widths[0])
        self.conv2 = GCNConv(widths[0], widths[1])
        self.conv3 = GCNConv(widths[1], widths[2])
        self.dropout = torch.nn.Dropout(0.5)
        # Classifier head sized to the last convolution's output
        self.fc = torch.nn.Linear(widths[2], num_classes)

    def forward(self, data):
        """Return per-graph log-probabilities of shape (num_graphs, num_classes)."""
        h, edges, graph_ids = data.x, data.edge_index, data.batch

        # First two convolutions: ReLU followed by dropout
        h = self.dropout(F.relu(self.conv1(h, edges)))
        h = self.dropout(F.relu(self.conv2(h, edges)))

        # Third convolution: ReLU only
        h = F.relu(self.conv3(h, edges))

        # Average node features within each graph, then classify
        graph_repr = global_mean_pool(h, graph_ids)
        logits = self.fc(graph_repr)

        return F.log_softmax(logits, dim=1)


In [None]:
def train(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    The model is switched to training mode; for every batch the gradients are
    reset, the loss between ``model(batch)`` and ``batch.y`` (cast to long)
    is back-propagated, and one optimizer step is taken.
    """
    model.train()
    running_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch), batch.y.long())
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

def evaluate(model, loader, criterion):
    """Evaluate without gradients.

    Returns:
        tuple ``(mean loss per batch, accuracy)`` where accuracy is the number
        of correct argmax predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    loss_sum, hits = 0, 0
    with torch.no_grad():
        for batch in loader:
            scores = model(batch)
            loss_sum += criterion(scores, batch.y.long()).item()
            hits += int((scores.argmax(dim=1) == batch.y).sum())
    mean_loss = loss_sum / len(loader)
    accuracy = hits / len(loader.dataset)
    return mean_loss, accuracy

# Binary graph classifier on 300-dimensional node features. NLLLoss matches
# the log-probabilities returned by GCN.forward.
model = GCN(num_features=300, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = torch.nn.NLLLoss()

# Early Stopping and Training Loop
best_val_loss = float('inf')
patience = 10  # epochs without validation-loss improvement before stopping
patience_counter = 0

for epoch in range(200):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    # evaluate() returns accuracy as a fraction in [0, 1]; scale it before
    # printing it with a '%' sign (0.87 used to be shown as "0.8700%").
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc * 100:.2f}%')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Checkpoint the weights with the lowest validation loss so far
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        # >= rather than ==, so stopping still triggers if the counter ever
        # starts above the threshold (e.g. patience lowered between runs).
        if patience_counter >= patience:
            print("Stopping early due to no improvement")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

Testing the Model

In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def test_model(model, loader):
    model.eval()
    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:
            out = model(data)
            pred = out.argmax(dim=1)
            predictions.extend(pred.cpu().numpy())
            labels.extend(data.y.cpu().numpy())

    return predictions, labels

# Load the best model (checkpoint written by the training loop) and run it on
# the test split. This rebinds `labels` to the list of true test labels.
model.load_state_dict(torch.load('best_model.pt'))
predictions, labels = test_model(model, test_loader)

In [None]:
# Test-set metrics. average='binary' scores the positive class only.
accuracy = accuracy_score(labels, predictions)
precision = precision_score(labels, predictions, average='binary')
recall = recall_score(labels, predictions, average='binary')
f1 = f1_score(labels, predictions, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrix(labels, predictions):
    """Draw the confusion matrix of ``predictions`` against ``labels`` in blue tones."""
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(labels, predictions)
    )
    matrix_display.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

plot_confusion_matrix(labels, predictions)

In [None]:
from sklearn.metrics import precision_recall_curve, auc

def plot_precision_recall_curve(labels, predictions):
    """Plot the precision-recall curve and report its area in the legend.

    Args:
        labels: True binary labels.
        predictions: Scores for the positive class (the cell below passes
            class-1 probabilities).
    """
    precision, recall, _ = precision_recall_curve(labels, predictions)
    curve_label = f'Precision-Recall curve (area = {auc(recall, precision):.2f})'

    plt.figure()
    plt.plot(recall, precision, label=curve_label)
    plt.title('Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc="best")
    plt.show()

# Collect the predicted probability of class 1 for every test graph.
model.eval()
probabilities = []

with torch.no_grad():
    for data in test_loader:
        out = model(data)
        # `out` holds log-probabilities (GCN.forward ends in log_softmax);
        # softmax over them recovers the class probabilities. Column 1 is class 1.
        prob = torch.softmax(out, dim=1)[:, 1]  # Probabilities for class 1
        probabilities.extend(prob.cpu().numpy())

plot_precision_recall_curve(labels, probabilities)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_curve, auc
import torch

def plot_curves(labels, probabilities):
    """Plot the ROC curve with its AUC and the chance diagonal.

    Args:
        labels: True binary labels.
        probabilities: Predicted scores for the positive class.
    """
    # Compute ROC curve and AUC
    fpr, tpr, _ = roc_curve(labels, probabilities)
    roc_auc = auc(fpr, tpr)

    # Plotting ROC Curve on its own figure. The earlier plt.subplot(1, 2, 2)
    # selected the right-hand slot of a two-panel layout whose left panel was
    # never drawn, leaving half of the figure empty.
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance level
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    plt.show()

# Assuming model and test_loader are defined and properly configured.
# This repeats the probability collection of the precision-recall cell, so the
# ROC cell can be run on its own.
model.eval()
probabilities = []

with torch.no_grad():
    for data in test_loader:
        out = model(data)
        # `out` holds log-probabilities; softmax over them recovers the class
        # probabilities. Column 1 is class 1.
        prob = torch.softmax(out, dim=1)[:, 1]  # Probabilities for class 1
        probabilities.extend(prob.cpu().numpy())

plot_curves(labels, probabilities)