In [1]:
!pip install imbalanced-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import google
from google.colab import drive
drive.mount('/content/drive')

encode = LabelEncoder()

# the MBTI 500 dataset path in my drive folder
df = pd.read_csv('/content/drive/MyDrive/Dataset_DSAPP/MBTI 500.csv')


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3
Mounted at /content/drive
the type wise distributionof data: 

type
INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: count, dtype: int64
type
ENTP    6629
INTP    6629
ESTJ    6629
ESTP    6629
ENTJ    6629
ESFJ    6629
ISFJ    6629
ENFJ    6629
INFJ    6629
INFP    6629
ISFP    6629
ESFP    6629
ENFP    6629
ISTJ    6629
INTJ    6629
ISTP    6629
Name: count, dtype: int64


In [2]:
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculating average number of words in post
df['wordsNoAvg'] = df['posts'].apply(lambda x: np.mean([len(item.split()) for item in x.split('|||')]))

def build_graph_with_extracted_features(df, mbtitype):
    # dataframe for each MBTI type
    df_part = df[df['type'] == mbtitype]


    #  TF-IDF matrix features from text in posts
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf = vectorizer.fit_transform(df_part['posts'])

    # cosine similarity matrix calculation from tfidf
    cos = cosine_similarity(tfidf)

    # preparing average no of words in post feature
    features = np.hstack((tfidf.toarray(), df_part[['wordsNoAvg']].values))

    # node features tensor
    nodeOfeatures = torch.tensor(features, dtype=torch.float)
    labels = torch.tensor(df_part['numeric_type'].values, dtype=torch.long)

    # Creating edge indices based on cosine similarity (threshold for similarity)
    index_for_edge= []
    thresh = 0.5  # experimental threshold
    for i in range(len(df_part)):
        for j in range(i + 1, len(df_part)):
            if cos[i, j] > thresh:
                index_for_edge.append([i, j])
                index_for_edge.append([j, i])
    index_for_edge = torch.tensor(index_for_edge, dtype=torch.long).t().contiguous()

    return Data(x=nodeOfeatures, edge_index=index_for_edge, y=labels)




In [3]:
!pip install torch_geometric
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
import os

# Converting PyTorch Geometric data to NetworkX graph for visualization
def drawing_graph(data, title):
    G = to_networkx(data, to_undirected=True)
    plt.figure(figsize=(10, 10))
    p = nx.spring_layout(G, seed=42)
    nx.draw(G, p, node_size=50, node_color=data.y.numpy(), cmap=plt.get_cmap('Set1'), with_labels=False)
    plt.title(title)
    plt.show()
    plt.close()



Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.5/64.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m51.2/64.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m633.5 kB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from torch_geometric)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->torch_geometric)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->torch_geometric)
  Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x

In [4]:
# For each type , the graph gets stored in dictionary
# key of dictionary: type
# value of dictionary : graph
# Visualizing each graph
mbs = df['type'].unique()
graphs = {}

for mb in mbs:
    graphs[mb] = build_graph_with_extracted_features(df, mb)
    title = str(mb) + '_Graph'
    drawing_graph(graphs[mb], title)




In [7]:
# Shiuffling and combining graphs  to construct graph dataset

def combine_graphs(graphs):
    nodes = []
    edges = []
    labels = []

    offset = 0
    for graph in graphs.values():
        num_nodes = graph.x.size(0) # number of nodes

        # Adjusting edge indices to account for combined node indexing
        edge = graph.edge_index + offset
        edges.append(edge)

        nodes.append(graph.x)
        labels.append(graph.y)

        offset += num_nodes

    combnodes = torch.cat(nodes, dim=0)
    combedges = torch.cat(edges, dim=1)
    comblabels = torch.cat(labels, dim=0)

    return Data(x=combnodes, edge_index=combedges, y=comblabels)

combgraph = combine_graphs(graphs)

# permuting the combined graph data to ensure shuffling of data
num_nodes = combgraph.x.size(0)
permute = torch.randperm(num_nodes)
shuffnodes = combgraph.x[permute]
shufflabels = combgraph.y[permute]
shuff_edge_index = combgraph.edge_index

# Turning into final shuffled graph
shuffgraph = Data(x=shuffnodes, edge_index=shuff_edge_index, y=shufflabels)


In [8]:
# data split
# Training dataset size: 60%
# Validation dataset size: 20%
# Test dataset size: 20%
def splittingdata(graph, train_ratio=0.6, val_ratio=0.2):
    num_nodes = graph.x.size(0)
    tr_size = int(train_ratio * num_nodes)
    v_size = int(val_ratio * num_nodes)
    te_size = num_nodes - train_size - val_size

    tr_mask = torch.zeros(num_nodes, dtype=torch.bool)
    v_mask = torch.zeros(num_nodes, dtype=torch.bool)
    te_mask = torch.zeros(num_nodes, dtype=torch.bool)

    tr_mask[:tr_size] = True
    v_mask[tr_size:tr_size + v_size] = True
    te_mask[tr_size + v_size:] = True

    return tr_mask, v_mask, te_mask

train_mask, val_mask, test_mask = splittingdata(shuffgraph)

# Assigning masks to the graph data
shuffgraph.train_mask = train_mask
shuffgraph.val_mask = val_mask
shuffgraph.test_mask = test_mask


In [9]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx, to_networkx
from torch_geometric.nn import GATConv, global_mean_pool
import torch.nn as nn
from torch_geometric.loader import DataLoader
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, log_loss, matthews_corrcoef
import matplotlib.pyplot as plt



In [10]:
# Defining the GATClassifier
class GATClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, num_heads, n_classes):
        super(GATClassifier, self).__init__()
        self.conv1 = GATConv(in_dim, hidden_dim, heads=num_heads, dropout=0.6)
        self.conv2 = GATConv(hidden_dim * num_heads, hidden_dim, heads=1, concat=False, dropout=0.6)
        self.fc = nn.Linear(hidden_dim, n_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = self.conv2(x, edge_index)
        return x

# Initializing the model, optimizer, and loss function
in_dim = shuffgraph.x.shape[1]
hidden_dim = 64
num_heads = 8
n_classes = len(encode.classes_)
model = GATClassifier(in_dim, hidden_dim, num_heads, n_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

# Define the training loop
def train(model, optimizer, criterion, data, train_mask):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# Define the evaluation function
def evaluate(model, criterion, data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data)
        loss = criterion(out[mask], data.y[mask]).item()
        pred = out[mask].max(dim=1)[1]
        correct = pred.eq(data.y[mask]).sum().item()
        accuracy = correct / mask.sum().item()
    return loss, accuracy,out[mask]






In [13]:
# Training and validation
n_epochs = 100
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(n_epochs):
    train_loss = train(model, optimizer, criterion, shuffgraph, shuffgraph.train_mask)
    val_loss, val_accuracy, _ = evaluate(model, criterion, shuffgraph, shuffgraph.val_mask)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    if epoch % 10 == 0:
        print(f"Epoch {epoch:03d}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Evaluation
test_loss, test_accuracy, test_logits = evaluate(model, criterion, shuffgraph, shuffgraph.test_mask)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Calculating F1 score
test_pred = test_logits.max(dim=1)[1]
test_f1 = f1_score(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu(), average='weighted')
print(f"Test F1 Score: {test_f1:.4f}")

# Calculating Precision
test_precision = precision_score(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu(), average='weighted')
print(f"Test Precision: {test_precision:.4f}")

# Calculating Recall
test_recall = recall_score(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu(), average='weighted')
print(f"Test Recall: {test_recall:.4f}")

# Calculaing Matthews Correlation Coefficient
test_mcc = matthews_corrcoef(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu())
print(f"Test MCC: {test_mcc:.4f}")

# Save the model
model_path = '/content/drive/MyDrive/Dataset_DSAPP/gat_model.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")



Epoch 000: Train Loss: 1.9865, Val Loss: 0.9300, Val Accuracy: 0.8666
Epoch 010: Train Loss: 2.0882, Val Loss: 0.9983, Val Accuracy: 0.8567
Epoch 020: Train Loss: 2.0439, Val Loss: 0.9899, Val Accuracy: 0.8628
Epoch 030: Train Loss: 2.4946, Val Loss: 1.4367, Val Accuracy: 0.6870
Epoch 040: Train Loss: 5.5075, Val Loss: 5.3316, Val Accuracy: 0.2466
Epoch 050: Train Loss: 5.2636, Val Loss: 3.0389, Val Accuracy: 0.4789
Epoch 060: Train Loss: 2.8756, Val Loss: 1.3957, Val Accuracy: 0.7750
Epoch 070: Train Loss: 2.1538, Val Loss: 0.8971, Val Accuracy: 0.8756
Epoch 080: Train Loss: 1.8941, Val Loss: 0.6708, Val Accuracy: 0.8995
Epoch 090: Train Loss: 1.7793, Val Loss: 0.7546, Val Accuracy: 0.8785
Test Loss: 0.7280, Test Accuracy: 0.8689
Test F1 Score: 0.8278
Test Precision: 0.8349
Test Recall: 0.8689
Test MCC: 0.8454


  _warn_prf(average, modifier, msg_start, len(result))


Model saved to /content/drive/MyDrive/Dataset_DSAPP/gat_model.pth


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef
from torch_geometric.nn import GCNConv

class GCPModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim = 16):
        super(GCPModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)



In [20]:

def train(model, optimizer, criterion, data, mask):
    model.train()
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output[mask], data.y[mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(model, criterion, data, mask):
    model.eval()
    with torch.no_grad():
        output = model(data)
        loss = criterion(output[mask], data.y[mask])
        pred = output.argmax(dim=1)
        correct = pred[mask].eq(data.y[mask]).sum().item()
        accuracy = correct / mask.sum().item()
    return loss, accuracy, output[mask]


In [21]:
# Initializing the model, optimizer, and criterion
input_dim = shuffgraph.num_node_features
hidden_dim = 64
model = GCPModel(input_dim, hidden_dim, output_dim = 16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Training and validation loop
n_epochs = 100
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(n_epochs):
    train_loss = train(model, optimizer, criterion, shuffgraph, shuffgraph.train_mask)
    val_loss, val_accuracy, _ = evaluate(model, criterion, shuffgraph, shuffgraph.val_mask)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    if epoch % 10 == 0:
        print(f"Epoch {epoch:03d}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Evaluating on the test set
test_loss, test_accuracy, test_logits = evaluate(model, criterion, shuffgraph, shuffgraph.test_mask)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Calculating evaluation metrics
test_pred = test_logits.argmax(dim=1)

# Calculating F1 score
test_f1 = f1_score(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu(), average='weighted')
print(f"Test F1 Score: {test_f1:.4f}")

# Calculating Precision
test_precision = precision_score(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu(), average='weighted')
print(f"Test Precision: {test_precision:.4f}")

# Calculating Recall
test_recall = recall_score(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu(), average='weighted')
print(f"Test Recall: {test_recall:.4f}")

# Calculating Matthews Correlation Coefficient
test_mcc = matthews_corrcoef(shuffgraph.y[shuffgraph.test_mask].cpu(), test_pred.cpu())
print(f"Test MCC: {test_mcc:.4f}")

# Saveing the model
model_path = '/content/drive/MyDrive/Dataset_DSAPP/gcn_model.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


Epoch 000: Train Loss: 11.3003, Val Loss: 5.9652, Val Accuracy: 0.2062
Epoch 010: Train Loss: 2.4495, Val Loss: 2.2992, Val Accuracy: 0.2062
Epoch 020: Train Loss: 2.2508, Val Loss: 2.1538, Val Accuracy: 0.2391
Epoch 030: Train Loss: 2.1879, Val Loss: 2.1049, Val Accuracy: 0.2394
Epoch 040: Train Loss: 2.1434, Val Loss: 2.0602, Val Accuracy: 0.2066
Epoch 050: Train Loss: 2.0824, Val Loss: 1.9907, Val Accuracy: 0.3620
Epoch 060: Train Loss: 2.0030, Val Loss: 1.8823, Val Accuracy: 0.4861
Epoch 070: Train Loss: 1.9095, Val Loss: 1.7594, Val Accuracy: 0.4767
Epoch 080: Train Loss: 1.8628, Val Loss: 1.6569, Val Accuracy: 0.4296
Epoch 090: Train Loss: 1.7510, Val Loss: 1.4869, Val Accuracy: 0.5766
Test Loss: 1.5122, Test Accuracy: 0.6480
Test F1 Score: 0.5574
Test Precision: 0.5103
Test Recall: 0.6480
Test MCC: 0.5942
Model saved to /content/drive/MyDrive/Dataset_DSAPP/gcn_model.pth


  _warn_prf(average, modifier, msg_start, len(result))
