In [1]:
import pandas as pd
import re

def custom_csv_reader(file_path, encoding='ISO-8859-1'):
    # Initialize lists to hold each column's data
    types = []
    posts = []
    
    # Open the file and read line by line
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            # Split the line at the first comma
            split_line = line.strip().split(',', 1)
            if len(split_line) == 2:
                mbti_type, post_data = split_line
                # Split posts by "|||"
                split_posts = post_data.split('###')
                for post in split_posts:
                    if post.strip() and not re.match(r'^\s*http\S+\s*$', post.strip()):  # Skip empty or pure-link posts
                        types.append(mbti_type)
                        posts.append(post.strip())
            else:
                print("Skipped a line due to unexpected format: ", line)
    
    df = pd.DataFrame({
        'type': types,
        'posts': posts
    })
    
    return df

df = custom_csv_reader('mbti_1.csv')


df = df[df['type'].str.match(r'^[EI][SN][TF][JP]$')]

Skipped a line due to unexpected format:   Naomi Chung's Daydream Art ï¿½ 25 April 2013 A morning glory flower 10AM###68611http://farm9.staticflickr.com/8402/8677841950_e3793c825c.jpg Flickr 

Skipped a line due to unexpected format:   Naomi Chung's Daydream Art ï¿½ 24 april 2013 the morning glory flowers"

Skipped a line due to unexpected format:  /-ï¿½ï¿½? 

Skipped a line due to unexpected format:   My eyes seem to do the same thing too. Different clothes and hair colour always change up how my eye colour looks.   Sent from my SM-G920W8 using Tapatalk###Thankyou!! This photo is filtered so my eye colour looks tweaked.. but they often change blue/grey/green =

Skipped a line due to unexpected format:  =

Skipped a line due to unexpected format:  ia ju lijepo. Uz to se i lijepo napijea i pomiria sa 



In [2]:
df

Unnamed: 0,type,posts
1,INFJ,"""'http://www.youtube.com/watch?v=qsXHcwe3krw"
2,INFJ,enfp and intj moments https://www.youtube.com...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...
5,INFJ,May the PerC Experience immerse you.
...,...,...
412173,INFP,I was going to close my facebook a few months ...
412174,INFP,30 Seconds to Mars - All of my collections. It...
412175,INFP,"I have seen it, and i agree. I did actually th..."
412176,INFP,Ok so i have just watched Underworld 4 (Awaken...


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['posts'])

sequences = tokenizer.texts_to_sequences(df['posts'])
data = pad_sequences(sequences, maxlen=128)



In [4]:
from sklearn.preprocessing import LabelEncoder
LabelEncoder = LabelEncoder()
labels = df['type'].values
labels = LabelEncoder.fit_transform(labels)

print(labels.shape)
labels

(411479,)


array([8, 8, 8, ..., 9, 9, 9])

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

class MBTIDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

class LSTMModel(nn.Module):
    def __init__(self, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=5000, embedding_dim=64)
        self.lstm = nn.LSTM(input_size=64, hidden_size=64, num_layers=1, batch_first=True)
        self.fc = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])
        return x


In [6]:
labels
labels.shape

(411479,)

In [10]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from tqdm import tqdm
import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
num_classes = 16
num_epochs = 3
batch_size = 10

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
metrics_summary = {
    'accuracy': [],
    'recall': [],
    'precision': [],
    'f1_score': [],
    'conf_matrix': []
}

# Store metrics
metrics = {
    'accuracy': [],
    'loss': []
}

for fold, (train_index, test_index) in enumerate(kf.split(data)):
    print(f"Fold {fold+1}")            
    X_train, X_test = None, None
    y_train, y_test = None, None
    # Split the data
    X_train, X_test = data[train_index], data[test_index]
    y_train = labels[train_index]
    y_test = labels[test_index]
    # Create datasets
    train_dataset = MBTIDataset(X_train, y_train)
    test_dataset = MBTIDataset(X_test, y_test)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model
    model = LSTMModel(num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

# Train the model
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=True)
        for inputs, batch_labels in progress_bar:
            #inputs, batch_labels = inputs.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

    average_epoch_loss = epoch_loss / len(train_loader)
    print(f'labels shape at the end of training: {labels.shape}')
    print(f'End of Epoch {epoch + 1}, Average Loss: {average_epoch_loss:.4f}')

    # Evaluate the model
    model.eval()
    y_pred = []
    y_true = []
    labels2 = torch.tensor(labels, dtype=torch.long)
    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc='Evaluation', leave=True)
        for inputs, labels2 in progress_bar:
            #inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            y_pred.extend(predicted.tolist())
            y_true.extend(labels2.tolist())

            #y_pred.extend(predicted.cpu().numpy())
            #y_true.extend(labels.cpu().numpy())
    
    # Calculate metrics
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred, average='macro')
    prec = precision_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    conf_mat = confusion_matrix(y_true, y_pred)

    metrics_summary['accuracy'].append(acc)
    metrics_summary['recall'].append(rec)
    metrics_summary['precision'].append(prec)
    metrics_summary['f1_score'].append(f1)
    metrics_summary['conf_matrix'].append(conf_mat)

    print(f"Fold {fold+1} - Loss: {loss.item()}, Accuracy: {acc}, Recall: {rec}, Precision: {prec}, F1 Score: {f1}")



Fold 1


Epoch 1/3: 100%|██████████| 32919/32919 [09:11<00:00, 59.74it/s, loss=1.51]
Epoch 2/3: 100%|██████████| 32919/32919 [09:30<00:00, 57.75it/s, loss=1.9] 
Epoch 3/3: 100%|██████████| 32919/32919 [09:50<00:00, 55.79it/s, loss=2.08] 


labels shape at the end of training: (411479,)
End of Epoch 3, Average Loss: 2.1149


Evaluation: 100%|██████████| 8230/8230 [00:30<00:00, 266.72it/s]


Fold 1 - Loss: 2.0753893852233887, Accuracy: 0.27987994556236023, Recall: 0.1356334668547586, Precision: 0.33114335641553005, F1 Score: 0.1485429401488319
Fold 2


Epoch 1/3: 100%|██████████| 32919/32919 [09:33<00:00, 57.38it/s, loss=2.94]
Epoch 2/3: 100%|██████████| 32919/32919 [09:22<00:00, 58.51it/s, loss=3.3] 
Epoch 3/3: 100%|██████████| 32919/32919 [09:12<00:00, 59.61it/s, loss=2.11]


labels shape at the end of training: (411479,)
End of Epoch 3, Average Loss: 2.1113


Evaluation: 100%|██████████| 8230/8230 [00:27<00:00, 294.65it/s]


Fold 2 - Loss: 2.106515884399414, Accuracy: 0.279466802760766, Recall: 0.133155581114039, Precision: 0.3330849351254649, F1 Score: 0.1461915081163473
Fold 3


Epoch 1/3: 100%|██████████| 32919/32919 [09:21<00:00, 58.59it/s, loss=3.59]
Epoch 2/3: 100%|██████████| 32919/32919 [09:30<00:00, 57.66it/s, loss=1.52]
Epoch 3/3: 100%|██████████| 32919/32919 [09:17<00:00, 59.08it/s, loss=1.91] 


labels shape at the end of training: (411479,)
End of Epoch 3, Average Loss: 2.1121


Evaluation: 100%|██████████| 8230/8230 [00:29<00:00, 275.02it/s]


Fold 3 - Loss: 1.9079135656356812, Accuracy: 0.2782152230971129, Recall: 0.13917124585354212, Precision: 0.3618244250121998, F1 Score: 0.15643857436147487
Fold 4


Epoch 1/3: 100%|██████████| 32919/32919 [09:33<00:00, 57.38it/s, loss=2.38]
Epoch 2/3: 100%|██████████| 32919/32919 [09:37<00:00, 57.01it/s, loss=2.9] 
Epoch 3/3: 100%|██████████| 32919/32919 [09:14<00:00, 59.39it/s, loss=1.87]


labels shape at the end of training: (411479,)
End of Epoch 3, Average Loss: 2.1130


Evaluation: 100%|██████████| 8230/8230 [00:28<00:00, 290.29it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 4 - Loss: 1.8683291673660278, Accuracy: 0.27934529017206183, Recall: 0.1382239648357379, Precision: 0.35967254815950456, F1 Score: 0.1561806273532424
Fold 5


Epoch 1/3: 100%|██████████| 32919/32919 [09:19<00:00, 58.81it/s, loss=1.99]
Epoch 2/3: 100%|██████████| 32919/32919 [09:20<00:00, 58.72it/s, loss=2.5]  
Epoch 3/3: 100%|██████████| 32919/32919 [09:19<00:00, 58.80it/s, loss=1.95]


labels shape at the end of training: (411479,)
End of Epoch 3, Average Loss: 2.1140


Evaluation: 100%|██████████| 8230/8230 [00:28<00:00, 291.56it/s]


Fold 5 - Loss: 1.9482883214950562, Accuracy: 0.27919071632541465, Recall: 0.1366116787131807, Precision: 0.3411289480754841, F1 Score: 0.14995147846772655


In [11]:
print("Average Metrics Across Folds:")
for key in metrics_summary:
    if key == 'conf_matrix':
        print(f"Average {key}:")
        print(np.mean(np.array(metrics_summary[key]), axis=0))
    else:
        print(f"Average {key}: {np.mean(metrics_summary[key])}")


Average Metrics Across Folds:
Average accuracy: 0.27921959558354315
Average recall: 0.13655918747425166
Average precision: 0.34537084255763667
Average f1_score: 0.15146102568952458
Average conf_matrix:
[[1.38200e+02 4.02000e+01 7.00000e+00 1.34000e+01 2.60000e+00 2.00000e-01
  2.00000e-01 1.20000e+00 1.81000e+02 1.07540e+03 8.22000e+01 2.50400e+02
  3.00000e+00 3.80000e+00 7.20000e+00 8.80000e+00]
 [2.56000e+01 7.16200e+02 2.22000e+01 1.25200e+02 6.40000e+00 1.00000e+00
  1.40000e+00 8.00000e+00 4.87200e+02 3.68300e+03 3.39000e+02 9.18800e+02
  1.14000e+01 1.98000e+01 3.18000e+01 3.40000e+01]
 [1.08000e+01 3.02000e+01 1.48600e+02 5.48000e+01 2.40000e+00 2.00000e-01
  1.00000e+00 5.00000e+00 1.58800e+02 9.75200e+02 2.17800e+02 5.64600e+02
  5.20000e+00 1.02000e+01 7.00000e+00 1.26000e+01]
 [1.24000e+01 1.14600e+02 3.18000e+01 6.77200e+02 4.20000e+00 8.00000e-01
  2.00000e+00 1.54000e+01 4.62400e+02 3.02240e+03 4.90400e+02 1.69760e+03
  2.06000e+01 2.58000e+01 2.40000e+01 2.56000e+01]
 [

In [12]:
print(f'data shape: {data.shape}')
print(f'labels shape: {labels.shape}')
print(type(labels))
print(f'predicted shape: {predicted.shape}')
print(f'batch_labels shape: {batch_labels.shape}')
print(f'outputs shape: {outputs.shape}')

data shape: (411479, 128)
labels shape: (411479,)
<class 'numpy.ndarray'>
predicted shape: torch.Size([5])
batch_labels shape: torch.Size([4])
outputs shape: torch.Size([5, 16])


In [13]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("y_true shape:", len(y_true))
print("y_pred shape:", len(y_pred))
print("Data type of X_train:", X_train.dtype)
print("Data type of y_train:", y_train.dtype)

# Assuming use of nn.CrossEntropyLoss
if isinstance(y_train[0], torch.Tensor):
    print("Label tensor type:", y_train[0].dtype)
else:
    print("Label type (should be tensor):", type(y_train[0]))
print(f'Max train_index: {train_index.max()}, Max test_index: {test_index.max()}, labels length: {len(labels)}')
print(f'train_index type: {type(train_index)}, test_index type: {type(test_index)}, labels type: {type(labels)}')
print(f'data shape: {data.shape}')
print(f'labels shape: {labels.shape}')

X_train shape: (329184, 128)
y_train shape: (329184,)
X_test shape: (82295, 128)
y_test shape: (82295,)
y_true shape: 82295
y_pred shape: 82295
Data type of X_train: int32
Data type of y_train: int64
Label type (should be tensor): <class 'numpy.int64'>
Max train_index: 411478, Max test_index: 411473, labels length: 411479
train_index type: <class 'numpy.ndarray'>, test_index type: <class 'numpy.ndarray'>, labels type: <class 'numpy.ndarray'>
data shape: (411479, 128)
labels shape: (411479,)
