# Author Profiling (Big 5)

We begin by importing required libraries and installing NLP modules.

In [30]:
import nltk
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [2]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/aarush/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Data Preprocessing

Load the labelled essays and emotion lexicon datasets as Pandas dataframes.

In [3]:
essays = pd.read_csv("data/essays.csv").iloc[:, 1:]
lexicon = pd.read_csv("data/lexicon.csv")

print(essays.head())
print(lexicon.head())

                                                TEXT cEXT cNEU cAGR cCON cOPN
0  Well, right now I just woke up from a mid-day ...    n    y    y    n    y
1  Well, here we go with the stream of consciousn...    n    n    y    n    n
2  An open keyboard and buttons to push. The thin...    n    y    n    y    y
3  I can't believe it!  It's really happening!  M...    y    n    y    y    n
4  Well, here I go with the good old stream of co...    y    n    y    n    y
    Words  anger  anticipation  disgust  fear  joy  negative  positive  \
0   march      0             0        0     0    0         0         1   
1  august      0             0        0     0    0         0         1   
2     ago      0             0        0     0    0         0         0   
3     mar      0             0        0     0    0         1         0   
4     vie      0             0        0     0    0         0         0   

   sadness  surprise  trust  Charged  
0        0         0      0        1  
1        

Construct a set of all the words in the emotion lexicon.

In [1201]:
emotion_words = set(lexicon["Words"])
print(list(emotion_words)[:50])

['forsaken', 'frenetic', 'follow', 'routine', 'boob', 'monologue', 'delicious', 'dismay', 'depositary', 'splendid', 'sentry', 'tuition', 'vinaigrette', 'poetical', 'hen', 'payment', 'huff', 'rope', 'sacrifices', 'gig', 'canary', 'disprove', 'haven', 'mitigation', 'foray', 'pious', 'back', 'alphabetical', 'letters', 'vocation', 'mixed', 'mosaic', 'buttress', 'chaise', 'prescriptive', 'upstart', 'lanky', 'fright', 'prescription', 'rollicking', 'ridiculous', 'accordance', 'gush', 'irrational', 'existent', 'method', 'legislative', 'fact', 'flanking', 'jog']


### Filter Sentences

For each entry in the dataset, we will tokenize the text by sentence and drop any sentences that do not contain a word from the emotion lexicon - we deem these sentences to be too mild for drawing inferences from. 

In [524]:
def filter_sentences(text, emotion_words):
	sentences = sent_tokenize(text)
	filtered_sentences = [sentence for sentence in sentences if any(word in emotion_words for word in word_tokenize(sentence))]
	return ' '.join(filtered_sentences)

In [6]:
essays["TEXT"] = essays["TEXT"].apply(lambda x: filter_sentences(x, emotion_words))
essays = essays[essays["TEXT"].str.strip() != ""]

print(essays.head())

                                                TEXT cEXT cNEU cAGR cCON cOPN
0  Well, right now I just woke up from a mid-day ...    n    y    y    n    y
1  Well, here we go with the stream of consciousn...    n    n    y    n    n
2  An open keyboard and buttons to push. The thin...    n    y    n    y    y
3  It's really happening! My pulse is racing like...    y    n    y    y    n
4  Well, here I go with the good old stream of co...    y    n    y    n    y


## Feature Extraction


### TF-IDF Vectorization

Perform Term Frequency-Inverse Document Frequency vectorization on each text.

In [7]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_features = tfidf_vectorizer.fit_transform(essays["TEXT"]).toarray()

print(tfidf_features)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Sentiment Vectors

Convert the emotion lexicon to a dictionary.

In [8]:
lexicon_dict = {
    row["Words"]: np.array(row[1:])  # Assuming emotions are in columns 1 onward
    for _, row in lexicon.iterrows()
}

print(lexicon_dict["march"])

[0 0 0 0 0 0 1 0 0 0 1]


Next, create a function to compute the cumulative sentiment vector of the text, and apply this function to the corpus to generate sentiment vector features.

In [None]:
def compute_sentiment_vector(text, lexicon_dict):
	tokens = text.split()
	
	sentiment_vector = np.zeros(len(lexicon_dict["march"]))
	
	for token in tokens:
		if token in lexicon_dict:
			sentiment_vector += lexicon_dict[token].astype(float)

	if np.sum(sentiment_vector) > 0: sentiment_vector /= np.sum(sentiment_vector)
	
	return sentiment_vector

In [336]:
sentiment_vectors = np.array([compute_sentiment_vector(text, lexicon_dict) for text in essays["TEXT"]])
print(sentiment_vectors)

[[ 3. 18.  0. ...  5. 12. 40.]
 [ 4. 20.  1. ... 10. 15. 53.]
 [ 6. 12.  5. ...  2. 11. 52.]
 ...
 [ 6. 10.  5. ...  5.  7. 38.]
 [ 9. 28.  4. ...  9. 23. 68.]
 [13. 31.  9. ... 22. 31. 72.]]


### GloVe Word Embeddings

Function to load embeddings into a dictionary.

In [11]:
def load_glove_embeddings(filepath):
	embeddings = {}
	
	with open(filepath, 'r', encoding='utf-8') as f:
		for line in f:
			values = line.split()
			word = values[0]
			vector = np.array(values[1:], dtype="float32")
			embeddings[word] = vector
	
	return embeddings

Load embeddings from the 6B token 200 vector dimension model.

In [158]:
glove_embeddings = load_glove_embeddings("models/glove.6B.200d.txt")
print(list(glove_embeddings.keys())[:100])
print(glove_embeddings["the"])

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for', '-', 'that', 'on', 'is', 'was', 'said', 'with', 'he', 'as', 'it', 'by', 'at', '(', ')', 'from', 'his', "''", '``', 'an', 'be', 'has', 'are', 'have', 'but', 'were', 'not', 'this', 'who', 'they', 'had', 'i', 'which', 'will', 'their', ':', 'or', 'its', 'one', 'after', 'new', 'been', 'also', 'we', 'would', 'two', 'more', "'", 'first', 'about', 'up', 'when', 'year', 'there', 'all', '--', 'out', 'she', 'other', 'people', "n't", 'her', 'percent', 'than', 'over', 'into', 'last', 'some', 'government', 'time', '$', 'you', 'years', 'if', 'no', 'world', 'can', 'three', 'do', ';', 'president', 'only', 'state', 'million', 'could', 'us', 'most', '_', 'against', 'u.s.']
[-7.1549e-02  9.3459e-02  2.3738e-02 -9.0339e-02  5.6123e-02  3.2547e-01
 -3.9796e-01 -9.2139e-02  6.1181e-02 -1.8950e-01  1.3061e-01  1.4349e-01
  1.1479e-02  3.8158e-01  5.4030e-01 -1.4088e-01  2.4315e-01  2.3036e-01
 -5.5339e-01  4.8154e-02  4.5662e-01  3.2338e+00  2.

Build a feature set of glove embeddings for each text by taking the mean of the vector corresponding to each word in the text.

In [525]:
def build_glove_embedding(text, embeddings):
	words = word_tokenize(text)
	
	word_vectors = [embeddings[word] for word in words if word in embeddings]
	
	if word_vectors: return np.mean(word_vectors, axis=0)
	else: return np.zeros(50)  # match glove dimension

In [526]:
glove_embeddings = np.array([build_glove_embedding(text, glove_embeddings) for text in essays["TEXT"]])
print(glove_embeddings)

[[ 0.17561968  0.21697745 -0.09997822 ...  0.17866525 -0.13878608
   0.08554857]
 [ 0.19505993  0.24416132 -0.11080451 ...  0.15697885 -0.10536592
   0.07854819]
 [ 0.15996172  0.21897827 -0.05512039 ...  0.16525042 -0.14665474
   0.08982378]
 ...
 [ 0.14254373  0.23699257 -0.10872305 ...  0.15260881 -0.12059193
   0.06254157]
 [ 0.20047392  0.22909714 -0.09466812 ...  0.147249   -0.13642329
   0.10019936]
 [ 0.23306464  0.22932526 -0.08814638 ...  0.18132985 -0.1229706
   0.09198   ]]


### BERT Word Embeddings

Load the BERT models.

In [16]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

2024-12-03 13:07:50.312874: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of 

Process the corpus in contextual batches of 32 and generate BERT word embeddings.

In [261]:
def get_bert_embeddings_batched(texts, batch_size=32):
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True, max_length=128)
        outputs = bert_model(inputs["input_ids"])
        batch_embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy()
        embeddings.append(batch_embeddings)
    
    return np.vstack(embeddings)

In [262]:
bert_embeddings = get_bert_embeddings_batched(list(essays["TEXT"]))
print(bert_embeddings)

[[-0.00908303  0.01011803  0.18886453 ... -0.04336559  0.18032753
   0.07847334]
 [ 0.03351337  0.13423398  0.15558639 ...  0.18691523 -0.03201956
   0.09959432]
 [ 0.0451029   0.27323285  0.4640565  ... -0.02578287  0.2341059
   0.08627795]
 ...
 [-0.13311921 -0.2170437   0.18040407 ... -0.00803733  0.22940615
  -0.14817858]
 [ 0.25195867 -0.02217607  0.19356966 ... -0.09765543  0.07215264
  -0.15513891]
 [ 0.07322188  0.07545072  0.20285086 ...  0.08045725  0.11770028
   0.15996607]]


### Final Feature Vector

Combine features into a final feature vector, and separate targets.

In [1189]:
features = np.hstack([tfidf_features, sentiment_vectors, glove_embeddings])
targets = (essays[["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]].values == "y").astype(int)

print(features)
print(targets)

[[ 0.          0.          0.         ...  0.17866525 -0.13878608
   0.08554857]
 [ 0.          0.          0.         ...  0.15697885 -0.10536592
   0.07854819]
 [ 0.          0.          0.         ...  0.16525042 -0.14665474
   0.08982378]
 ...
 [ 0.          0.          0.         ...  0.15260881 -0.12059193
   0.06254157]
 [ 0.          0.          0.         ...  0.147249   -0.13642329
   0.10019936]
 [ 0.          0.          0.         ...  0.18132985 -0.1229706
   0.09198   ]]
[[0 1 1 0 1]
 [0 0 1 0 0]
 [0 1 0 1 1]
 ...
 [0 0 1 0 0]
 [0 1 0 0 1]
 [0 1 1 0 1]]


## Principal Component Analysis

Perform PCA to bring down the (very high) dimension of the current feature vector.

In [1198]:
if features.shape[1] > 256:
	pca = PCA(n_components=256)
	features = pca.fit_transform(features)

## Model Training

### Test Train Split

Create a 20-80 test-train split.

In [1199]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

In [1192]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Building the Neural Network

Build a fully-connected feed-forward neural network with 5 hidden layers, and L2 regularization with strength 0.001. Define a neuron dropout fraction of 0.4 per layer. 

In [1193]:
model = tf.keras.Sequential([
	tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
	# tf.keras.layers.Dense(512, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001)),
	# tf.keras.layers.BatchNormalization(),
	# tf.keras.layers.Dropout(0.4),
	tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001)),
	tf.keras.layers.BatchNormalization(),
	tf.keras.layers.Dropout(0.4),
	tf.keras.layers.Dense(128, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001)),
	tf.keras.layers.BatchNormalization(),
	tf.keras.layers.Dropout(0.4),
	tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001)),
	tf.keras.layers.BatchNormalization(),
	tf.keras.layers.Dropout(0.4),
	tf.keras.layers.Dense(32, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001)),
	tf.keras.layers.BatchNormalization(),
	tf.keras.layers.Dropout(0.4),
	tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001)),
	tf.keras.layers.BatchNormalization(),
	tf.keras.layers.Dropout(0.4),
	tf.keras.layers.Dense(y_train.shape[1], activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

### Evaluation

Define some functions to evaluate the model's predictions.

In [1195]:
def hamming_loss(y_true, y_pred):
    total_labels = y_true.size
    incorrect_labels = (y_true != y_pred).sum()
    return incorrect_labels / total_labels

def exact_match_accuracy(y_true, y_pred):
    matches = (y_true == y_pred).all(axis=1).sum()
    total_samples = y_true.shape[0]
    return matches / total_samples

def label_wise_accuracy(y_true, y_pred):
    per_label_accuracy = (y_true == y_pred).mean(axis=0)
    return per_label_accuracy

### Training

Define an early stopping callback, and train the model ten times, evaluate exact match accuracy for each iteration.

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

In [1196]:
for i in range(10):
	model.fit(
		X_train, y_train, 
		epochs=50, 
		validation_split=0.2, 
		callbacks=[early_stopping],
		verbose=0
	)

	y_pred = (model.predict(X_test) > 0.5).astype(int)
	
	print(exact_match_accuracy(y_test, y_pred))

0.042682926829268296
0.04878048780487805
0.06707317073170732
0.06504065040650407
0.07113821138211382
0.07723577235772358
0.07113821138211382
0.07520325203252033
0.07520325203252033
0.09146341463414634


Evaluate the last trained model in detail.

In [1197]:
print("Hamming Loss:", hamming_loss(y_test, y_pred))
print("Exact Match Accuracy:", exact_match_accuracy(y_test, y_pred))
print("Label-Wise Accuracy:", label_wise_accuracy(y_test, y_pred))

Hamming Loss: 0.44715447154471544
Exact Match Accuracy: 0.09146341463414634
Label-Wise Accuracy: [0.58130081 0.53861789 0.51422764 0.54471545 0.58536585]
