In [1]:
import pandas as pd

# Load the CSV data
data = pd.read_csv('../data/500_Reddit_users_posts_labels.csv')

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate post lists into a single string for each user
data['Post'] = data['Post'].apply(lambda x: ' '.join(eval(x)))

# Check if the 'Post' column is empty after preprocessing
if data['Post'].str.strip().eq('').any():
    raise ValueError("Some posts are empty after preprocessing.")

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the posts
try:
    tfidf_features = tfidf.fit_transform(data['Post'])
except ValueError as e:
    print(f"Error: {e}")
    print("Check the preprocessing steps and ensure the documents contain valid words.")

SyntaxError: unterminated string literal (detected at line 1) (<string>, line 1)

In [3]:
from openai import OpenAI
import openai
from dotenv import load_dotenv
import os

print("Loaded env file: ", load_dotenv("../.env", override=True))

openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI()

# Function to get embeddings for each post
def get_embeddings(post):
    response = client.embeddings.create(
        input=post,
        model="text-embedding-3-large",
    )
    return response.data[0].embedding

# Apply the embedding function to each concatenated post
data['embeddings'] = data['Post'].apply(lambda x: get_embeddings(x))

# Convert the embeddings to a matrix form (numpy array)
import numpy as np
embeddings_matrix = np.array(data['embeddings'].tolist())

Loaded env file:  True


In [4]:
print("Embeddings matrix shape:", embeddings_matrix.shape)

Embeddings matrix shape: (528, 3072)


In [5]:
from sklearn.utils import compute_class_weight
from sklearn.preprocessing import LabelEncoder

# Encode labels to numeric form
le = LabelEncoder()
y_encoded = le.fit_transform(data['Label'])

# Print unique values in y_encoded to ensure correct mapping
print("Encoded labels:", np.unique(y_encoded))

# Recompute class weights based on the encoded labels
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)

# Convert to dictionary
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Class weights dictionary:", class_weights_dict)


Encoded labels: [0 1 2 3 4 5]
Class weights dictionary: {0: 2.0952380952380953, 1: 1.2054794520547945, 2: 0.5333333333333333, 3: 0.946236559139785, 4: 0.8888888888888888, 5: 1.5714285714285714}


In [6]:
# Check for NaN or infinite values in embeddings matrix
print("NaN values in embeddings:", np.isnan(embeddings_matrix).sum())
print("Infinite values in embeddings:", np.isinf(embeddings_matrix).sum())

# Ensure no rows are empty
assert embeddings_matrix.shape[0] == len(data), "Mismatch in embeddings and data size."

NaN values in embeddings: 0
Infinite values in embeddings: 0


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2


# Encode labels to numeric form
le = LabelEncoder()
y_encoded = le.fit_transform(data['Label'])

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(embeddings_matrix, y_encoded, test_size=0.2, random_state=42)


y_train = np.array(y_train)
y_val = np.array(y_val)

# Define the input shape based on the shape of the embeddings (dimensionality of embedding)
input_shape = X_train.shape[1:]


# Model definition with reduced complexity and regularization
model = Sequential([
    Input(shape=input_shape),
    Dense(8096, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(8096, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(8096, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(8096, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(2048, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(6, activation='softmax')  # Assuming 6 output classes
])

# Compile the model with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with validation, class weights, and early stopping
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=32, class_weight=class_weights_dict, callbacks=[early_stopping])

Epoch 1/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 4s/step - accuracy: 0.2240 - loss: 207.1782 - val_accuracy: 0.0660 - val_loss: 19.9893
Epoch 2/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 3s/step - accuracy: 0.1056 - loss: 13.2783 - val_accuracy: 0.0943 - val_loss: 8.4764
Epoch 3/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 3s/step - accuracy: 0.1128 - loss: 7.4065 - val_accuracy: 0.0660 - val_loss: 3.4570
Epoch 4/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3s/step - accuracy: 0.0875 - loss: 2.9042 - val_accuracy: 0.1321 - val_loss: 2.1699
Epoch 5/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 3s/step - accuracy: 0.1538 - loss: 2.1143 - val_accuracy: 0.0943 - val_loss: 1.8890
Epoch 6/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 3s/step - accuracy: 0.1138 - loss: 1.8976 - val_accuracy: 0.0660 - val_loss: 1.8188
Epoch 7/30
[1m14/14[0m [32m━━━━━━

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000002A089B0E020>
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\weakref.py", line 370, in remove
    self = selfref()
           ^^^^^^^^^
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
# Save the model
model.save('../models/nn_model.h5')

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on the validation set
y_pred_val = model.predict(X_val)
y_pred_classes = np.argmax(y_pred_val, axis=1)

# Classification report
print(classification_report(y_val, y_pred_classes))

# Confusion matrix
print(confusion_matrix(y_val, y_pred_classes))


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.44      0.57      0.50        14
           2       0.46      0.38      0.42        34
           3       0.18      0.10      0.13        20
           4       0.46      0.76      0.57        21
           5       0.83      1.00      0.91        10

    accuracy                           0.46       106
   macro avg       0.40      0.47      0.42       106
weighted avg       0.41      0.46      0.42       106

[[ 0  2  5  0  0  0]
 [ 0  8  3  0  3  0]
 [ 1  8 13  5  6  1]
 [ 1  0  7  2 10  0]
 [ 0  0  0  4 16  1]
 [ 0  0  0  0  0 10]]
