In [1]:
import pandas as pd
import os
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
DATA_FOLDER = 'data'
MODEL_FOLDER = 'models'
EXPERIMENT_FOLDER = 'experiment'

VALUES = ['ACHIEVEMENT', 'BENEVOLENCE', 'CONFORMITY', 'HEDONISM', 'POWER', 'SECURITY', 'SELF-DIRECTION', 'STIMULATION', 'TRADITION', 'UNIVERSALISM']

class ValueDataset(Dataset):
    
    def __init__(self, tokenizer, df): 
        self.scenarios = df['scenario'].values.tolist()
        self.N = df.shape[0]
        
        inp = tokenizer(self.scenarios, return_tensors='pt', padding=True, truncation=True)
        self.input_ids = inp.get('input_ids')
        self.attention_mask = inp.get('attention_mask')
        self.token_type_ids = inp.get('token_type_ids')
        self.target = df['label'].values.tolist()
    
    def __getitem__(self, index):
        return self.input_ids[index], self.attention_mask[index], self.token_type_ids[index], self.target[index]

    def __len__(self):
        return self.N

In [3]:
BATCH_SIZE = 10

# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create datasets
train_datasets = []    
test_datasets = []
dataset = pd.DataFrame()

for value in VALUES:
    FILE = value + '.csv'
    train_split = .8
    df = pd.read_csv(os.path.join(os.getcwd(), DATA_FOLDER, FILE))
    df["value"] = value
    dataset = pd.concat([dataset,df[["uid", "scenario", "label", "value"]]])

#dataset is a dataframe containing every scenario, their associated value and their label
dataset.describe(include="all")


Unnamed: 0,uid,scenario,label,value
count,21374.0,21374,21374.0,21374
unique,,17965,,10
top,,not backing up my mom,,BENEVOLENCE
freq,,5,,7667
mean,35837.167072,,-0.318471,
std,24540.696705,,0.76105,
min,18.0,,-1.0,
25%,14961.25,,-1.0,
50%,31239.5,,0.0,
75%,54489.75,,0.0,


In [4]:
#lets now prepare the data for training a GMM


#First, remove the 0 label 
dataset = dataset[dataset["label"] != 0]
dataset.describe(include="all")

Unnamed: 0,uid,scenario,label,value
count,14547.0,14547,14547.0,14547
unique,,12758,,10
top,,threatening my dad with theft,,BENEVOLENCE
freq,,5,,5413
mean,34742.433835,,-0.467932,
std,24646.517576,,0.883795,
min,19.0,,-1.0,
25%,13260.5,,-1.0,
50%,29230.0,,-1.0,
75%,53017.5,,1.0,


In [5]:
dataset.isnull().values.any()

False

In [6]:
y = dataset[["label", "value"]]
X = dataset["scenario"]

#BOW vectorizer
print(X)
count_vect = CountVectorizer()
X_vect = count_vect.fit_transform(X).toarray()
print(X_vect)

X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

0       Tomorrow I will audition my singing in a talen...
1       One day he remembered an important assignment ...
2       It was the longest distance I had ever run bef...
3       I am an extremely talented performer and write...
6       Myles wanted to impress his parents in his bas...
                              ...                        
1316    The problem with people who are in the busines...
1326    I justify my laziness thinking that somewhere ...
1327    I pretend that I believe in astrology to fit i...
1328    I used to make fun of the Mexican kids who cou...
1330    I’ve wanted breast implants for years, but I c...
Name: scenario, Length: 14547, dtype: object
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(11637, 9155) (2910, 9155) (11637, 2) (2910, 2)


In [7]:
GMM = GaussianMixture(n_components=20)

GMM.fit(X_train, y_train)

y_pred = GMM.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy}')
print(f'F1 score: {f1}')

print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

TypeError: '<' not supported between instances of 'str' and 'int'

### Save GMM

In [79]:
MODEL_PATH = os.path.join(os.getcwd(), MODEL_FOLDER, 'GMM')

GMM_NAME = 'new_gmm'

np.save(os.path.join(MODEL_PATH, GMM_NAME + '_weights'), GMM.weights_, allow_pickle=False)
np.save(os.path.join(MODEL_PATH, GMM_NAME + '_means'), GMM.means_, allow_pickle=False)
np.save(os.path.join(MODEL_PATH, GMM_NAME + '_covariances'), GMM.covariances_, allow_pickle=False)

OSError: 1676280500 requested and 985116672 written

### Load GMM

In [None]:
means = np.load(GMM_NAME + '_means.npy')
covar = np.load(GMM_NAME + '_covariances.npy')
LOADED_GMM = GaussianMixture(n_components = len(means), covariance_type='full')
LOADED_GMM.precisions_cholesky_ = np.linalg.cholesky(np.linalg.inv(covar))
LOADED_GMM.weights_ = np.load(GMM_NAME + '_weights.npy')
LOADED_GMM.means_ = means
LOADED_GMM.covariances_ = covar

### For Pierre
- The file sizes are too big for me to save them. Will have to train from scratch if we want to test again. Takes about 45 min
- y_test is in the wrong format to compare to y_pred. y_pred assigns to cluster as seen below
- I have assumed a way to go from value to cluster but I have no idea if it is correct or not

In [80]:
y_pred

array([ 7,  9, 10, ..., 11,  4, 10])

In [81]:
y_test

Unnamed: 0,label,value
1867,-1,BENEVOLENCE
1141,-1,TRADITION
683,1,POWER
634,-1,SECURITY
3113,1,SECURITY
...,...,...
1339,-1,HEDONISM
138,-1,SELF-DIRECTION
709,1,UNIVERSALISM
185,-1,CONFORMITY


In [82]:
y_test_modified = y_test.copy()

# Append not to value column if label column in row is -1
y_test_modified['new_value'] = np.where(y_test_modified['label'] == -1, 'NOT_' + y_test_modified['value'], y_test_modified['value'])

In [83]:
y_test_modified

Unnamed: 0,label,value,new_value
1867,-1,BENEVOLENCE,NOT_BENEVOLENCE
1141,-1,TRADITION,NOT_TRADITION
683,1,POWER,POWER
634,-1,SECURITY,NOT_SECURITY
3113,1,SECURITY,SECURITY
...,...,...,...
1339,-1,HEDONISM,NOT_HEDONISM
138,-1,SELF-DIRECTION,NOT_SELF-DIRECTION
709,1,UNIVERSALISM,UNIVERSALISM
185,-1,CONFORMITY,NOT_CONFORMITY


In [84]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, f1_score, classification_report

y_test_new = y_test.copy()
all_values = []
# Without knowing the way that the clustering is done there are multiple ways to order our results.
order = False

for value in y['value'].unique():
    if order:
        all_values.append(value)
        all_values.append('NOT_' + value)
    else:
        all_values.append('NOT_' + value)
        all_values.append(value)


for i, value in enumerate(all_values):
    y_test_modified.loc[y_test_modified['new_value'] == value, 'classification'] = i

# Evaluate the classifier
accuracy = accuracy_score(y_test_modified['classification'], y_pred)
precision = precision_score(y_test_modified['classification'], y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test_modified['classification'], y_pred)
f1 = f1_score(y_test_modified['classification'], y_pred, average='weighted')
class_report = classification_report(y_test_modified['classification'], y_pred)

# Print the results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'F1 score: {f1}')

print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.07491408934707904
Precision: 0.12095073243094732
F1 score: 0.052355273780227474
Confusion Matrix:
[[  0   0   0   0   2   0   0  10   0   3  17  34   0   0   2   0   0   0
    3   0]
 [  0   1   2   2   3   0   0   5   0   2   9  15   0   0   1   0   0   0
    3   2]
 [  0   2  11  42  99   2   1  72   1  35 316 209   2   0  49   0   3   0
   35  22]
 [  0   0   2  12  16   0   1  16   0   2  60  50   0   0  11   0   0   0
    9   6]
 [  0   1   2  13  23   0   9  16   0  10  90  58   0   0  15   0   1   0
    5  13]
 [  0   0   0   0   1   0   1   0   2   0   3   6   0   0   1   0   0   0
    2   1]
 [  0   1   0   4   2   0   0  17   3   3  27  35   0   0   2   0   0   0
    2   0]
 [  0   0   3   7   7   2   0  15   2   2  33  69   1   0   5   0   0   0
   11   7]
 [  0   0   1   1   1   2   0  11   0   2   9  20   0   0   1   0   0   0
    3   1]
 [  0   0   1   2  12   1   1   5   0   1  29  22   1   0   4   0   1   0
    4   6]
 [  0   0   2  10  35   4   1  25   3  1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
