In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset path used below lives
# under /content/drive/MyDrive/THESIS.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import pandas as pd
import re

# Load Twitter data.
# The labels file is a JSON object keyed by user id; each value is a dict of
# per-user attributes that is flattened into one row per user.
with open('/content/drive/MyDrive/THESIS/twisty-labels.json', 'r', encoding='utf-8') as file:
    labels_data = json.load(file)
labels_list = [{'user_id': key, **value} for key, value in labels_data.items()]
labels_df = pd.DataFrame(labels_list)

# The messages file is JSON Lines (one record per line).
tweets_df = pd.read_json('/content/drive/MyDrive/THESIS/twisty-msg.jsonl', lines=True)

# Cast both join keys to str so the merge compares values of the same type.
tweets_df['user_id'] = tweets_df['user_id'].astype(str)
labels_df['user_id'] = labels_df['user_id'].astype(str)

merged_df = pd.merge(tweets_df, labels_df, on='user_id', how='inner')

# Drop tweets whose text starts with "RT" (presumably retweets).
# .copy() makes the filtered frame independent of the merge result, so the
# column assignment below does not trigger SettingWithCopyWarning.
merged_df = merged_df[~merged_df['tweet_text'].str.startswith("RT")].copy()
merged_df['clean_tweet'] = merged_df['tweet_text'].apply(lambda x: re.sub(r'@\w+|http\S+', '', x).lower()) # removing URLs and user mentions

# Sample 10% of the *users* (not of the tweets) so that every tweet of a
# selected user is kept together.
unique_users = merged_df['user_id'].unique()
sample_users = pd.Series(unique_users).sample(frac=0.1, random_state=42)

# .copy() is required here: later cells add label columns to sample_df, and
# doing that on a boolean-mask slice raises SettingWithCopyWarning.
sample_df = merged_df[merged_df['user_id'].isin(sample_users)].copy()

# Function to clean text data
def clean_text(text):
    """Lower-case `text` and blank out every non-word character and digit.

    Each character matched by the regex class ``[\\W\\d]`` is replaced by a
    single space (so runs of punctuation become runs of spaces), and leading
    and trailing whitespace is stripped from the result. Underscores are
    word characters and are therefore kept.
    """
    lowered = text.lower()
    blanked = re.sub(r'[\W\d]', ' ', lowered)
    return blanked.strip()

# Load Reddit data.
# Four CSVs named after the MBTI dichotomies; their order here matters because
# later cells pair them positionally with ['EI', 'SN', 'TF', 'JP'].
file_paths = [
    '/content/drive/MyDrive/THESIS/extrovert_introvert.csv',
    '/content/drive/MyDrive/THESIS/sensing_intuitive.csv',
    '/content/drive/MyDrive/THESIS/feeling_thinking.csv',
    '/content/drive/MyDrive/THESIS/judging_perceiving.csv'
]

# Gender file: the raw text is in `post`; a cleaned copy goes in `clean_text`.
reddit_gender = pd.read_csv('/content/drive/MyDrive/THESIS/gender.csv')
reddit_gender = reddit_gender.assign(clean_text=reddit_gender['post'].apply(clean_text))

# Same treatment for each MBTI file: read it, then append a `clean_text`
# column derived from `post`.
reddit_dfs = [
    frame.assign(clean_text=frame['post'].apply(clean_text))
    for frame in map(pd.read_csv, file_paths)
]



USING TWITTER AS TRAINING - REDDIT AS TEST

In [None]:
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

# Emit INFO-level log records (used for the completion message at the end).
logging.basicConfig(level=logging.INFO)

# Training set: the sampled Twitter tweets, gender encoded as M -> 0, F -> 1.
X_train, y_train = sample_df['clean_tweet'], sample_df['gender'].map({'M':0, 'F': 1})

# Test set: Reddit posts, with the `female` column as the target.
X_test, y_test = reddit_gender['clean_text'], reddit_gender['female']

# Word-level and character-level TF-IDF features, concatenated.
feature_union = FeatureUnion([
    ('word_tfidf', TfidfVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3))),
    ('char_tfidf', TfidfVectorizer(analyzer='char', max_features=1000, ngram_range=(2, 5)))
])

# One pipeline per classifier. The classifier step is named after its key so
# the parameter-grid prefixes below ('lr__', 'rf__', 'svc__') resolve.
pipelines = {
    name: Pipeline([('tfidf', feature_union), (name, estimator)])
    for name, estimator in (
        ('lr', LogisticRegression(random_state=42, class_weight='balanced')),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced')),
        ('svc', LinearSVC(random_state=42, class_weight='balanced')),
    )
}

# Every grid searches the same two n-gram ranges and adds the
# classifier-specific hyper-parameters on top.
param_grids = {
    name: {
        'tfidf__word_tfidf__ngram_range': [(1, 2), (1, 3)],
        'tfidf__char_tfidf__ngram_range': [(2, 4), (2, 5)],
        **classifier_grid,
    }
    for name, classifier_grid in (
        ('lr', {'lr__C': [0.1, 1, 10], 'lr__penalty': ['l2']}),
        ('rf', {'rf__n_estimators': [100, 200], 'rf__max_depth': [20, None]}),
        ('svc', {'svc__C': [0.1, 1, 10], 'svc__penalty': ['l2']}),
    )
}

# Grid-search each model with 3-fold CV on Twitter, then score the refit best
# estimator on the Reddit test set.
best_models = {}
for model_name in pipelines:
    print(f"Performing Grid Search for {model_name.upper()}...")
    grid_search = GridSearchCV(
        estimator=pipelines[model_name],
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=3,
        n_jobs=8,
        verbose=3,
    )
    grid_search.fit(X_train, y_train)

    # Keep the refit winner and report its cross-validated score.
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best Score for {model_name.upper()}: ", grid_search.best_score_)
    print(f"Best Parameters for {model_name.upper()}: ", grid_search.best_params_)

    # Cross-domain evaluation (GridSearchCV.predict delegates to this estimator).
    y_pred = best_models[model_name].predict(X_test)
    print(f"Classification Report for {model_name.upper()}:\n", classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {model_name.upper()}:\n", confusion_matrix(y_test, y_pred))

logging.info('GridSearchCV for all models complete.')

Performing Grid Search for LR...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Score for LR:  0.5566104577015728
Best Parameters for LR:  {'lr__C': 0.1, 'lr__penalty': 'l2', 'tfidf__char_tfidf__ngram_range': (2, 4), 'tfidf__word_tfidf__ngram_range': (1, 2)}
Classification Report for LR:
               precision    recall  f1-score   support

           0       0.80      0.45      0.58     23777
           1       0.58      0.87      0.70     20858

    accuracy                           0.65     44635
   macro avg       0.69      0.66      0.64     44635
weighted avg       0.70      0.65      0.63     44635

Confusion Matrix for LR:
 [[10780 12997]
 [ 2726 18132]]
Performing Grid Search for RF...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Score for RF:  0.6114496244863257
Best Parameters for RF:  {'rf__max_depth': None, 'rf__n_estimators': 200, 'tfidf__char_tfidf__ngram_range': (2, 5), 'tfidf__word_tfidf__ngram_range': (1, 3)}
Classification R

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, DefaultDataCollator
from datasets import Dataset

# BERT Model: fine-tune bert-base-uncased on Twitter, evaluate on Reddit.

# Training set: sampled Twitter tweets, gender encoded as M -> 0, F -> 1.
X_train = sample_df['clean_tweet']
y_train = sample_df['gender'].map({'M':0, 'F': 1})

# Test set: Reddit posts with the `female` column as the target.
X_test = reddit_gender['clean_text']
y_test = reddit_gender['female']

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize Twitter data (truncated to at most 128 tokens, padded to the longest kept sequence)
tokenized_train = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

# Tokenize Reddit data
tokenized_reddit = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

# Prepare datasets.
# Keras ignores fit(shuffle=...) when given a tf.data.Dataset, so without an
# explicit shuffle every epoch would see the rows in DataFrame order (which
# may be grouped by user, and therefore by label). The buffer spans the whole
# dataset for a full shuffle; the seed keeps runs repeatable.
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tokenized_train['input_ids'], 'attention_mask': tokenized_train['attention_mask']}, y_train
)).shuffle(buffer_size=max(len(X_train), 1), seed=42).batch(32)

# Evaluation data is deliberately left in its original order.
val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tokenized_reddit['input_ids'], 'attention_mask': tokenized_reddit['attention_mask']}, y_test
)).batch(32)

# Load and compile the BERT model (the 2-class head is newly initialised).
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the BERT model; the Reddit set is only monitored per epoch here, it
# does not influence training.
model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# Evaluate the BERT model on Reddit data -> [loss, accuracy]
bert_results = model.evaluate(val_dataset)
print(f"BERT Results: {bert_results}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3
BERT Results: [1.1429332494735718, 0.5547888278961182]


In [None]:
# TWITTER TRAIN, REDDIT TEST for MBTI
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from datasets import Dataset

# Split MBTI into dimensions and create binary labels for twitter.
# assign() returns a new frame instead of writing columns onto sample_df
# in place; in-place writes raise SettingWithCopyWarning when sample_df is a
# slice of another frame, and this form is also safe to re-run.
# Encoding: 1 = E / N / T / P, 0 = the opposite pole.
# NOTE(review): confirm the Reddit label columns use the same polarity.
sample_df = sample_df.assign(
    EI=sample_df['mbti'].apply(lambda x: 1 if x[0] == 'E' else 0),
    SN=sample_df['mbti'].apply(lambda x: 1 if x[1] == 'N' else 0),
    TF=sample_df['mbti'].apply(lambda x: 1 if x[2] == 'T' else 0),
    JP=sample_df['mbti'].apply(lambda x: 1 if x[3] == 'P' else 0),
)

# Dimensions, in the same order as the Reddit files in reddit_dfs.
dimensions = ['EI', 'SN', 'TF', 'JP']

# Word-level and character-level TF-IDF features, concatenated.
feature_union = FeatureUnion([
    ('word_tfidf', TfidfVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 2))),
    ('char_tfidf', TfidfVectorizer(analyzer='char', max_features=1000, ngram_range=(2, 4)))
])

# Define pipelines (all three share the feature extractor above; each fit()
# refits it on the current training data).
pipelines = {
    'lr': Pipeline([
        ('tfidf', feature_union),
        ('lr', LogisticRegression(random_state=42, class_weight='balanced', n_jobs=-1))
    ]),
    'rf': Pipeline([
        ('tfidf', feature_union),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
    ]),
    'svc': Pipeline([
        ('tfidf', feature_union),
        ('svc', LinearSVC(random_state=42, class_weight='balanced'))
    ])
}

# Train and evaluate traditional ML models.
# zip() pairs each dimension with the Reddit file at the same position.
for dimension, reddit_df in zip(dimensions, reddit_dfs):
    print(f"Training and evaluating for dimension: {dimension}")

    # Twitter training data
    X_train = sample_df['clean_tweet']
    y_train = sample_df[dimension]

    # Reddit testing data
    X_test = reddit_df['clean_text']
    # Label is read positionally from the third column.
    # NOTE(review): assumes every Reddit CSV keeps its label there - confirm.
    y_test = reddit_df.iloc[:, 2]

    for model_name, pipeline in pipelines.items():
        print(f"Training {model_name} on dimension {dimension}")

        # Train the model
        pipeline.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = pipeline.predict(X_test)
        print(f"Results for {model_name} on {dimension} dimension:")
        print(classification_report(y_test, y_pred))

# BERT Model: fine-tune on Twitter, evaluate on the Reddit file of the SAME
# MBTI dimension.
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The Twitter text is identical for every dimension, so tokenize it once.
X_train = sample_df['clean_tweet']
tokenized_train = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

# Pair each dimension with its own Reddit frame. Looping over every Reddit
# frame for every dimension would score e.g. the EI model against SN/TF/JP
# labels and retrain the same model four times per dimension.
for dimension, reddit_df in zip(dimensions, reddit_dfs):
    print(f"Training and evaluating BERT for dimension: {dimension}")

    # Twitter training labels for this dimension
    y_train = sample_df[dimension]

    # Prepare the training dataset. Keras ignores fit(shuffle=...) for
    # tf.data inputs, so shuffle explicitly (seeded, full-size buffer).
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': tokenized_train['input_ids'], 'attention_mask': tokenized_train['attention_mask']}, y_train
    )).shuffle(buffer_size=max(len(X_train), 1), seed=42).batch(32)

    # Reddit test data
    X_test = reddit_df['clean_text']
    # NOTE(review): assumes the label sits in the third column - confirm.
    y_test = reddit_df.iloc[:, 2]

    # Tokenize Reddit data
    tokenized_test = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

    # Prepare the test dataset (original order, no shuffle)
    test_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': tokenized_test['input_ids'], 'attention_mask': tokenized_test['attention_mask']}, y_test
    )).batch(32)

    # Load a fresh pretrained model per dimension so nothing carries over.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    # Train the BERT model
    model.fit(train_dataset, epochs=3)

    # Evaluate the BERT model on Reddit data -> [loss, accuracy]
    bert_results = model.evaluate(test_dataset)
    print(f"BERT Results for {dimension} dimension on Reddit data: {bert_results}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['EI'] = sample_df['mbti'].apply(lambda x: 1 if x[0] == 'E' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['SN'] = sample_df['mbti'].apply(lambda x: 1 if x[1] == 'N' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['TF'] = sample_df['mbti'].apply(lambda x:

Training and evaluating for dimension: EI
Training lr on dimension EI


  pid = os.fork()


USING REDDIT AS TRAINING, TWITTER AS TEST

In [None]:
###### GENDER
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

# Emit INFO-level log records (used for the completion message at the end).
logging.basicConfig(level=logging.INFO)

# Training set: Reddit posts, with the `female` column as the target.
X_train, y_train = reddit_gender['clean_text'], reddit_gender['female']

# Test set: the sampled Twitter tweets, gender encoded as M -> 0, F -> 1.
X_test, y_test = sample_df['clean_tweet'], sample_df['gender'].map({'M':0, 'F': 1})

# Word-level and character-level TF-IDF features, concatenated.
feature_union = FeatureUnion([
    ('word_tfidf', TfidfVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3))),
    ('char_tfidf', TfidfVectorizer(analyzer='char', max_features=1000, ngram_range=(2, 5)))
])

# One pipeline per classifier. The classifier step is named after its key so
# the parameter-grid prefixes below ('lr__', 'rf__', 'svc__') resolve.
pipelines = {
    name: Pipeline([('tfidf', feature_union), (name, estimator)])
    for name, estimator in (
        ('lr', LogisticRegression(random_state=42, class_weight='balanced')),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced')),
        ('svc', LinearSVC(random_state=42, class_weight='balanced')),
    )
}

# Every grid searches the same two n-gram ranges and adds the
# classifier-specific hyper-parameters on top.
param_grids = {
    name: {
        'tfidf__word_tfidf__ngram_range': [(1, 2), (1, 3)],
        'tfidf__char_tfidf__ngram_range': [(2, 4), (2, 5)],
        **classifier_grid,
    }
    for name, classifier_grid in (
        ('lr', {'lr__C': [0.1, 1, 10], 'lr__penalty': ['l2']}),
        ('rf', {'rf__n_estimators': [100, 200], 'rf__max_depth': [20, None]}),
        ('svc', {'svc__C': [0.1, 1, 10], 'svc__penalty': ['l2']}),
    )
}

# Grid-search each model with 3-fold CV on Reddit, then score the refit best
# estimator on the Twitter test set.
best_models = {}
for model_name in pipelines:
    print(f"Performing Grid Search for {model_name.upper()}...")
    grid_search = GridSearchCV(
        estimator=pipelines[model_name],
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=3,
        n_jobs=8,
        verbose=3,
    )
    grid_search.fit(X_train, y_train)

    # Keep the refit winner and report its cross-validated score.
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best Score for {model_name.upper()}: ", grid_search.best_score_)
    print(f"Best Parameters for {model_name.upper()}: ", grid_search.best_params_)

    # Cross-domain evaluation (GridSearchCV.predict delegates to this estimator).
    y_pred = best_models[model_name].predict(X_test)
    print(f"Classification Report for {model_name.upper()}:\n", classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {model_name.upper()}:\n", confusion_matrix(y_test, y_pred))

logging.info('GridSearchCV for all models complete.')

In [None]:
#### GENDER
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, DefaultDataCollator
from datasets import Dataset

# BERT Model: fine-tune bert-base-uncased on Reddit, evaluate on Twitter.

# Training set: Reddit posts with the `female` column as the target.
X_train = reddit_gender['clean_text']
y_train = reddit_gender['female']

# Test set: sampled Twitter tweets, gender encoded as M -> 0, F -> 1.
X_test = sample_df['clean_tweet']
y_test = sample_df['gender'].map({'M':0, 'F': 1})

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize Reddit (training) data
tokenized_train = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

# Tokenize Twitter (test) data
tokenized_test = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

# Prepare datasets.
# Keras ignores fit(shuffle=...) when given a tf.data.Dataset, so without an
# explicit shuffle every epoch would see the rows in file order. The buffer
# spans the whole dataset for a full shuffle; the seed keeps runs repeatable.
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tokenized_train['input_ids'], 'attention_mask': tokenized_train['attention_mask']}, y_train
)).shuffle(buffer_size=max(len(X_train), 1), seed=42).batch(32)

# Evaluation data is deliberately left in its original order.
val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tokenized_test['input_ids'], 'attention_mask': tokenized_test['attention_mask']}, y_test
)).batch(32)

# Load and compile the BERT model (the 2-class head is newly initialised).
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the BERT model; the Twitter set is only monitored per epoch here, it
# does not influence training.
model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# Evaluate the BERT model on Twitter data -> [loss, accuracy]
bert_results = model.evaluate(val_dataset)
print(f"BERT Results: {bert_results}")

In [None]:
# REDDIT TRAIN, TWITTER TEST for MBTI
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from datasets import Dataset

# Split MBTI into dimensions and create binary labels for twitter.
# assign() returns a new frame instead of writing columns onto sample_df
# in place; in-place writes raise SettingWithCopyWarning when sample_df is a
# slice of another frame, and this form is also safe to re-run.
# Encoding: 1 = E / N / T / P, 0 = the opposite pole.
# NOTE(review): confirm the Reddit label columns use the same polarity.
sample_df = sample_df.assign(
    EI=sample_df['mbti'].apply(lambda x: 1 if x[0] == 'E' else 0),
    SN=sample_df['mbti'].apply(lambda x: 1 if x[1] == 'N' else 0),
    TF=sample_df['mbti'].apply(lambda x: 1 if x[2] == 'T' else 0),
    JP=sample_df['mbti'].apply(lambda x: 1 if x[3] == 'P' else 0),
)

# Dimensions, in the same order as the Reddit files in reddit_dfs.
dimensions = ['EI', 'SN', 'TF', 'JP']

# Word-level and character-level TF-IDF features, concatenated.
feature_union = FeatureUnion([
    ('word_tfidf', TfidfVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 2))),
    ('char_tfidf', TfidfVectorizer(analyzer='char', max_features=1000, ngram_range=(2, 4)))
])

# Define pipelines (all three share the feature extractor above; each fit()
# refits it on the current training data).
pipelines = {
    'lr': Pipeline([
        ('tfidf', feature_union),
        ('lr', LogisticRegression(random_state=42, class_weight='balanced', n_jobs=-1))
    ]),
    'rf': Pipeline([
        ('tfidf', feature_union),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
    ]),
    'svc': Pipeline([
        ('tfidf', feature_union),
        ('svc', LinearSVC(random_state=42, class_weight='balanced'))
    ])
}

# Train and evaluate traditional ML models: REDDIT is the training corpus and
# TWITTER the test corpus in this section (the reverse of the earlier MBTI
# experiment). zip() pairs each dimension with the Reddit file at the same
# position.
for dimension, reddit_df in zip(dimensions, reddit_dfs):
    print(f"Training and evaluating for dimension: {dimension}")

    # Reddit training data
    X_train = reddit_df['clean_text']
    # Label is read positionally from the third column.
    # NOTE(review): assumes every Reddit CSV keeps its label there - confirm.
    y_train = reddit_df.iloc[:, 2]

    # Twitter testing data
    X_test = sample_df['clean_tweet']
    y_test = sample_df[dimension]

    for model_name, pipeline in pipelines.items():
        print(f"Training {model_name} on dimension {dimension}")

        # Train the model
        pipeline.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = pipeline.predict(X_test)
        print(f"Results for {model_name} on {dimension} dimension:")
        print(classification_report(y_test, y_pred))

# BERT Model: fine-tune on the Reddit file of one MBTI dimension, evaluate on
# the Twitter labels of the SAME dimension.
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The Twitter test text is identical for every dimension, so tokenize it once.
X_test = sample_df['clean_tweet']
tokenized_test = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

# Pair each dimension with its own Reddit frame; mixing a dimension with the
# other three Reddit files would train and test on labels that mean
# different things.
for dimension, reddit_df in zip(dimensions, reddit_dfs):
    print(f"Training and evaluating BERT for dimension: {dimension}")

    # Reddit training data
    X_train = reddit_df['clean_text']
    # NOTE(review): assumes the label sits in the third column - confirm.
    y_train = reddit_df.iloc[:, 2]

    # Tokenize Reddit data
    tokenized_train = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")

    # Prepare the training dataset. Keras ignores fit(shuffle=...) for
    # tf.data inputs, so shuffle explicitly (seeded, full-size buffer).
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': tokenized_train['input_ids'], 'attention_mask': tokenized_train['attention_mask']}, y_train
    )).shuffle(buffer_size=max(len(X_train), 1), seed=42).batch(32)

    # Twitter test labels for this dimension
    y_test = sample_df[dimension]

    # Prepare the test dataset (original order, no shuffle)
    test_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': tokenized_test['input_ids'], 'attention_mask': tokenized_test['attention_mask']}, y_test
    )).batch(32)

    # Load a fresh pretrained model per dimension so nothing carries over.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    # Train the BERT model
    model.fit(train_dataset, epochs=3)

    # Evaluate the BERT model on Twitter data -> [loss, accuracy]
    bert_results = model.evaluate(test_dataset)
    print(f"BERT Results for {dimension} dimension on Twitter data: {bert_results}")