## Environment Configuration

In [1]:
!pip install -U emoji
!pip install -U tqdm
!pip install -U transformers
!pip install -U kaleido
!pip install -U dash
!pip install -U optuna
!pip install -U scikit-learn

Collecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.1
Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall

In [2]:
# General Dataloaders
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# NLP
import re
import string
import emoji
from collections import Counter
from transformers import pipeline
# Data Visualization
import plotly.express as px
# Machine Learning
import torch
# Data Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Models
from sklearn.linear_model import LogisticRegression
import joblib
# Hyperparameter Tuning
import optuna
from optuna.storages import JournalStorage, JournalFileStorage
import matplotlib.pyplot as plt
# Logistics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from timeit import default_timer as timer

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  dv = torch.device('cuda')
else:
  dv = torch.device('cpu')
print(dv)

cpu


## Augmentation Functions

In [None]:
def clean_text(x: str) -> str:
  """
  Prepare a raw message for TF-IDF vectorization.

  Two things are stripped from `x`:
    1) Every character listed in `string.punctuation`, so the TF-IDF dictionary is more accurate.
    2) Every emoji, since they are unnecessary for training on the text itself
       (emoji data is used in another column instead).

  Returns the cleaned string; `x` itself is not modified.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)
  without_punctuation = x.translate(punctuation_table)
  return emoji.replace_emoji(without_punctuation, replace='')

In [None]:
def num_emojis(x: str) -> int:
  """Return the number of emojis that `emoji.emoji_count` finds in `x`."""
  emoji_total = emoji.emoji_count(x)
  return emoji_total

In [None]:
def average_words_per_sentence(message: str) -> float:
  """
  Compute the average number of words per sentence in `message`, rounded to 2 decimals.

  Sentences are delimited by '.', '!', '?', a newline, or any character in the code point
  range U+263A..U+1F645 (a rough stand-in for "an emoji"). Within a sentence, words are
  delimited by ',', ';', ':' or a space.

  Unfortunately, emojis do not occupy one contiguous, emoji-only code point range, so the
  sentence split is not 100% accurate in certain cases: the range above also contains
  non-emoji characters and does not contain every emoji.

  Please modify the regular expression for 'sentences' to match your need if you re-use this code. Thanks :)

  Empty fragments (e.g. the piece after a trailing '.', or the gap between two consecutive
  separators) are ignored, so "Hello world." counts as one sentence of two words.

  Returns:
    The mean word count over all non-empty sentences, or 0.0 when the message contains no words.
  """
  # Split on sentence delimiters. Newlines are left in the message on purpose so that the
  # '\n' in the character class acts as a sentence break; stripping them beforehand would
  # glue the last word of one line to the first word of the next.
  sentences = re.split('[.!?\u263a-\U0001f645\n]+', message.strip())
  word_counts = []
  for entry in sentences:
    # re.split yields '' around leading/trailing/repeated separators; those are not words.
    words = [word for word in re.split('[,;: ]+', entry.strip()) if word]
    if words:
      word_counts.append(len(words))
  if not word_counts:
    return 0.0
  return round(sum(word_counts) / len(word_counts), 2)

In [None]:
# Hugging Face sentiment-analysis pipeline; model and tokenizer are loaded from the same
# checkpoint and placed on the device selected in `dv`.
sentiment = pipeline(
    task = 'sentiment-analysis',
    model = "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"},
    tokenizer = "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"}
    device = dv,
    batch_size = 25000,
    # Tokenizer settings: truncate inputs to at most 512 tokens and pad within a batch.
    max_length = 512,
    truncation = True,
    padding = True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## Model Training

In [12]:
# Load the pre-augmented dataset (cleaned text plus the engineered feature columns).
messages = pd.read_parquet('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Augments/UPDATED_Suicide_Detection.csv.parquet.gzip')
# The raw text is not used for training; only the cleaned version is kept.
messages = messages.drop(['text'], axis=1)
# dropna returns a new frame, so the result must be assigned back; calling it without
# assignment leaves `messages` unchanged.
messages = messages.dropna(axis = 0)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 6 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  232074 non-null  int64  
 1   clean_text                  232074 non-null  object 
 2   average_words_per_sentence  232074 non-null  float64
 3   sentiment                   232074 non-null  object 
 4   num_emojis                  232074 non-null  int64  
 5   class                       232074 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 10.6+ MB


In [13]:
# Features and target.
X = messages[['clean_text', 'average_words_per_sentence', 'sentiment', 'num_emojis']]
y = messages['class']

# First split: 80% train+validation, 20% held-out test.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 5)

# Second split: carve a validation set (20% of the remaining rows) out of the training portion.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size = 0.8, random_state = 21)

In [14]:
# Column groups, one per kind of preprocessing.
categorical = [
    'sentiment',
]
numerical = [
    'num_emojis',
    'average_words_per_sentence',
]
tokenize = [
    'clean_text',
]

In [15]:
# Text branch: TF-IDF features from the cleaned message, with the vectorizer's default settings.
tokenize_pipeline = Pipeline(steps = [('text', TfidfVectorizer())])

In [16]:
# Numeric branch: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

In [17]:
# Categorical branch: fill missing values with the most frequent category, then one-hot
# encode into a dense array; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

In [18]:
# Send each column group through its own preprocessing pipeline and concatenate the results.
column_transformer = ColumnTransformer(
    transformers = [
        ('num_pipeline', numerical_pipeline, numerical),
        ('cat_pipeline', categorical_pipeline, categorical),
        # A bare column name (not a list) is used here so the vectorizer receives a
        # 1-D column of strings rather than a 2-D frame.
        ('tfidf', tokenize_pipeline, 'clean_text'),
    ],
    remainder = 'passthrough',
    verbose = 1,
    n_jobs = -1,
)

In [19]:
# Baseline classifier: logistic regression with library defaults, fitted on all cores.
lr = LogisticRegression(verbose = 1, n_jobs = -1)

# Full model: preprocessing followed by the classifier.
model = make_pipeline(column_transformer, lr)

In [None]:
# Fit the preprocessing + logistic-regression pipeline on the training split only.
model.fit(X_train, y_train)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
# Save the fitted base pipeline to Drive so it can be reloaded without refitting.
joblib.dump(model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/lr_base_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/lr_base_model.pkl']

## Model Evaluation

In [None]:
# Reload the base pipeline from the file written by the joblib.dump cell.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files you created or trust.
model = joblib.load('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/lr_base_model.pkl')

In [20]:
def evaluate_model(model):
  """
  Print a classification report for `model` on the training, validation and test splits.

  The splits are read from the notebook-level variables X_train/y_train, X_val/y_val and
  X_test/y_test. `model` must expose `predict` and `classes_`; the class labels are used
  as the row names of each report. Nothing is returned.
  """
  splits = [
      ('Training', X_train, y_train),
      ('Validation', X_val, y_val),
      ('Evaluation', X_test, y_test),
  ]
  # Run all predictions first so nothing is printed unless every split can be scored.
  predictions = [model.predict(features) for _, features, _ in splits]
  target_names = model.classes_
  for (title, _, truth), predicted in zip(splits, predictions):
    report = classification_report(truth, predicted, target_names = target_names, digits=4)
    print(f'===== ({title}) Classification Report =====\n{report}')


In [None]:
# Report precision/recall/F1 of the base pipeline on the train, validation and test splits.
evaluate_model(model)

  pid = os.fork()


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9368    0.9533    0.9449     74160
     suicide     0.9526    0.9358    0.9441     74367

    accuracy                         0.9445    148527
   macro avg     0.9447    0.9445    0.9445    148527
weighted avg     0.9447    0.9445    0.9445    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9317    0.9446    0.9381     18619
     suicide     0.9435    0.9304    0.9369     18513

    accuracy                         0.9375     37132
   macro avg     0.9376    0.9375    0.9375     37132
weighted avg     0.9376    0.9375    0.9375     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9314    0.9437    0.9375     23258
     suicide     0.9427    0.9302    0.9364     23157

    accuracy                         0.9370

## Model Finetuning

In [21]:
# Step 1: Get the Column Transformer
# `model` was built with make_pipeline(column_transformer, lr), so step 0 is the
# ColumnTransformer. This is the same object held by the pipeline, not a copy.
column_transformer = model.steps[0][1]
print(column_transformer)

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('num_pipeline',
                                 Pipeline(steps=[('impute', SimpleImputer()),
                                                 ('scale', StandardScaler())]),
                                 ['num_emojis', 'average_words_per_sentence']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['sentiment']),
                                ('tfidf',
                                 Pipeline(steps=[('text', TfidfVectorizer())]),
                                 'clean_

In [22]:
# Step 2: Get the Logistic Regression classifier (step 1, the final estimator of the pipeline)
# Despite its name, `lr_base_params` holds the estimator itself; get_params() returns its
# hyperparameters as a dict.
lr_base_params = model.steps[1][1]
print(lr_base_params.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 1, 'warm_start': False}


In [23]:
# Encode the string class labels of the training split as integers; the encoded target is
# what the tuning objective passes to cross_val_score with scoring='f1'.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
# Position in classes_ is the integer code assigned to that label.
print(label_encoder.classes_)

['non-suicide' 'suicide']


In [24]:
# TODO
def objective(trial):
  """
  Optuna objective: mean 5-fold cross-validated F1 of a logistic-regression pipeline.

  Hyperparameters sampled from `trial`:
    max_iter: int in [100, 500]
    C:        float in [1.0, 100]
    tol:      float in [1e-4, 1e-2]

  The candidate pipeline reuses the notebook-level `column_transformer` and is scored on
  `X_train` against `y_train_encoded`.

  Returns:
    The mean of the five per-fold F1 scores.
  """
  sampled = {
      'max_iter': trial.suggest_int('max_iter', 100, 500),
      'C': trial.suggest_float('C', 1.0, 100),
      'tol': trial.suggest_float('tol', 1e-4, 1e-2),
  }

  candidate = make_pipeline(
      column_transformer,
      LogisticRegression(verbose = 1, **sampled)
  )

  fold_scores = cross_val_score(candidate, X_train, y_train_encoded, cv=5, scoring = 'f1')
  return fold_scores.mean()

In [25]:
# Path of the SQLite file used as the Optuna study storage (interpolated into the
# sqlite:/// URL when the study is created).
db = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Checkpoints/lr_tuning.db'

In [26]:
# Create the tuning study, or reconnect to the stored study of the same name if the
# SQLite file already contains one. Scores are maximized; parameters are drawn by a
# RandomSampler created without a seed.
study = optuna.create_study(
    storage = f'sqlite:///{db}',
    study_name = 'lr_tuning',
    load_if_exists = True,
    direction = 'maximize',
    sampler = optuna.samplers.RandomSampler(),
)

[I 2024-04-21 17:37:36,197] Using an existing study with name 'lr_tuning' instead of creating a new one.


In [None]:
# Collect every trial the study has recorded with state FAIL.
failed_trials = list(
    filter(
        lambda candidate: candidate.state == optuna.trial.TrialState.FAIL,
        study.trials,
    )
)

# Queue the parameters of each failed trial so a later study.optimize call evaluates them again.
for trial in failed_trials:
  print(f'Re-running failed trial with information: {trial}')
  study.enqueue_trial(trial.params)

Re-running failed trial with information: FrozenTrial(number=53, state=TrialState.FAIL, values=None, datetime_start=datetime.datetime(2024, 4, 19, 18, 38, 30, 9053), datetime_complete=datetime.datetime(2024, 4, 19, 18, 38, 33, 284353), params={'max_iter': 217, 'C': 91.44912386094573, 'tol': 0.009030437975422011}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_iter': IntDistribution(high=500, log=False, low=100, step=1), 'C': FloatDistribution(high=100.0, log=False, low=1.0, step=None), 'tol': FloatDistribution(high=0.01, log=False, low=0.0001, step=None)}, trial_id=54, value=None)


In [None]:
# Show the trials recorded so far as a table, one row per trial.
print(study.trials_dataframe())

    number     value             datetime_start          datetime_complete  \
0        0  0.863868 2024-04-19 16:43:59.497525 2024-04-19 16:45:57.947042   
1        1  0.863860 2024-04-19 16:45:58.022364 2024-04-19 16:47:47.489360   
2        2  0.863868 2024-04-19 16:47:47.552946 2024-04-19 16:49:37.222288   
3        3  0.863868 2024-04-19 16:49:37.285294 2024-04-19 16:51:32.159577   
4        4  0.905255 2024-04-19 16:51:32.234402 2024-04-19 16:53:28.856995   
5        5  0.863868 2024-04-19 16:53:28.921387 2024-04-19 16:55:21.019827   
6        6  0.922884 2024-04-19 16:55:21.093559 2024-04-19 16:57:26.469535   
7        7  0.863868 2024-04-19 16:57:26.531827 2024-04-19 16:59:18.894649   
8        8  0.927968 2024-04-19 16:59:18.952553 2024-04-19 17:01:36.794660   
9        9  0.911608 2024-04-19 17:01:36.850701 2024-04-19 17:03:36.437385   
10      10  0.863868 2024-04-19 17:03:36.497931 2024-04-19 17:05:28.981173   
11      11  0.863868 2024-04-19 17:05:29.040747 2024-04-19 17:07

In [None]:
# Run 100 trials of `objective`; results are recorded in the study's storage.
study.optimize(objective, n_trials=100)

  pid = os.fork()
[I 2024-04-21 12:07:05,174] Trial 54 finished with value: 0.8638680174570709 and parameters: {'max_iter': 217, 'C': 91.44912386094573, 'tol': 0.009030437975422011}. Best is trial 21 with value: 0.9391429126843093.
[I 2024-04-21 12:08:59,337] Trial 55 finished with value: 0.9052339098515765 and parameters: {'max_iter': 374, 'C': 82.10932506321048, 'tol': 0.003806134907760717}. Best is trial 21 with value: 0.9391429126843093.
[I 2024-04-21 12:10:55,084] Trial 56 finished with value: 0.8638680174570709 and parameters: {'max_iter': 451, 'C': 40.15047520989145, 'tol': 0.007496031645170739}. Best is trial 21 with value: 0.9391429126843093.
[I 2024-04-21 12:12:45,426] Trial 57 finished with value: 0.8638680174570709 and parameters: {'max_iter': 329, 'C': 84.41379009066341, 'tol': 0.009835184811287565}. Best is trial 21 with value: 0.9391429126843093.
[I 2024-04-21 12:14:38,152] Trial 58 finished with value: 0.8638680174570709 and parameters: {'max_iter': 442, 'C': 23.9328294

In [30]:
# Hyperparameters for the tuned model, hard-coded so this cell does not need the study
# storage (presumably copied from study.best_params; TODO confirm).
# To read them from the study instead: params = study.best_params
params = dict(
    max_iter = 201,
    C = 2.0465939108330833,
    tol = 0.00016632255879392817,
)

In [32]:
# Tuned model: the same preprocessing as the base pipeline, followed by a logistic
# regression configured with the hyperparameters in `params`.
best_lr_model = make_pipeline(
    column_transformer,
    LogisticRegression(**params),
)

In [33]:
# Fit the tuned pipeline on the same training split used for the base model.
best_lr_model.fit(X_train, y_train)

  pid = os.fork()


In [34]:
# Same three-split classification report as for the base model, now for the tuned pipeline.
evaluate_model(best_lr_model)

===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9479    0.9586    0.9532     74160
     suicide     0.9583    0.9474    0.9528     74367

    accuracy                         0.9530    148527
   macro avg     0.9531    0.9530    0.9530    148527
weighted avg     0.9531    0.9530    0.9530    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9367    0.9462    0.9414     18619
     suicide     0.9454    0.9357    0.9405     18513

    accuracy                         0.9410     37132
   macro avg     0.9410    0.9410    0.9410     37132
weighted avg     0.9410    0.9410    0.9410     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9341    0.9459    0.9400     23258
     suicide     0.9450    0.9329    0.9389     23157

    accuracy                         0.9394

In [35]:
# Save the tuned pipeline. The object written must be `best_lr_model`; `model` is the
# untuned base pipeline and would be stored under the wrong file name.
joblib.dump(best_lr_model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_lr_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_lr_model.pkl']

In [36]:
# Objective value of each trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [37]:
# Parallel-coordinate view relating the sampled hyperparameters to the objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [40]:
# Slice plots: objective value against each of the three tuned hyperparameters.
optuna.visualization.plot_slice(study, params=['max_iter', 'C', 'tol'])

In [39]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)