## Environment Configuration

In [1]:
!pip install -U emoji
!pip install -U tqdm
!pip install -U transformers
!pip install -U kaleido
!pip install -U dash
!pip install -U optuna
!pip install -U scikit-learn

Collecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m276.5/433.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.1
Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m36.3 MB/s[0m eta [36m0:00:0

In [2]:
# General Dataloaders
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# NLP
import re
import string
import emoji
from collections import Counter
from transformers import pipeline
# Data Visualization
import plotly.express as px
# Machine Learning
import torch
# Data Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
import joblib
# Hyperparameter Tuning
import optuna
from optuna.storages import JournalStorage, JournalFileStorage
import matplotlib.pyplot as plt
# Logistics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from timeit import default_timer as timer
import numpy as np

In [3]:
dv = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(dv)

cpu


## Augmentation Functions

In [4]:
def clean_text(x: str) -> str:
  """
  Goals:
    1) Remove punctuation to make the TF-IDF dictionary more accurate
    2) Remove emojis, they are unnecessary here for training purposes. (We will use emoji data in another column :3)
  """
  clean_text = x.translate(str.maketrans('', '', string.punctuation))
  clean_text = emoji.replace_emoji(clean_text, replace='')
  return clean_text

In [5]:
def num_emojis(x: str) -> int:
  return emoji.emoji_count(x)

In [6]:
def average_words_per_sentence(message: str):
  """
  Unfortunately, due to the limitation of a non-standard ASCII table for emojis, getting the
  average words per second is not 100% accurate in certain cases.

  Please modify the regular expression for, 'sentences' to match your need if you re-use this code. Thanks :)
  """
  # Split the string based on {., !, ?} characters
  # print(f'Base Message: {message}')
  sentences = re.split('[.!?\u263a-\U0001f645\n]+', message.strip().replace('\n', ''))
  avg_words = 0
  for entry in sentences:
    # print(f'Entry: {entry}')
    words = re.split('[,;: ]', entry.strip())
    # print(f'Words: {words}')
    avg_words += len(words)
  return round(avg_words / len(sentences), 2)

In [None]:
sentiment = pipeline(
    task='sentiment-analysis',
    model= "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"},
    tokenizer = "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"}
    max_length=512,
    batch_size = 25000,
    truncation=True,
    padding=True,
    device=dv
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## Model Training

In [7]:
messages = pd.read_parquet('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Augments/UPDATED_Suicide_Detection.csv.parquet.gzip')
messages = messages.drop(['text'], axis=1)
messages.dropna(axis = 0)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 6 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  232074 non-null  int64  
 1   clean_text                  232074 non-null  object 
 2   average_words_per_sentence  232074 non-null  float64
 3   sentiment                   232074 non-null  object 
 4   num_emojis                  232074 non-null  int64  
 5   class                       232074 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 10.6+ MB


In [8]:
X = messages[['clean_text', 'average_words_per_sentence', 'sentiment', 'num_emojis']]
y = messages['class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.8,
    random_state = 5
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size = 0.8,
    random_state = 21
)

In [9]:
tokenize = ['clean_text']
numerical = ['num_emojis', 'average_words_per_sentence']
categorical = ['sentiment']

In [10]:
tokenize_pipeline = Pipeline(
    steps = [
        ('text', TfidfVectorizer())
    ]
)

In [11]:
numerical_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

In [12]:
categorical_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy = 'most_frequent')),
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

In [13]:
column_transformer = ColumnTransformer(
    transformers = [
        ('num_pipeline', numerical_pipeline, numerical),
        ('cat_pipeline', categorical_pipeline, categorical),
        ('tfidf', tokenize_pipeline, 'clean_text')
    ],
    remainder = 'passthrough',
    n_jobs = -1,
    verbose=1
)

In [14]:
rf = RandomForestClassifier(
    verbose = 1,
    n_jobs = -1
)

lr = LogisticRegression(
    verbose = 1,
    n_jobs = -1
)

gbc = GradientBoostingClassifier(
    verbose = 1
)

In [15]:
model_1 = make_pipeline(
    column_transformer,
    rf
)

model_2 = make_pipeline(
    column_transformer,
    lr
)

model_3 = make_pipeline(
    column_transformer,
    gbc
)

In [16]:
model = VotingClassifier(
    estimators = [
        ('rf', model_1),
        ('lr', model_2),
        ('gbc', model_3)
        ],
    voting = 'hard',
    weights = [1, 2, 1]
)

In [17]:
model.fit(X_train, y_train)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 18.2min finished
  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


      Iter       Train Loss   Remaining Time 
         1           1.3053           11.63m
         2           1.2381           11.31m
         3           1.1807           11.03m
         4           1.1296           11.00m
         5           1.0870           10.81m
         6           1.0505           10.75m
         7           1.0168           10.63m
         8           0.9887           10.53m
         9           0.9623           10.46m
        10           0.9398           10.32m
        20           0.7910            9.22m
        30           0.7157            8.07m
        40           0.6695            6.92m
        50           0.6383            5.78m
        60           0.6155            4.63m
        70           0.5972            3.46m
        80           0.5825            2.31m
        90           0.5699            1.16m
       100           0.5587            0.00s


In [18]:
joblib.dump(model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/vc_base_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/vc_base_model.pkl']

## Model Evaluation

In [19]:
model = joblib.load('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/vc_base_model.pkl')

In [20]:
def evaluate_model(model):
  y_predict_train = model.predict(X_train)
  y_predict_val = model.predict(X_val)
  y_predict_test = model.predict(X_test)
  target_names = model.classes_
  print(f'===== (Training) Classification Report =====\n{classification_report(y_train, y_predict_train, target_names = target_names, digits=4)}')
  print(f'===== (Validation) Classification Report =====\n{classification_report(y_val, y_predict_val, target_names = target_names, digits=4)}')
  print(f'===== (Evaluation) Classification Report =====\n{classification_report(y_test, y_predict_test, target_names = target_names, digits=4)}')


In [21]:
evaluate_model(model)

  pid = os.fork()
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.7s finished


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.9376    0.9678    0.9525     74160
     suicide     0.9668    0.9358    0.9511     74367

    accuracy                         0.9518    148527
   macro avg     0.9522    0.9518    0.9518    148527
weighted avg     0.9523    0.9518    0.9518    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8980    0.9572    0.9266     18619
     suicide     0.9539    0.8906    0.9212     18513

    accuracy                         0.9240     37132
   macro avg     0.9259    0.9239    0.9239     37132
weighted avg     0.9259    0.9240    0.9239     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8977    0.9569    0.9264     23258
     suicide     0.9537    0.8905    0.9210     23157

    accuracy                         0.9238

## Model Finetuning

In [None]:
rf = RandomForestClassifier(
    verbose = 1,
    n_jobs = -1
)

lr = LogisticRegression(
    verbose = 1,
    n_jobs = -1
)

gbc = GradientBoostingClassifier(
    verbose = 1
)

model_1 = make_pipeline(
    column_transformer,
    rf
)

model_2 = make_pipeline(
    column_transformer,
    lr
)

model_3 = make_pipeline(
    column_transformer,
    gbc
)

In [12]:
def objective(trial):
  estimators = [
      ('rf', model_1),
      ('lr', model_2),
      ('gbc', model_3)
  ]
  voting = trial.suggest_categorical('voting', ['hard', 'soft'])
  weights = [trial.suggest_float(f'weight_{i}', 1, 2) for i in range(3)]


  vc = VotingClassifier(
      estimators = estimators,
      voting = voting,
      weights = weights,
      voting = 'soft'
  )

  score = cross_val_score(vc, X_train, y_train, cv=5, scoring = 'f1')
  return score.mean()

In [13]:
db = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Checkpoints/vc_tuning.db'

In [14]:
study = optuna.create_study(
    study_name = 'vc_tuning',
    direction = 'maximize',
    sampler = optuna.samplers.RandomSampler(),
    storage = f'sqlite:///{db}',
    load_if_exists = True
    )

[I 2024-04-21 20:19:41,011] A new study created in RDB with name: vc_tuning


In [15]:
# Identify failed trials
failed_trials = [trial for trial in study.trials if trial.state == optuna.trial.TrialState.FAIL]

# Reset failed trials
for trial in failed_trials:
  print(f'Re-running failed trial with information: {trial}')
  study.enqueue_trial(trial.params)

In [16]:
print(study.trials_dataframe())

Empty DataFrame
Columns: []
Index: []


In [17]:
study.optimize(objective, n_trials=10)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  8.8min
[W 2024-04-21 20:29:29,531] Trial 0 failed with parameters: {'voting': 'hard', 'weight_0': 1.3896151098675555, 'weight_1': 1.4192694086707358, 'weight_2': 1.334711247879382} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-12-8fde5da47eb3>", line 17, in objective
    score = cross_val_score(vc, X_train, y_train, cv=5, scoring = 'f1')
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 719, in cross_val_score
    cv_results = cross_validate(
  File "/usr/loca

KeyboardInterrupt: 

In [None]:
params = study.best_params

In [None]:
best_vc_model = RandomForestClassifier(
    **params,
    n_jobs = -1,
    verbose = 1,
    voting = 'soft'
    )


best_vc_model = make_pipeline(
    column_transformer,
    best_vc_model
)

In [None]:
best_vc_model.fit(X_train, y_train)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:   40.1s finished


In [None]:
evaluate_model(best_vc_model)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.8s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:   12.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:    2.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:    2.8s finished


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8225    0.8731    0.8470     74160
     suicide     0.8652    0.8121    0.8378     74367

    accuracy                         0.8426    148527
   macro avg     0.8438    0.8426    0.8424    148527
weighted avg     0.8439    0.8426    0.8424    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8199    0.8651    0.8419     18619
     suicide     0.8563    0.8088    0.8319     18513

    accuracy                         0.8370     37132
   macro avg     0.8381    0.8370    0.8369     37132
weighted avg     0.8380    0.8370    0.8369     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8212    0.8698    0.8448     23258
     suicide     0.8610    0.8098    0.8346     23157

    accuracy                         0.8399

In [None]:
joblib.dump(model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_vc_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_rf_model.pkl']

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
optuna.visualization.plot_param_importances(study)