## Environment Configuration

In [1]:
!pip install -U emoji
!pip install -U tqdm
!pip install -U transformers
!pip install -U kaleido
!pip install -U dash
!pip install -U optuna
!pip install -U scikit-learn

Collecting emoji
  Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.0
Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall

In [2]:
# General Dataloaders
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# NLP
import re
import string
import emoji
from collections import Counter
from transformers import pipeline
# Data Visualization
import plotly.express as px
# Machine Learning
import torch
# Data Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Models
from sklearn.ensemble import RandomForestClassifier
import joblib
# Hyperparameter Tuning
import optuna
from optuna.storages import JournalStorage, JournalFileStorage
import matplotlib.pyplot as plt
# Logistics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from timeit import default_timer as timer

In [3]:
dv = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(dv)

cpu


## Augmentation Functions

In [None]:
def clean_text(x: str) -> str:
  """
  Goals:
    1) Remove punctuation to make the TF-IDF dictionary more accurate
    2) Remove emojis, they are unnecessary here for training purposes. (We will use emoji data in another column :3)
  """
  clean_text = x.translate(str.maketrans('', '', string.punctuation))
  clean_text = emoji.replace_emoji(clean_text, replace='')
  return clean_text

In [None]:
def num_emojis(x: str) -> int:
  return emoji.emoji_count(x)

In [None]:
def average_words_per_sentence(message: str):
  """
  Unfortunately, due to the limitation of a non-standard ASCII table for emojis, getting the
  average words per second is not 100% accurate in certain cases.

  Please modify the regular expression for, 'sentences' to match your need if you re-use this code. Thanks :)
  """
  # Split the string based on {., !, ?} characters
  # print(f'Base Message: {message}')
  sentences = re.split('[.!?\u263a-\U0001f645\n]+', message.strip().replace('\n', ''))
  avg_words = 0
  for entry in sentences:
    # print(f'Entry: {entry}')
    words = re.split('[,;: ]', entry.strip())
    # print(f'Words: {words}')
    avg_words += len(words)
  return round(avg_words / len(sentences), 2)

In [None]:
sentiment = pipeline(
    task='sentiment-analysis',
    model= "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"},
    tokenizer = "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"}
    max_length=512,
    batch_size = 25000,
    truncation=True,
    padding=True,
    device=dv
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## Model Training

In [4]:
messages = pd.read_parquet('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Augments/UPDATED_Suicide_Detection.csv.parquet.gzip')
messages = messages.drop(['text'], axis=1)
messages.dropna(axis = 0)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 6 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  232074 non-null  int64  
 1   clean_text                  232074 non-null  object 
 2   average_words_per_sentence  232074 non-null  float64
 3   sentiment                   232074 non-null  object 
 4   num_emojis                  232074 non-null  int64  
 5   class                       232074 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 10.6+ MB


In [5]:
X = messages[['clean_text', 'average_words_per_sentence', 'sentiment', 'num_emojis']]
y = messages['class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = 0.8,
    random_state = 5
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size = 0.8,
    random_state = 21
)

In [None]:
tokenize = ['clean_text']
numerical = ['num_emojis', 'average_words_per_sentence']
categorical = ['sentiment']

In [None]:
tokenize_pipeline = Pipeline(
    steps = [
        ('text', TfidfVectorizer())
    ]
)

In [None]:
numerical_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

In [None]:
categorical_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy = 'most_frequent')),
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]
)

In [None]:
column_transformer = ColumnTransformer(
    transformers = [
        ('num_pipeline', numerical_pipeline, numerical),
        ('cat_pipeline', categorical_pipeline, categorical),
        ('tfidf', tokenize_pipeline, 'clean_text')
    ],
    remainder = 'passthrough',
    n_jobs = -1,
    verbose=1
)

In [None]:
rf = RandomForestClassifier(
    verbose = 1,
    n_jobs = -1
)
model = make_pipeline(
    column_transformer,
    rf
)

In [None]:
model.fit(X_train, y_train)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 27.7min finished


In [None]:
joblib.dump(model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/rf_base_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/rf_base_model.pkl']

## Model Evaluation

In [6]:
model = joblib.load('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/rf_base_model.pkl')

In [None]:
def evaluate_model(model):
  y_predict_train = model.predict(X_train)
  y_predict_val = model.predict(X_val)
  y_predict_test = model.predict(X_test)
  target_names = model.classes_
  print(f'===== (Training) Classification Report =====\n{classification_report(y_train, y_predict_train, target_names = target_names, digits=4)}')
  print(f'===== (Validation) Classification Report =====\n{classification_report(y_val, y_predict_val, target_names = target_names, digits=4)}')
  print(f'===== (Evaluation) Classification Report =====\n{classification_report(y_test, y_predict_test, target_names = target_names, digits=4)}')


In [None]:
evaluate_model(model)

  pid = os.fork()
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    8.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   15.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    5.3s finished


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     1.0000    1.0000    1.0000     74160
     suicide     1.0000    1.0000    1.0000     74367

    accuracy                         1.0000    148527
   macro avg     1.0000    1.0000    1.0000    148527
weighted avg     1.0000    1.0000    1.0000    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8890    0.8919    0.8904     18619
     suicide     0.8909    0.8880    0.8894     18513

    accuracy                         0.8899     37132
   macro avg     0.8899    0.8899    0.8899     37132
weighted avg     0.8899    0.8899    0.8899     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8916    0.8912    0.8914     23258
     suicide     0.8908    0.8911    0.8910     23157

    accuracy                         0.8912

## Model Finetuning

In [7]:
# Step 1: Get the Column Transformer
column_transformer = model.steps[0][1]
print(column_transformer)

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('num_pipeline',
                                 Pipeline(steps=[('impute', SimpleImputer()),
                                                 ('scale', StandardScaler())]),
                                 ['num_emojis', 'average_words_per_sentence']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['sentiment']),
                                ('tfidf',
                                 Pipeline(steps=[('text', TfidfVectorizer())]),
                                 'clean_

In [8]:
# Step 2: Get the Random Forest Classifier
rf_base_params = model.steps[1][1]
print(rf_base_params.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 1, 'warm_start': False}


In [9]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
print(label_encoder.classes_)

['non-suicide' 'suicide']


In [10]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 100, 600)
  max_depth = trial.suggest_int('max_depth', 10, 50)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 50)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 50)

  rf = RandomForestClassifier(
      n_estimators = n_estimators,
      max_depth = max_depth,
      min_samples_split = min_samples_split,
      min_samples_leaf = min_samples_leaf,
      n_jobs = -1
  )

  model = make_pipeline(
      column_transformer,
      rf
  )

  score = cross_val_score(model, X_train, y_train_encoded, cv=5, scoring = 'f1')
  return score.mean()

In [11]:
db = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Checkpoints/rf_tuning.db'

In [12]:
study = optuna.create_study(
    study_name = 'rf_tuning',
    direction = 'maximize',
    sampler = optuna.samplers.RandomSampler(),
    storage = f'sqlite:///{db}',
    load_if_exists = True
    )

[I 2024-04-19 15:28:19,672] Using an existing study with name 'rf_tuning' instead of creating a new one.


In [13]:
# Identify failed trials
failed_trials = [trial for trial in study.trials if trial.state == optuna.trial.TrialState.FAIL]

# Reset failed trials
for trial in failed_trials:
  print(f'Re-running failed trial with information: {trial}')
  study.enqueue_trial(trial.params)

In [14]:
print(study.trials_dataframe())

    number     value             datetime_start          datetime_complete  \
0        0  0.825312 2024-04-19 13:52:40.169129 2024-04-19 13:56:26.924971   
1        1  0.834542 2024-04-19 13:56:26.999858 2024-04-19 14:04:45.196781   
2        2  0.810970 2024-04-19 14:04:45.265386 2024-04-19 14:11:48.554184   
3        3  0.834499 2024-04-19 14:11:48.644293 2024-04-19 14:17:35.774610   
4        4  0.822725 2024-04-19 14:17:35.833677 2024-04-19 14:25:05.907430   
5        5  0.825208 2024-04-19 14:25:05.966398 2024-04-19 14:28:23.251419   
6        6  0.812845 2024-04-19 14:28:23.313014 2024-04-19 14:33:10.411635   
7        7  0.805330 2024-04-19 14:33:10.468418 2024-04-19 14:39:01.674776   
8        8  0.793130 2024-04-19 14:39:01.730552 2024-04-19 14:44:27.143704   
9        9  0.818818 2024-04-19 14:44:27.206438 2024-04-19 14:51:29.707202   
10      10  0.790750 2024-04-19 14:51:29.768578 2024-04-19 14:54:55.465463   
11      11  0.827097 2024-04-19 14:54:55.531647 2024-04-19 15:02

In [None]:
study.optimize(objective, n_trials=100)

  pid = os.fork()
[I 2024-04-19 15:35:39,436] Trial 16 finished with value: 0.7876780465673711 and parameters: {'n_estimators': 553, 'max_depth': 10, 'min_samples_split': 25, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.834542043099507.
[I 2024-04-19 15:42:51,280] Trial 17 finished with value: 0.821441114625248 and parameters: {'n_estimators': 320, 'max_depth': 27, 'min_samples_split': 50, 'min_samples_leaf': 26}. Best is trial 1 with value: 0.834542043099507.
[I 2024-04-19 15:47:26,523] Trial 18 finished with value: 0.7947873037350924 and parameters: {'n_estimators': 266, 'max_depth': 13, 'min_samples_split': 8, 'min_samples_leaf': 37}. Best is trial 1 with value: 0.834542043099507.
[I 2024-04-19 15:56:34,326] Trial 19 finished with value: 0.8200913538352074 and parameters: {'n_estimators': 485, 'max_depth': 27, 'min_samples_split': 40, 'min_samples_leaf': 38}. Best is trial 1 with value: 0.834542043099507.
[I 2024-04-19 16:00:24,169] Trial 20 finished with value: 0.815630241

In [None]:
params = study.best_params

In [None]:
best_rf_model = RandomForestClassifier(
    **params,
    n_jobs = -1,
    verbose = 1
    )


best_rf_model = make_pipeline(
    column_transformer,
    best_rf_model
)

In [None]:
best_rf_model.fit(X_train, y_train)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:   40.1s finished


In [None]:
evaluate_model(best_rf_model)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.8s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:   12.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:    2.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:    2.8s finished


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8225    0.8731    0.8470     74160
     suicide     0.8652    0.8121    0.8378     74367

    accuracy                         0.8426    148527
   macro avg     0.8438    0.8426    0.8424    148527
weighted avg     0.8439    0.8426    0.8424    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8199    0.8651    0.8419     18619
     suicide     0.8563    0.8088    0.8319     18513

    accuracy                         0.8370     37132
   macro avg     0.8381    0.8370    0.8369     37132
weighted avg     0.8380    0.8370    0.8369     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8212    0.8698    0.8448     23258
     suicide     0.8610    0.8098    0.8346     23157

    accuracy                         0.8399

In [None]:
joblib.dump(model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_rf_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_rf_model.pkl']

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
optuna.visualization.plot_slice(study, params=['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'])

In [None]:
optuna.visualization.plot_param_importances(study)