## Environment Configuration

In [1]:
# Install (or upgrade, via -U) the third-party packages used by this notebook.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases
# than the ones that produced the saved outputs below.
!pip install -U emoji
!pip install -U tqdm
!pip install -U transformers
!pip install -U kaleido
!pip install -U dash
!pip install -U optuna
!pip install -U scikit-learn

Collecting emoji
  Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.0
Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall

In [2]:
# General Dataloaders
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# NLP
import re
import string
import emoji
from collections import Counter
from transformers import pipeline
# Data Visualization
import plotly.express as px
# Machine Learning
import torch
# Data Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Models
from sklearn.ensemble import GradientBoostingClassifier
import joblib
# Hyperparameter Tuning
import optuna
from optuna.storages import JournalStorage, JournalFileStorage
import matplotlib.pyplot as plt
# Logistics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from timeit import default_timer as timer

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
dv = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(dv)

cpu


## Augmentation Functions

In [None]:
def clean_text(x: str) -> str:
  """
  Strip emojis and ASCII punctuation from a message.

  Goals:
    1) Remove punctuation to make the TF-IDF dictionary more accurate
    2) Remove emojis, they are unnecessary here for training purposes.
       (Emoji data is used in another column.)

  Emojis are removed BEFORE punctuation. Keycap emojis are built from a
  punctuation character ('#' or '*') followed by U+FE0F and U+20E3; stripping
  the punctuation first would break the sequence and leave the trailing code
  points behind as unrecognised residue.

  Args:
    x: Raw message text.

  Returns:
    The text without emojis and without any character listed in
    string.punctuation.
  """
  without_emojis = emoji.replace_emoji(x, replace='')
  return without_emojis.translate(str.maketrans('', '', string.punctuation))

In [None]:
def num_emojis(x: str) -> int:
  """Return the emoji count that emoji.emoji_count reports for the text x."""
  emoji_total = emoji.emoji_count(x)
  return emoji_total

In [None]:
def average_words_per_sentence(message: str) -> float:
  """
  Return the average number of words per sentence in a message.

  Unfortunately, due to the limitation of a non-standard ASCII table for emojis, getting the
  average words per sentence is not 100% accurate in certain cases: every character in the
  range U+263A..U+1F645 is treated as a sentence boundary.

  Please modify the regular expression for, 'sentences' to match your need if you re-use this code. Thanks :)

  Args:
    message: Raw message text.

  Returns:
    Mean word count over the non-empty sentences, rounded to 2 decimals;
    0.0 when the message contains no words at all.
  """
  # Sentences end at '.', '!', '?', a newline, or a character in the emoji-ish range.
  # Newlines are left in the text so that they act as boundaries instead of
  # gluing the last word of one line to the first word of the next.
  sentences = re.split('[.!?\u263a-\U0001f645\n]+', message)
  word_counts = []
  for entry in sentences:
    # Split on runs of separators and drop empty tokens, so ", " or a trailing
    # delimiter does not produce phantom words.
    words = [word for word in re.split(r'[,;:\s]+', entry) if word]
    if words:
      word_counts.append(len(words))
  if not word_counts:
    # Empty / whitespace-only / delimiter-only message: avoid dividing by zero.
    return 0.0
  return round(sum(word_counts) / len(word_counts), 2)

In [None]:
# Build a `transformers` sentiment-analysis pipeline that uses the
# cardiffnlp/twitter-roberta-base-sentiment-latest checkpoint for both the
# model and the tokenizer, and runs on the device selected in `dv`.
# Inputs are padded and truncated, with max_length set to 512.
# NOTE(review): batch_size = 25000 is unusually large for a transformer
# model — confirm it fits in memory on the target device.
sentiment = pipeline(
    task='sentiment-analysis',
    model= "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"},
    tokenizer = "cardiffnlp/twitter-roberta-base-sentiment-latest", #@param {type:"string"}
    max_length=512,
    batch_size = 25000,
    truncation=True,
    padding=True,
    device=dv
)

## Model Training

In [4]:
# Load the pre-augmented dataset, drop the raw 'text' column (the cleaned text
# and the derived features are kept), and remove rows with missing values.
# dropna returns a new DataFrame, so its result has to be kept; calling it
# without assigning the result leaves `messages` unchanged.
messages = (
    pd.read_parquet('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Augments/UPDATED_Suicide_Detection.csv.parquet.gzip')
    .drop(['text'], axis=1)
    .dropna(axis=0)
)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 6 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  232074 non-null  int64  
 1   clean_text                  232074 non-null  object 
 2   average_words_per_sentence  232074 non-null  float64
 3   sentiment                   232074 non-null  object 
 4   num_emojis                  232074 non-null  int64  
 5   class                       232074 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 10.6+ MB


In [5]:
# Separate the model inputs from the target label.
X = messages[['clean_text', 'average_words_per_sentence', 'sentiment', 'num_emojis']]
y = messages['class']

# First split: keep 80% of the rows for training, hold out the rest as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=5)

# Second split: keep 80% of the remaining rows for training and use the rest
# for validation (overall roughly 64% train / 16% validation / 20% test).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=21)

In [6]:
# Feature columns grouped by the kind of preprocessing they need.
tokenize = ["clean_text"]
numerical = ["num_emojis", "average_words_per_sentence"]
categorical = ["sentiment"]

In [7]:
# Text branch: a single TF-IDF vectorizer step named 'text'.
tokenize_pipeline = Pipeline(steps=[('text', TfidfVectorizer())])

In [8]:
# Numeric branch: mean-impute missing values, then standardize.
numerical_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

In [9]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [10]:
# Route each column group through its own preprocessing branch.
# The text branch selects the column with the plain string 'clean_text'
# rather than the one-element `tokenize` list.
column_transformer = ColumnTransformer(
    transformers=[
        ('num_pipeline', numerical_pipeline, numerical),
        ('cat_pipeline', categorical_pipeline, categorical),
        ('tfidf', tokenize_pipeline, 'clean_text'),
    ],
    remainder='passthrough',
    n_jobs=-1,
    verbose=1,
)

In [11]:
# Baseline classifier with library-default hyperparameters; verbose=1 prints
# per-iteration training progress.
clf = GradientBoostingClassifier(verbose=1)

# Full model: preprocessing followed by the classifier.
model = make_pipeline(column_transformer, clf)

In [12]:
# Fit the preprocessing + baseline classifier pipeline on the training split only.
model.fit(X_train, y_train)

  pid = os.fork()


      Iter       Train Loss   Remaining Time 
         1           1.3053           15.37m
         2           1.2381           13.93m
         3           1.1807           14.00m
         4           1.1296           13.86m
         5           1.0870           13.37m
         6           1.0505           13.35m
         7           1.0168           13.15m
         8           0.9887           12.98m
         9           0.9623           13.25m
        10           0.9398           13.04m
        20           0.7910           11.60m
        30           0.7157           10.15m
        40           0.6695            8.78m
        50           0.6383            7.26m
        60           0.6155            5.79m
        70           0.5972            4.34m
        80           0.5825            2.90m
        90           0.5699            1.45m
       100           0.5587            0.00s


In [13]:
# Persist the fitted baseline pipeline so later sections can reload it.
joblib.dump(model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/gbc_base_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/gbc_base_model.pkl']

## Model Evaluation

In [14]:
# Reload the saved baseline pipeline; this rebinds `model` to the object read from disk.
model = joblib.load('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/gbc_base_model.pkl')

In [15]:
def evaluate_model(model):
  """
  Print a classification report for the training, validation and test splits.

  The splits are read from the notebook-level variables X_train/y_train,
  X_val/y_val and X_test/y_test.

  Args:
    model: Fitted estimator exposing predict() and classes_.
  """
  class_labels = model.classes_
  splits = [
      ('Training', X_train, y_train),
      ('Validation', X_val, y_val),
      ('Evaluation', X_test, y_test),
  ]
  # Run every prediction before printing anything, so progress output emitted
  # while predicting is not interleaved with the reports.
  predictions = [
      (split_name, y_true, model.predict(X_split))
      for split_name, X_split, y_true in splits
  ]
  for split_name, y_true, y_pred in predictions:
    # Passing labels explicitly keeps the rows aligned with target_names even
    # if a class happens to be absent from a split.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=class_labels,
        digits=4,
    )
    print(f'===== ({split_name}) Classification Report =====\n{report}')


In [16]:
# Report train / validation / test metrics for the reloaded baseline pipeline.
evaluate_model(model)

  pid = os.fork()


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8728    0.9203    0.8960     74160
     suicide     0.9160    0.8663    0.8905     74367

    accuracy                         0.8933    148527
   macro avg     0.8944    0.8933    0.8932    148527
weighted avg     0.8945    0.8933    0.8932    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8745    0.9163    0.8949     18619
     suicide     0.9116    0.8677    0.8891     18513

    accuracy                         0.8921     37132
   macro avg     0.8930    0.8920    0.8920     37132
weighted avg     0.8930    0.8921    0.8920     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8752    0.9174    0.8958     23258
     suicide     0.9128    0.8686    0.8902     23157

    accuracy                         0.8931

## Model Finetuning

In [17]:
# Step 1: Get the Column Transformer
# It is the first step of the loaded pipeline; this rebinds `column_transformer`
# to the instance stored inside `model`.
column_transformer = model.steps[0][1]
print(column_transformer)

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('num_pipeline',
                                 Pipeline(steps=[('impute', SimpleImputer()),
                                                 ('scale', StandardScaler())]),
                                 ['num_emojis', 'average_words_per_sentence']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['sentiment']),
                                ('tfidf',
                                 Pipeline(steps=[('text', TfidfVectorizer())]),
                                 'clean_

In [18]:
# Step 2: Get the Gradient Boosting Classifier (second step of the pipeline)
# NOTE(review): despite its name, gbc_base_params holds the estimator itself;
# the hyperparameter dict is only what get_params() returns below.
gbc_base_params = model.steps[1][1]
print(gbc_base_params.get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 1, 'warm_start': False}


In [19]:
# Learn the label -> integer mapping from the training labels, then encode them.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
print(label_encoder.classes_)

['non-suicide' 'suicide']


In [20]:
def objective(trial, random_state=42):
  """
  Optuna objective: mean 5-fold cross-validated F1 of a gradient boosting pipeline.

  Samples the classifier hyperparameters from the trial, wraps the classifier
  in a pipeline with the notebook-level `column_transformer`, and scores it on
  X_train / y_train_encoded.

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    random_state: Seed passed to the classifier. Because subsample can be
      sampled below 1.0, an unseeded classifier would give a different score
      for the same hyperparameters on every run.

  Returns:
    The mean F1 score over the 5 cross-validation folds.
  """
  sampled_params = {
      'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
      'n_estimators': trial.suggest_int('n_estimators', 100, 300),
      'max_depth': trial.suggest_int('max_depth', 3, 5),
      'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
      'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10),
      'subsample': trial.suggest_float('subsample', 0.5, 1.0),
  }

  gbc = GradientBoostingClassifier(
      **sampled_params,
      random_state = random_state,
      verbose = 1
      )

  candidate = make_pipeline(
      column_transformer,
      gbc
  )

  score = cross_val_score(candidate, X_train, y_train_encoded, cv=5, scoring = 'f1')
  return score.mean()

In [21]:
# SQLite database file used as the Optuna storage for the tuning study.
db = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Checkpoints/gbc_tuning.db'

In [22]:
# Create the 'gbc_tuning' study in the SQLite file at `db`, or reopen it if it
# already exists (load_if_exists = True).
# direction = 'maximize' because the objective returns an F1 score.
# NOTE(review): RandomSampler() is created without a seed, so the sampled
# hyperparameters will differ between runs — confirm this is intended.
study = optuna.create_study(
    study_name = 'gbc_tuning',
    direction = 'maximize',
    sampler = optuna.samplers.RandomSampler(),
    storage = f'sqlite:///{db}',
    load_if_exists = True
    )

[I 2024-04-19 16:12:26,591] A new study created in RDB with name: gbc_tuning


In [23]:
# Collect every stored trial that ended in the FAIL state.
failed_trials = list(
    filter(lambda t: t.state == optuna.trial.TrialState.FAIL, study.trials)
)

# Queue the parameters of each failed trial on the study again.
for trial in failed_trials:
  print(f'Re-running failed trial with information: {trial}')
  study.enqueue_trial(trial.params)

In [24]:
# Show the trials recorded in the study so far as a DataFrame.
print(study.trials_dataframe())

Empty DataFrame
Columns: []
Index: []


In [None]:
# Run the hyperparameter search: 100 trials of `objective`.
study.optimize(objective, n_trials=100)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3444           0.0420           34.50m
         2           1.3050           0.0382           35.26m
         3           1.2692           0.0358           34.56m
         4           1.2368           0.0337           34.34m
         5           1.2061           0.0306           34.45m
         6           1.1771           0.0279           33.72m
         7           1.1521           0.0281           33.75m
         8           1.1255           0.0227           33.50m
         9           1.1022           0.0228           34.29m
        10           1.0811           0.0228           33.97m
        20           0.9199           0.0113           31.90m
        30           0.8242           0.0052           30.03m
        40           0.7601           0.0122           28.38m
        50           0.7087           0.0047           27.00m
        60           0.6704           0.0070           25.59m
       

In [None]:
# Hyperparameters of the best trial found by the study.
params = study.best_params

In [None]:
# Build the tuned pipeline: the shared preprocessing followed by a classifier
# configured with the best hyperparameters found by the study.
best_gbc_model = make_pipeline(
    column_transformer,
    GradientBoostingClassifier(**params),
)

In [None]:
# Fit the tuned pipeline on the training split (original string labels).
best_gbc_model.fit(X_train, y_train)

  pid = os.fork()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:   40.1s finished


In [None]:
# Report train / validation / test metrics for the tuned pipeline.
evaluate_model(best_gbc_model)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.8s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:   12.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:    2.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 110 out of 110 | elapsed:    2.8s finished


===== (Training) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8225    0.8731    0.8470     74160
     suicide     0.8652    0.8121    0.8378     74367

    accuracy                         0.8426    148527
   macro avg     0.8438    0.8426    0.8424    148527
weighted avg     0.8439    0.8426    0.8424    148527

===== (Validation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8199    0.8651    0.8419     18619
     suicide     0.8563    0.8088    0.8319     18513

    accuracy                         0.8370     37132
   macro avg     0.8381    0.8370    0.8369     37132
weighted avg     0.8380    0.8370    0.8369     37132

===== (Evaluation) Classification Report =====
              precision    recall  f1-score   support

 non-suicide     0.8212    0.8698    0.8448     23258
     suicide     0.8610    0.8098    0.8346     23157

    accuracy                         0.8399

In [None]:
# Persist the tuned pipeline. The object saved must be best_gbc_model; dumping
# `model` would write the baseline pipeline under the tuned model's filename.
joblib.dump(best_gbc_model, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_gbc_model.pkl')

['/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Base/best_rf_model.pkl']

In [None]:
# Plot the optimization history of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate plot of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot restricted to four of the tuned hyperparameters.
# NOTE(review): max_depth and subsample are also tuned in `objective` but are
# not listed here — confirm the omission is intentional.
optuna.visualization.plot_slice(study, params=['learning_rate', 'n_estimators', 'min_samples_split', 'min_samples_leaf'])

In [None]:
# Plot the hyperparameter importances computed for the study.
optuna.visualization.plot_param_importances(study)