In [1]:
!pip install iterative-stratification
!apt-get install libhunspell-dev
!apt-get install hunspell-en-us
!pip install urlextract
!pip install emot
!pip install hunspell
!pip install pyspellchecker
!pip install gputil

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9



The following additional packages will be installed:
  dictionaries-common hunspell-en-us libhunspell-1.7-0 libtext-iconv-perl
Suggested packages:
  ispell | aspell | hunspell wordlist hunspell openoffice.org-hunspell
  | openoffice.org-core
The following NEW packages will be installed:
  dictionaries-common hunspell-en-us libhunspell-1.7-0 libhunspell-dev
  libtext-iconv-perl
0 upgraded, 5 newly installed, 0 to remove and 38 not upgraded.
Need to get 896 kB of archives.
After this operation, 3,130 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libtext-iconv-perl amd64 1.7-7build3 [14.3 kB]
Get:2 http://archive.ubuntu.com/u

In [2]:
import warnings
import gc
import hunspell
import re
import spacy
import joblib
import torch
import psutil
import GPUtil
import optuna
import numpy as np
import pandas as pd
from functools import lru_cache
from datetime import datetime
from bs4 import BeautifulSoup
from transformers import logging as hf_logging
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import ClassifierMixin, clone
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, make_scorer, precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from emot import emot
from emot.emo_unicode import EMOTICONS_EMO
from collections import Counter
from urlextract import URLExtract
from spacy.language import Language
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif

2025-10-06 01:24:17.863286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759713858.043955      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759713858.097079      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def log(msg):
    """Print ``msg`` prefixed with the current local time formatted as HH:MM:SS."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f'{timestamp}: {msg}')

In [4]:
# One or more "word<punctuation>word" groups (second word at least 3 characters,
# optional trailing punctuation) that are not immediately followed by whitespace.
concatenated_pattern = re.compile(r"(\w+[^\s\w]+\w{3,}[^\s\w]*)+(?!\s)")
# A run of characters that are neither word characters nor whitespace.
separators_pattern = re.compile(r'[^\w\s]+')


def _space_after_separators(match):
    """Return the matched span with one space appended after every separator run."""
    return separators_pattern.sub(r'\g<0> ', match.group(0))


def fix_concatenated_words(X):
    """Insert a space after punctuation that glues words together.

    For every span matched by ``concatenated_pattern`` each run of separator
    characters gets exactly one trailing space ("hello.world" -> "hello. world",
    "a.bcd..efg" -> "a. bcd.. efg"). Only the matched spans are rewritten.

    The previous implementation substituted each distinct separator string one
    after another (in ``set`` iteration order), so a span containing both "."
    and ".." was processed twice and produced order-dependent output such as
    "a. bcd. . efg".

    Parameters
    ----------
    X : iterable of str
        Texts to process.

    Returns
    -------
    list of str
        The processed texts, stripped of leading/trailing whitespace.
    """
    return [
        concatenated_pattern.sub(_space_after_separators, text).strip()
        for text in X
    ]

In [5]:
class SbertTransformer(BaseEstimator, TransformerMixin):
    """Encode each text as a single SBERT sentence embedding.

    Texts whose token count exceeds ``chunk_token_size`` are split into
    chunks; every chunk is embedded and the chunk embeddings are combined
    into one vector, weighted by chunk length in characters.

    Parameters
    ----------
    model_name : str
        Name passed to ``SentenceTransformer``.
    """

    # Maximum number of characters of the longest chunk echoed to the log.
    _LOG_PREVIEW_CHARS = 200

    def __init__(self, model_name='sentence-transformers/all-mpnet-base-v2'):
        self.model_name = model_name
        # NOTE(review): the device is hard-coded, so constructing this
        # transformer presumably requires a CUDA device — confirm.
        self.model = SentenceTransformer(model_name, device="cuda:0")
        self.tokenizer = self.model.tokenizer
        # Target chunk size: 50 tokens below the model's max sequence length.
        self.chunk_token_size = self.model.max_seq_length - 50
        log(f'Max seq length: {self.chunk_token_size}')
        self.overlap = int(self.chunk_token_size * 0.2)
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_token_size,
            chunk_overlap=self.overlap,
            length_function=self._token_length,
            # Punctuation without spaces. Could be a sign of censorship
            separators=["\n\n", "\n", ",", " ", "!", ".", "?", "'"]
        )

    def _token_length(self, text):
        """Return the number of tokens in ``text``, special tokens included."""
        return len(self.tokenizer.encode(text, add_special_tokens=True))

    def _chunk_text_by_tokens(self, text):
        """Split ``text`` into chunks when it exceeds ``chunk_token_size`` tokens.

        Returns ``[text]`` unchanged when the text already fits.
        """
        # Token indices sequence length is longer...
        # Disable this message here because we're not going to run this sequence through the model
        logging_level = hf_logging.get_verbosity()
        hf_logging.set_verbosity_error()
        try:
            if self._token_length(text) <= self.chunk_token_size:
                return [text]
            chunks = []
            # Split on sentence-ending punctuation first, then let the
            # recursive splitter break up whatever is still too long.
            for sent in re.split(r'(?<=[.!?])\s+', text):
                chunks.extend(self.splitter.split_text(sent))
            return chunks
        finally:
            # Runs on every exit path. Previously the early return for short
            # texts skipped this, leaving the verbosity at "error" afterwards.
            self.tokenizer.deprecation_warnings.pop(
                "sequence-length-is-longer-than-the-specified-maximum",
                None
            )
            # And enable logging again because we want to know if there is a long chunk
            hf_logging.set_verbosity(logging_level)

    def _agg_embeddings(self, chunks, embeddings):
        """Combine chunk embeddings into one vector, weighted by chunk character length."""
        lengths = np.array([len(c) for c in chunks], dtype=float)
        total = lengths.sum()
        if total == 0:
            # Every chunk is an empty string: length weights would be 0/0 and
            # turn the whole embedding into NaN, so use a plain average.
            return embeddings.mean(axis=0)
        weights = lengths / total
        return (embeddings * weights[:, None]).sum(axis=0)

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Parameters
        ----------
        X : pandas.Series or iterable of str
            Texts to embed.

        Returns
        -------
        numpy.ndarray
            One row per input text; shape ``(0, embedding_dim)`` for empty input.
        """
        log('Start SBERT transform')
        texts = X.to_list() if hasattr(X, 'to_list') else list(X)
        emb_dim = self.model.get_sentence_embedding_dimension()
        X_encoded = []
        longest_chunk = 0
        chunk_with_problem = None
        with torch.no_grad():
            for x in texts:
                chunks = self._chunk_text_by_tokens(x)
                # Track the longest chunk (in tokens) seen across all texts.
                for c in chunks:
                    c_len = self._token_length(c)
                    if c_len > longest_chunk:
                        longest_chunk, chunk_with_problem = c_len, c
                if not chunks:
                    X_encoded.append(np.zeros(emb_dim))
                    continue

                chunk_embeddings = self.model.encode(
                    chunks,
                    device="cuda",
                    batch_size=512,
                    convert_to_numpy=True,
                    show_progress_bar=False
                )
                weighted_embeddings = self._agg_embeddings(chunks, chunk_embeddings)
                X_encoded.append(weighted_embeddings)
        # Log the token count plus a short preview only: the full chunk can be
        # thousands of characters of raw user text.
        preview = chunk_with_problem
        if preview is not None:
            preview = preview[:self._LOG_PREVIEW_CHARS]
        log(f'Chunk with potential problem ({longest_chunk} tokens): {preview}')
        # Move the model off the GPU, clear the CUDA cache, then move it back.
        self.model.cpu()
        gc.collect()
        torch.cuda.empty_cache()
        self.model.to("cuda:0")
        log('Finish SBERT transform')
        if not X_encoded:
            # np.vstack raises on an empty list.
            return np.empty((0, emb_dim))
        return np.vstack(X_encoded)

In [6]:
class CachingSpellChecker:
    """Hunspell-backed word corrector whose per-word lookups are memoised."""

    def __init__(self):
        dictionary_path = '/usr/share/hunspell/en_US.dic'
        affix_path = '/usr/share/hunspell/en_US.aff'
        self.hunspell = hunspell.HunSpell(dictionary_path, affix_path)

    def correct_words(self, words):
        """Return ``words`` with each entry replaced by its correction.

        Every distinct word is looked up once; the output keeps the order and
        length of the input.
        """
        lookup = dict(self._correct_word(candidate) for candidate in set(words))
        return [lookup.get(item, item) for item in words]

    @lru_cache(maxsize=10000)
    def _correct_word(self, word):
        """Return ``(word, correction)``.

        The correction is the word itself when Hunspell accepts it or offers
        no suggestion, otherwise the first suggestion.
        """
        if self.hunspell.spell(word):
            return word, word
        suggestions = self.hunspell.suggest(word)
        if len(suggestions) > 0:
            return word, suggestions[0]
        return word, word

In [7]:
_spell_checker = CachingSpellChecker()
class ExtraFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted numeric features computed from spaCy ``Doc`` objects.

    ``transform`` returns one row per document with the columns listed in
    ``feature_names_``: text length, per-sentence punctuation/upper-case
    rates, sentence statistics, URL count, censorship and "compression"
    indicators, and one counter per emoticon meaning.
    """

    # Emoticons typed with a comma instead of a semicolon, e.g. ",-)" or ",(".
    _wrong_emot_pattern = re.compile(r',-?[)(}{\[\]]')
    # Characters that are not word characters, whitespace, apostrophe, colon
    # or hyphen. The hyphen is placed last so it is a literal: the previous
    # class "[^\w\s'-:]" contained the range '..: (0x27-0x3A), which also
    # excluded "*", "(", ")", "+", ",", "." and "/" and therefore never
    # flagged words censored with asterisks.
    # NOTE: features cached with the old pattern must be regenerated.
    _non_word_pattern = re.compile(r"[^\w\s':-]+")

    def __init__(self):
        self.repeat_pattern = re.compile(r'(\w)\1{2,}', re.IGNORECASE)
        self.emot_obj = emot()
        self.emot_meanings = set(EMOTICONS_EMO.values())
        self.url_extractor = URLExtract()
        self.feature_names_ = [
            "length", "upcase_rate", "exc_mark_rate", "q_mark_rate", "dots_rate",
            "new_lines_rate", "median_sentence_len", "sentences_count", "urls_counter",
            "censured", "compression"
        ] + sorted(self.emot_meanings)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (``input_features`` is ignored)."""
        return self.feature_names_

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def comma_fix(self, text):
        """Replace the comma in comma-typed emoticons (",-)") with a semicolon."""
        result = text
        for emotion in self._wrong_emot_pattern.findall(text):
            result = result.replace(emotion, emotion.replace(',', ';'))
        return result

    def replace_urls(self, text):
        """Replace every detected URL with ``[l]``; return ``(text, url_count)``."""
        urls = self.url_extractor.find_urls(text)
        result = text
        for url in urls:
            result = result.replace(url, '[l]')
        return result, len(urls)

    def add_base_stat(self, text, sentences_count):
        """Return text length and per-sentence counts of selected characters.

        ``sentences_count`` and the text length are clamped to at least 1.
        """
        sentences_count = max(1, sentences_count)
        upcase = sum(1 for c in text if c.isupper())
        return {
            'length': max(1, len(text)),
            'upcase_rate': upcase / sentences_count,
            'exc_mark_rate': text.count('!') / sentences_count,
            'q_mark_rate': text.count('?') / sentences_count,
            'dots_rate': text.count('.') / sentences_count,
            'new_lines_rate': text.count('\n') / sentences_count
        }

    def add_sentences_stat(self, doc):
        """Return the median length and the count of non-empty sentences in ``doc``."""
        sentences_len = np.array([len(sent.text.strip()) for sent in doc.sents])
        sentences_len = sentences_len[sentences_len > 0]
        # np.median of an empty array emits a RuntimeWarning and returns NaN;
        # report 0 directly instead.
        median_len = float(np.median(sentences_len)) if sentences_len.size else 0.0
        return {
            'median_sentence_len': median_len,
            'sentences_count': len(sentences_len)
        }

    def add_emoticons_stat(self, text):
        """Return a counter with one entry per known emoticon meaning."""
        emot_counts = Counter({item: 0 for item in self.emot_meanings})
        detected = self.emot_obj.emoticons(self.comma_fix(text))
        if detected:
            emot_counts.update(detected['mean'])
        return emot_counts

    def add_censorship_stat(self, doc):
        """Return censorship and character-compression indicators for ``doc``.

        ``censured`` is True when any non-punctuation token contains a symbol
        matched by ``_non_word_pattern``. ``compression`` sums, over tokens,
        how many characters the token loses after collapsing repeated letters
        and spell-correcting it (never negative per token).
        """
        tokens = [token.lemma_.lower() for token in doc if not token.is_punct]
        collapsed = [self.repeat_pattern.sub(r'\1\1', token) for token in tokens]
        corrected_tokens = _spell_checker.correct_words(collapsed)
        censured = False
        compression = 0
        for original, corrected in zip(tokens, corrected_tokens):
            if self._non_word_pattern.search(original):
                censured = True
            compression += max(0, len(original) - len(corrected))
        return {
            'censured': censured,
            'compression': compression
        }

    def transform(self, X):
        """Compute the feature table.

        Parameters
        ----------
        X : sized iterable of spaCy ``Doc``
            Documents to describe; ``str(doc)`` is used as the raw text.

        Returns
        -------
        pandas.DataFrame
            One float row per document with columns ``feature_names_``
            (zero rows for empty input).
        """
        feats = []
        log('Extra features extraction start')
        total_messages = len(X)
        milestones = [0.25, 0.5, 0.75]
        real_milestones = [int(total_messages * m) for m in milestones]
        for i, doc in enumerate(X):
            if i in real_milestones:
                j = real_milestones.index(i)
                log(f'Extra features finalized {milestones[j]} of total records')
            text, urls_counter = self.replace_urls(str(doc))
            sentence_feats = self.add_sentences_stat(doc)
            feats_row = {
                **self.add_base_stat(text, sentence_feats['sentences_count']),
                **sentence_feats,
                "urls_counter": urls_counter,
                **self.add_emoticons_stat(text),
                **self.add_censorship_stat(doc)
            }
            feats.append([feats_row.get(name, 0) for name in self.feature_names_])
        gc.collect()
        log('Extra features extraction finish')
        # The reshape keeps the column count correct even when `feats` is
        # empty, where a bare np.array([]) would be one-dimensional.
        values = np.array(feats, dtype=float).reshape(-1, len(self.feature_names_))
        return pd.DataFrame(values, columns=self.feature_names_)

In [8]:
class SpacyTokenizer(BaseEstimator, TransformerMixin):
    """Transformer that turns raw texts into spaCy ``Doc`` objects.

    A single spaCy pipeline is loaded lazily and shared by all instances
    through the class attribute ``_nlp_model``.
    """
    # Shared pipeline instance; populated on first use by _get_nlp_model().
    _nlp_model = None
    
    def __init__(self):
        self.nlp = self._get_nlp_model()

    @staticmethod
    @Language.component("newline_sentencizer")
    def newline_sentencizer(doc):
        """Mark every token (except the first) whose text contains a newline as a sentence start."""
        for token in doc:
            if '\n' in token.text and token.i > 0:
                doc[token.i].is_sent_start = True
        return doc
        
    @classmethod
    def _get_nlp_model(cls):
        """Load ``en_core_web_sm`` once (without ner/textcat) and cache it on the class.

        The ``newline_sentencizer`` component is inserted before the parser.
        """
        if cls._nlp_model is None:
            cls._nlp_model = spacy.load('en_core_web_sm', disable=["ner", "textcat"])
            cls._nlp_model.add_pipe('newline_sentencizer', before="parser")
        return cls._nlp_model

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self
        
    def transform(self, X):
        """Run the spaCy pipeline over ``X`` and return the resulting docs as a list."""
        log('Start spaCy preprocessing...')
        # NOTE(review): the model is loaded in __init__, before GPU mode is
        # requested here — confirm the pipeline really executes on the GPU.
        spacy.require_gpu()
        with torch.no_grad():
            docs = list(self.nlp.pipe(X, batch_size=5000, n_process=1))
        # Switch to CPU, collect garbage and clear the CUDA cache, then switch
        # back to GPU mode; presumably done to release GPU memory — confirm.
        spacy.require_cpu()
        gc.collect()
        torch.cuda.empty_cache()
        spacy.require_gpu()
        log('SpaCy preprocessing finished')
        return docs

In [9]:
class ThresholdOptimizer(BaseEstimator, ClassifierMixin):
    """Wrap a probabilistic classifier and choose the cutoff that maximises F1.

    ``fit`` never trains ``base_estimator``; it only calls its
    ``predict_proba`` and stores the selected cutoff in ``threshold_``.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator

    def fit(self, X, y):
        """Select ``threshold_`` from the precision-recall curve of ``(X, y)``."""
        positive_scores = self.base_estimator.predict_proba(X)[:, 1]
        precision, recall, candidates = precision_recall_curve(y, positive_scores)
        # The small constant keeps the division defined when both terms are 0.
        numerator = 2 * precision * recall
        denominator = precision + recall + 1e-8
        best_position = np.argmax(numerator / denominator)
        self.threshold_ = candidates[best_position]
        return self

    def predict(self, X):
        """Return 1 where the second-column probability reaches ``threshold_``, else 0."""
        positive_scores = self.base_estimator.predict_proba(X)[:, 1]
        is_positive = positive_scores >= self.threshold_
        return is_positive.astype(int)

    def predict_proba(self, X):
        """Delegate to ``base_estimator.predict_proba`` unchanged."""
        return self.base_estimator.predict_proba(X)

In [10]:
def print_consensus(consensus):
    """Print ``(feature, rating)`` pairs as a two-column table with a header."""
    header_lines = (
        "\n=== Top features ===",
        "Feature" + " " * 25 + "Rating",
        "-" * 40,
    )
    for line in header_lines:
        print(line)
    for feature, rating in consensus:
        print(f"{feature:<30} {rating:>8.4f}")

In [11]:
def correlation_selection(X, y, feature_names, top_k=10):
    """Rank features by absolute Pearson correlation with the target.

    The target is aligned to ``X`` by position, so ``y`` may be a NumPy
    array, a list, or a Series with any index or name. The previous version
    concatenated ``X`` and ``y`` and read a column literally named
    ``'class'``, which raised for array targets and misaligned rows when the
    indexes differed.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table.
    y : array-like
        Numeric target, one value per row of ``X``.
    feature_names : list of str
        Columns of ``X`` to evaluate.
    top_k : int
        Maximum number of features to return.

    Returns
    -------
    list of (str, float)
        ``(feature, signed correlation)`` pairs ordered by decreasing
        absolute correlation.
    """
    target = pd.Series(np.asarray(y).ravel(), index=X.index)
    label_correlations = X[feature_names].corrwith(target, method='pearson')
    ranked = label_correlations.abs().sort_values(ascending=False).head(top_k)
    return [(feature, label_correlations[feature]) for feature in ranked.index]

In [12]:
def mutual_information_selection(X, y, feature_names, top_k=10):
    """Return the ``top_k`` features with the highest mutual information with ``y``.

    The result is a list of ``(feature_name, score)`` pairs in decreasing
    score order; the estimator is seeded with ``random_state=42``.
    """
    mi_scores = mutual_info_classif(X, y, random_state=42)
    descending = np.argsort(mi_scores)[::-1]
    ranking = []
    for position in descending[:top_k]:
        ranking.append((feature_names[position], mi_scores[position]))
    return ranking

In [13]:
def random_forest_importance(X, y, feature_names, top_k=10):
    """Return the ``top_k`` features by random-forest impurity importance.

    A 100-tree forest (``max_depth=10``, ``random_state=42``) is fitted on
    ``(X, y)``; the result is a list of ``(feature_name, importance)`` pairs
    in decreasing importance order.
    """
    forest = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=10,
        n_jobs=-1
    )
    forest.fit(X, y)
    importances = forest.feature_importances_
    descending = np.argsort(importances)[::-1]
    ranking = []
    for position in descending[:top_k]:
        ranking.append((feature_names[position], importances[position]))
    return ranking

In [14]:
def k_best_selection(X, y, feature_names, top_k=100):
    """Rank features by their ANOVA F-score against ``y``.

    Fixes relative to the previous version: ``top_k`` is honoured (the
    selector used a hard-coded ``k=10``, which also exceeds the column count
    of narrow inputs), results are returned in decreasing score order (the
    consensus step derives votes from list position), and names come from
    ``feature_names`` like the other selection helpers.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
    feature_names : sequence of str
        Names aligned positionally with the columns of ``X``.
    top_k : int
        Maximum number of features to return.

    Returns
    -------
    list of (str, float)
        ``(feature, F-score)`` pairs, best first. Features whose score is
        undefined (NaN, e.g. constant columns) are omitted.
    """
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X, y)
    scores = np.asarray(selector.scores_, dtype=float)
    # Push NaN scores to the end of the descending order; a plain argsort
    # followed by a reversal would put them first.
    sortable = np.where(np.isnan(scores), -np.inf, scores)
    order = np.argsort(sortable)[::-1][:top_k]
    return [
        (feature_names[i], scores[i])
        for i in order
        if not np.isnan(scores[i])
    ]

In [15]:
def comprehensive_feature_analysis(X, y, feature_names, top_k=100):
    """Run every feature-ranking method and collect the results by method name.

    A method that raises is reported with ``print`` and left out of the
    returned dictionary; the remaining methods still run.
    """
    registry = (
        ('Mutual Information', mutual_information_selection),
        ('Random Forest', random_forest_importance),
        ('Correlation', correlation_selection),
        ('K Best', k_best_selection),
    )
    collected = {}
    for method_name, ranker in registry:
        try:
            ranking = ranker(X, y, feature_names, top_k)
        except Exception as e:
            print(f"Error in {method_name}: {e}")
        else:
            collected[method_name] = ranking
    return collected

In [16]:
def compare_methods_consensus(all_results, top_k=5):
    """Combine per-method rankings into one vote-based ranking.

    Within each method the feature at zero-based position ``rank`` (only the
    first ``top_k`` entries count) earns ``(top_k - rank) / top_k`` votes.
    Returns ``(feature, votes)`` pairs sorted by decreasing votes.
    """
    votes = {}
    for ranking in all_results.values():
        for position, (feature, _score) in enumerate(ranking[:top_k]):
            votes[feature] = votes.get(feature, 0) + (top_k - position) / top_k
    return sorted(votes.items(), key=lambda pair: pair[1], reverse=True)

In [17]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns that the consensus of several ranking methods rates highest."""

    def __init__(self):
        self._important_features = None

    def get_feature_names_out(self, input_features=None):
        """Return a copy of the selected column names (``input_features`` is ignored)."""
        return self._important_features.copy()

    def fit(self, X, y=None):
        """Select columns of ``X`` using the supplied target ``y``.

        The previous version ignored ``y`` and read the notebook global
        ``y_train``, so the selector silently used the wrong labels whenever
        it was fitted on any other data.

        Parameters
        ----------
        X : pandas.DataFrame
            Candidate features.
        y : array-like
            Target aligned with the rows of ``X``. Required.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError(
                "FeatureSelector is supervised: pass y to fit() "
                "(e.g. preprocessor.fit(X_train, y_train))."
            )
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
            results = comprehensive_feature_analysis(X, y, list(X.columns))
            consensus = compare_methods_consensus(results, top_k=15)
            print_consensus(consensus)
        # Keep the consensus order. list(set(...)) made the column order depend
        # on string hashing, so it could differ between kernel sessions.
        self._important_features = list(
            dict.fromkeys(feature for feature, _rating in consensus)
        )
        return self

    def transform(self, X):
        """Return ``X`` restricted to the selected columns."""
        return X[self._important_features]

In [18]:
# Snapshots taken once when this cell runs; kept for backward compatibility.
ram = psutil.virtual_memory()
gpus = GPUtil.getGPUs()

def mem_info():
    """Log current RAM and per-GPU memory totals and free amounts.

    Values are queried on every call. Previously the function logged the
    module-level snapshots above, so repeated calls always showed the same
    numbers.
    """
    current_ram = psutil.virtual_memory()
    log(f"RAM total: {current_ram.total / 1e9:.2f} GB, available: {current_ram.available / 1e9:.2f} GB")
    for gpu in GPUtil.getGPUs():
        log(f"VRAM total: {gpu.memoryTotal} MB, available: {gpu.memoryFree} MB")

In [19]:
# Three or more repetitions of the same word character (case-insensitive).
repeat_pattern = re.compile(r'(\w)\1{2,}', re.IGNORECASE)

def typos_processor(docs):
    """Rebuild each spaCy doc as text with content words normalised and spell-corrected.

    Tokens that are neither stop words nor punctuation and have a non-blank
    lemma are lower-cased, have runs of 3+ identical characters collapsed to
    two, and are passed through the shared spell checker. All other tokens
    keep their original text. Original inter-token whitespace is preserved.

    Returns an object array of shape ``(len(docs), 1)`` holding the rebuilt texts.
    """
    result = np.empty((len(docs), 1), dtype=object)
    log('Start SpellChecker preprocessing...')
    for row, doc in enumerate(docs):
        surface_forms = []
        candidates = []
        for tok in doc:
            skip = tok.is_stop or tok.is_punct or not tok.lemma_.strip()
            if skip:
                surface_forms.append(str(tok))
                continue
            normalised = repeat_pattern.sub(r'\1\1', tok.lower_)
            candidates.append(normalised)
            surface_forms.append(str(normalised))
        replacements = dict(zip(candidates, _spell_checker.correct_words(candidates)))
        pieces = [
            replacements.get(form, form) + tok.whitespace_
            for form, tok in zip(surface_forms, doc)
        ]
        result[row, 0] = ''.join(pieces)
        if row % 5000 == 0:
            # mem_info()
            gc.collect()
    gc.collect()
    log('SpellChecker preprocessing finished')
    return result

In [20]:
# Timestamped marker for the start of the data-loading / evaluation part of the run.
log('Start')

01:24:32: Start


In [21]:
# Load the dataset; the single 'text' column is the model input and 'class' the target.
le = LabelEncoder()
df = pd.read_csv('/kaggle/input/suicide-watch/Suicide_Detection.csv')
# df = df.sample(n=1000)
X, y = df[['text']], df['class']
# Encode the class labels as integers and log the label -> integer mapping.
y = le.fit_transform(y)
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
log(f'LabelEncoder mapping: {le_name_mapping}')
# 70/30 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
log(f'X_train shape: {X_train.shape}')

01:24:36: LabelEncoder mapping: {'non-suicide': 0, 'suicide': 1}
01:24:36: X_train shape: (162451, 1)


In [22]:
# Re-create the shared spell checker (it is also created next to ExtraFeatures),
# which starts from a fresh Hunspell handle.
_spell_checker = CachingSpellChecker()
# NOTE(review): `tokenizer` and `features_extractor` are standalone instances;
# the pipelines defined below build their own SpacyTokenizer/ExtraFeatures.
tokenizer = SpacyTokenizer()
features_extractor = ExtraFeatures()
extra_feature_names = features_extractor.get_feature_names_out()

In [23]:
# Numeric side features: spaCy docs -> hand-crafted features -> consensus
# feature selection -> standardisation.
# NOTE(review): the step name "extact" looks like a typo for "extract";
# renaming it would change the step's parameter prefix.
extra_features_pipeline = Pipeline([
    ("spacy", SpacyTokenizer()), 
    ("extact", ExtraFeatures()),
    ("select", FeatureSelector()),
    ("scale", StandardScaler())
])
# Text clean-up: split glued words -> spaCy docs -> spell-corrected text.
text_pipeline = Pipeline([
    ("splitter", FunctionTransformer(fix_concatenated_words, validate=False)),
    ("spacy", SpacyTokenizer()), 
    ("typos", FunctionTransformer(typos_processor, validate=False)),
])

In [24]:
# Both branches read the same 'text' column: the first yields the cleaned text,
# the second the scaled numeric side features.
preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'text'),
    ("extra_features", extra_features_pipeline, 'text')
])

In [25]:
# X_train_transformed = preprocessor.fit_transform(X_train)
# preprocessor.fit(X_train)
# joblib.dump(preprocessor, 'base_text_preprocessor.joblib')

In [26]:
# feature_selector = preprocessor.named_transformers_['extra_features'].named_steps['select']
# feature_names = feature_selector.get_feature_names_out()
# feature_names.insert(0, 'text')
# X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)
# X_train_transformed.to_csv('X_train_transformed.csv')

In [27]:
# X_test_transformed = preprocessor.transform(X_test)
# X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)
# X_test_transformed.to_csv('X_test_transformed.csv')

In [28]:
# Load the pre-computed preprocessing output instead of re-running the
# commented-out preprocessing cells.
# NOTE(review): those cells call to_csv() without index=False, so these frames
# presumably contain an extra index column ('Unnamed: 0') that a
# remainder='passthrough' ColumnTransformer would forward to the classifier —
# confirm against the saved files and the saved model.
X_train_transformed = pd.read_csv('/kaggle/input/suicide-watch-extra/X_train_transformed.csv')
X_test_transformed = pd.read_csv('/kaggle/input/suicide-watch-extra/X_test_transformed.csv')

In [29]:
# Carve a threshold-tuning subset out of the test data: take the first split of
# a stratified 8-fold; its larger part stays the evaluation set and its held-out
# fold becomes the threshold set.
# NOTE: this cell overwrites X_test_transformed and y_test, so running it again
# without re-running the cells above splits the already reduced data once more.
skf_for_threshold = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
test_idx, threshold_idx = next(skf_for_threshold.split(X_test_transformed, y_test))
X_test_transformed, X_threshold = X_test_transformed.iloc[test_idx], X_test_transformed.iloc[threshold_idx]
y_test, y_threshold = y_test[test_idx], y_test[threshold_idx]

In [30]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of an XGBoost classifier.

    Reads the globals ``X_train_preprocessed``, ``y_train`` and ``skf``.
    """
    # Hyper-parameters sampled by the trial.
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1200, step=100),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 100.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 100.0, log=True),
    }
    # Settings that stay the same for every trial.
    fixed_settings = {
        "use_label_encoder": False,
        "eval_metric": "logloss",
        'tree_method': 'hist',
        "n_jobs": 2,
    }
    candidate = XGBClassifier(**search_space, **fixed_settings)
    fold_accuracy = cross_val_score(
        candidate, X_train_preprocessed, y_train,
        cv=skf,
        scoring='accuracy'
    )
    return fold_accuracy.mean()

In [31]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('sbert_vectorize', SbertTransformer(), 'text')
#     ],
#     remainder='passthrough'
# )
# X_train_preprocessed = preprocessor.fit_transform(X_train_transformed)

In [32]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=30)

In [33]:
# log(f'Best accuracy: {study.best_value}')
# log(f'Best params: {study.best_params}')
# best_params = study.best_params

In [34]:
# Hard-coded XGBoost hyper-parameters.
# NOTE(review): presumably copied from the Optuna study in the commented-out
# cells above — confirm the provenance.
best_params = {
    'n_estimators': 1100, 
    'max_depth': 7, 
    'learning_rate': 0.11845310258701165, 
    'subsample': 0.8577362914728137, 
    'reg_lambda': 4.780328173176433, 
    'reg_alpha': 0.22787849319324718
}

In [35]:
# Add the non-tuned settings (the same fixed values the Optuna objective uses).
best_params.update(
    use_label_encoder=False,
    eval_metric='logloss',
    tree_method='hist',
    n_jobs=2,
)

In [36]:
# Final model: SBERT embedding of the 'text' column, all other columns passed
# through unchanged, followed by XGBoost.
# NOTE: this rebinds `preprocessor`, replacing the text/extra-features
# ColumnTransformer defined earlier.
preprocessor = ColumnTransformer(
    transformers=[
        ('sbert_vectorize', SbertTransformer(), 'text')
    ],
    remainder='passthrough'
)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", XGBClassifier(**best_params))
])
# pipeline.fit(X_train, y_train)
# The pipeline built above is replaced here by the previously fitted one loaded
# from disk; building it still instantiates SbertTransformer.
# joblib.load unpickles arbitrary objects — only load files you trust.
pipeline = joblib.load('/kaggle/input/suicide-watch-extra/sbert_classifier.joblib')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

01:24:50: Max seq length: 334


In [37]:
# joblib.dump(pipeline, 'sbert_classifier.joblib')

In [38]:
# Predictions of the loaded pipeline on the evaluation part of the test data.
y_pred = pipeline.predict(X_test_transformed)

01:24:57: Start SBERT transform
01:50:04: Chunk with potential problem  :(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((

In [39]:
# Accuracy of the loaded pipeline with its default decision rule.
print(accuracy_score(y_test, y_pred))

0.9526756401838476


In [40]:
# Tune the decision threshold on the held-out threshold subset; the wrapped
# pipeline itself is not refitted (ThresholdOptimizer.fit only calls predict_proba).
optimized_classifier = ThresholdOptimizer(pipeline)
optimized_classifier.fit(X_threshold, y_threshold)

01:50:07: Start SBERT transform
01:53:50: Chunk with potential problem feel guilty when i read this sub.alt account because of obvious reasons.
sorry for the long post. told at bottom.

I doNT really know how to say this properly. I'll start by saying that i'm from a third world country. In my country it is normal for parents to have high expectations from their children and want them to become engineers doctors, etc. Same applied to my life sadly. Since i performed pretty well in high school my parents had high expectations from me however lets just say i screwed it up from there. I have been barely passing my exams for the last 5 years and now finally i failed in my last semester of graduation. I held on to my sanity till that point through all the taunts thrown at me by parents, society, etc. However now its becoming almost unbearable since i'm a "failure". My dad looks at me with utter disgust because he caNT hold his head high in society anymore. All this led to me feeling that i'

In [41]:
# Save the threshold-tuned wrapper. The previous call dumped the bare
# `pipeline` under this file name, so the saved artifact had no `threshold_`.
joblib.dump(optimized_classifier, 'optimized_sbert_classifier.joblib')

['optimized_sbert_classifier.joblib']

In [42]:
# Predictions using the tuned threshold (rebinds `y_pred` from the baseline cell).
y_pred = optimized_classifier.predict(X_test_transformed)

01:53:53: Start SBERT transform
02:19:09: Chunk with potential problem  :(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((

In [43]:
# Accuracy with the tuned threshold. The threshold was chosen to maximise F1,
# not accuracy, so this figure is not expected to match or beat the baseline.
print(accuracy_score(y_test, y_pred))

0.9491792514773474
