In [1]:
import tensorflow as tf         
import pandas as pd
import json
import re
import numpy as np
from tqdm import tqdm
import os

In [2]:
# Avoid OOM errors by enabling memory growth on every visible GPU, so that
# TensorFlow allocates GPU memory on demand instead of all at once.
# `gpus` is kept as a notebook global; the next cell reports its length.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus: 
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# Sanity check: number of GPU devices TensorFlow detected.
len(gpus)

1

In [4]:
# Load the labelled text dataset, decoding the file as latin-1.
# NOTE(review): the displayed rows contain sequences such as "IÃ¢â¬â¢m", which
# look like UTF-8 text decoded as latin-1 — confirm the file's real encoding.
data = pd.read_csv("new_final.csv",encoding="latin-1")

In [5]:
# Display the loaded frame (pandas shows a truncated preview for large frames).
data

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,"IÃ¢â¬â¢m so lostHello, my name is Adam (16) ...",1
...,...,...
233965,"""Living has become unbearable. The pain is too...",1
233966,"""The darkness and despair have consumed every ...",1
233967,"""I can't see any way out of this overwhelming ...",1
233968,"""The relentless cycle of despair makes every d...",1


In [6]:
# Preview only: dropna() returns a new frame and the result is not assigned,
# so `data` itself is left unchanged by this cell.
data.dropna()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,"IÃ¢â¬â¢m so lostHello, my name is Adam (16) ...",1
...,...,...
233965,"""Living has become unbearable. The pain is too...",1
233966,"""The darkness and despair have consumed every ...",1
233967,"""I can't see any way out of this overwhelming ...",1
233968,"""The relentless cycle of despair makes every d...",1


In [7]:
# Keep only the rows whose "class" label is present (not NaN).
data = data[~data["class"].isna()]

In [8]:
# Count the remaining missing values per column.
data.isnull().sum()

text     0
class    0
dtype: int64

In [9]:
# Drop rows whose "class" value is the stray string " Pepe" (leading space included).
data = data[data["class"]!=" Pepe"]

In [10]:
# Class balance after filtering.
data['class'].value_counts()

class
1    117194
0    116751
Name: count, dtype: int64

In [11]:
# Raw texts as a Python list (the form Tokenizer.encode iterates over) and the
# matching label Series.
text = data["text"].to_list()
sentiment = data["class"]

In [12]:


class Tokenizer:
    """Subword tokenizer backed by a JSON vocabulary file.

    The vocabulary file must provide ``vocab_size``, ``word_to_index`` and
    ``index_to_word``.  Whole words are stored with a trailing ``</w>`` marker;
    a word missing from the vocabulary is split into the fewest contiguous
    pieces that are all known, with ``</w>`` on the final piece only.

    Every encoded sequence starts with ``<S>``, ends with ``</S>`` and is
    padded with ``<PAD>`` (or truncated) to exactly ``max_length`` ids.
    """

    # Compiled once at class creation instead of on every call.
    # A negation word followed by the run of word/space characters after it.
    _NEGATION_PATTERN = re.compile(r"\b(?:not|no|never|don't|doesn't|can't|won't|shouldn't|couldn't|wouldn't|didn't|isn't|aren't|wasn't|weren't|donot|doesnot|dont|doesnt|cannot|wont|shouldnt|couldnt|wouldnt|didnt|isnt|arent|wasnt|werent)\b[\w\s]*")
    # Splits on whitespace runs (discarded) or on a single non-word char (kept).
    _TOKEN_SPLIT = re.compile(r"\s+|(\W{1})")
    _NON_WORD = re.compile(r"\W")

    def __init__(self, vocab_path, max_length=100) -> None:
        """Load the vocabulary stored at ``vocab_path``.

        Args:
            vocab_path: Path to the JSON vocabulary file.
            max_length: Fixed number of ids in every encoded sequence.

        Raises:
            KeyError: If the vocabulary lacks ``<PAD>``, ``<S>``, ``</S>`` or
                ``negate``, or one of the top-level keys.
        """
        with open(vocab_path, "r") as stream:
            self.vocab = json.load(stream)
        self.len = self.vocab["vocab_size"]
        self.word_to_index = self.vocab["word_to_index"]
        self.index_to_word = self.vocab["index_to_word"]
        self.pad_id = self.word_to_index["<PAD>"]
        self.start_id = self.word_to_index["<S>"]
        self.end_id = self.word_to_index["</S>"]
        self.max_length = max_length
        self.negate = self.word_to_index['negate']

    def __len__(self):
        """Return the vocabulary size recorded in the vocabulary file."""
        return self.len

    def combos(self, s, first=False):
        """Yield every way of cutting ``s`` into contiguous non-empty pieces.

        Args:
            s: The string to split.
            first: When true, the unsplit candidate ``[s]`` is omitted, so
                only splits with at least two pieces are produced.

        Yields:
            Lists of substrings whose concatenation equals ``s``.
        """
        if not s:
            return
        length = len(s)
        if not first:
            yield [s]
        for i in range(1, length):
            for c in self.combos(s[: length - i]):
                yield c + [s[length - i:]]

    def get_indx(self, combos: list):
        """Return the ids of the first candidate whose pieces are all known.

        Args:
            combos: Candidate splits, each a list of vocabulary keys.

        Returns:
            The ids of the first fully known candidate, or ``[]`` if none is.
        """
        for combo in combos:
            if all(piece in self.word_to_index for piece in combo):
                return [self.word_to_index[piece] for piece in combo]
        return []

    def split_word(self, word: str):
        """Encode an out-of-vocabulary word as the fewest known subword pieces.

        Args:
            word: The word without its ``</w>`` marker.

        Returns:
            A list of ids, or ``[]`` when no split is fully in the vocabulary
            (this includes every one-character word, which cannot be split).
        """
        candidates = []
        for pieces in self.combos(word, True):
            pieces[-1] = pieces[-1] + "</w>"  # only the final piece ends the word
            candidates.append(pieces)
        # The sort is stable, so among splits of equal length the generation
        # order decides which one wins.
        candidates.sort(key=len)
        return self.get_indx(candidates)

    def create_pad(self, x: list):
        """Pad or truncate id sequences to exactly ``max_length`` entries.

        Accepts either a flat list of ids or a list of such lists.  Padding is
        appended in place; a truncated inner list is replaced by a new list.

        Args:
            x: A flat list of int ids, or a list of id lists.

        Returns:
            ``x`` itself, with every sequence of length ``max_length``.  An
            empty list is returned unchanged.
        """
        limit = self.max_length
        if not x:
            # Nothing to pad; also avoids indexing x[0] on an empty list.
            return x
        pad = self.word_to_index["<PAD>"]
        if isinstance(x[0], int):
            # Truncate as well as pad, so a single sequence obeys the same
            # length contract as a batch.
            del x[limit:]
            x.extend([pad] * (limit - len(x)))
        else:
            for i, line in enumerate(x):
                if len(line) > limit:
                    x[i] = line[:limit]
                else:
                    line.extend([pad] * (limit - len(line)))
        return x

    def handle_negations(self, text):
        """Insert the marker word ``negate`` after each negation word.

        A match is the negation word plus the following run of word and
        whitespace characters.  The matched span is re-joined with single
        spaces, so only the first negation in such a run receives a marker.

        Args:
            text: The text to process.

        Returns:
            The text with ``negate`` inserted.

        Raises:
            TypeError: If ``text`` is not a ``str`` (for example a NaN from a
                DataFrame column).  Raising here replaces printing the error
                and returning ``None``, which only deferred the failure to
                ``encode``.
        """
        # Expand contractions first
        #text = contractions.fix(text)
        if not isinstance(text, str):
            raise TypeError(
                f"handle_negations expects a str, got {type(text).__name__}"
            )

        def replace_negation(match):
            words = match.group(0).split()
            return ' '.join([words[0], "negate"] + words[1:])

        return self._NEGATION_PATTERN.sub(replace_negation, text)

    def _encode_line(self, line):
        """Encode one text into an unpadded id list framed by ``<S>``/``</S>``."""
        line = self.handle_negations(line)
        ids = [self.word_to_index["<S>"]]
        for word in self._TOKEN_SPLIT.split(line):
            if not word:
                # The split yields None/"" around whitespace separators.
                continue
            if len(word) > 10:
                # Words longer than 10 characters are dropped.
                continue
            if not self._NON_WORD.search(word):
                word = f"{word}</w>"
            if word in self.word_to_index:
                ids.append(self.word_to_index[word])
            else:
                ids.extend(self.split_word(word.replace("</w>", "")))
        ids.append(self.word_to_index["</S>"])
        return ids

    def encode(self, x: list):
        """Encode one text or a list of texts into fixed-length id arrays.

        Args:
            x: A single string, or a list of strings (shown with a progress
                bar).

        Returns:
            A NumPy array of ids: one row of ``max_length`` ids per text for a
            list input, or a single 1-D sequence for a string input.

        Raises:
            TypeError: If any text is not a ``str``.
        """
        if isinstance(x, list):
            encoded_x = [self._encode_line(x[i]) for i in tqdm(range(len(x)))]
        else:
            encoded_x = self._encode_line(x)
        return np.array(self.create_pad(encoded_x))

    def decode_step(self, x):
        """Turn one sequence of ids back into a string.

        Special tokens are removed, a subword piece without ``</w>`` is glued
        to the token after it, and a punctuation token is glued to the token
        before it.

        Args:
            x: An iterable of ids; each is looked up as ``str(int(id))``.

        Returns:
            The decoded tokens joined by single spaces.
        """
        y = []
        for indx in x:
            word = self.index_to_word[str(int(indx))]
            if word.startswith("NOT_"):
                word = word.replace("NOT_", "")
            y.append(word)

        pop_indx = []
        for i, word in enumerate(y):
            if word in ["<S>", "</S>", "<PAD>", "</w>"]:
                pop_indx.append(i)
                continue
            if self._NON_WORD.search(word.replace("</w>", "")):
                if i > 0:
                    y[i - 1] = y[i - 1] + y[i]
                    pop_indx.append(i)
                    continue
                # A leading punctuation token has no predecessor: keep it as
                # its own token instead of letting y[-1] wrap to the last one.
            elif not word.endswith("</w>") and i < len(y) - 1:
                y[i + 1] = y[i] + y[i + 1]
                pop_indx.append(i)
                continue
            y[i] = word.replace("</w>", "")
        for i in reversed(pop_indx):
            y.pop(i)
        return " ".join(y)

    def decode(self, x):
        """Decode a single id sequence or a batch of them.

        Args:
            x: A NumPy array or list; either one sequence of ids or a list of
                sequences.

        Returns:
            One decoded string for a single sequence, or a list of strings
            for a batch.
        """
        if isinstance(x, np.ndarray):
            x = x.tolist()

        if isinstance(x[0], list):
            return [self.decode_step(line) for line in x]
        return self.decode_step(x)


In [13]:
# Disabled: cast the text column to str and rebuild the text list before encoding.
# data['text'] = data['text'].astype(str)
# text = data['text'].to_list()
# Build the tokenizer from the vocabulary file (default max_length of 100).
tokenizer = Tokenizer('vocab.json')

# Preprocess text and tokenize
# (disabled in this run; a later cell loads the saved encoding from 'embeded.npy')
#embeded = tokenizer.encode(text)



In [14]:
#np.save('embeded.npy',embeded)

In [15]:
# Load the encoded id matrix from disk instead of re-running tokenizer.encode.
# NOTE(review): assumes 'embeded.npy' was produced from the same filtered
# `data`, in the same row order — confirm, since labels are paired by position later.
embeded = np.load('embeded.npy')

In [16]:
# Work on the first 1000 encoded rows only.
embed = embeded[:1000]

In [17]:
# Sanity check: number of rows in the subset.
len(embed)

1000

In [18]:
# Rebuild the tokenizer (same vocabulary file as before) and show its vocabulary size.
tokenizer = Tokenizer('vocab.json')
len(tokenizer)

10223

In [19]:
# Vocabulary size of the custom tokenizer.
voc_size = len(tokenizer)

In [20]:
# Sequence length; equals the Tokenizer's default max_length of 100.
sent_length = 100

In [21]:
# NOTE(review): presumably the embedding width of the model (768) — not
# referenced by any later cell visible here; confirm it is still needed.
embeding_features = 768

In [22]:
# import numpy as np
# X_final=np.array(embeded)
# y_final=np.array(data["class"]).astype(int)

In [23]:
import numpy as np
# Features: the 1000-row subset of encoded ids.
X_final=np.array(embed)
# Labels: the first 1000 entries of the "class" column, cast to int.
# NOTE(review): rows are paired with `embed` purely by position — this assumes
# embeded.npy and `data` share the same row order; confirm.
y_final=np.array(data["class"][:1000]).astype(int)

In [24]:
# Sanity check: feature and label arrays must have the same number of rows.
X_final.shape,y_final.shape

((1000, 100), (1000,))

In [25]:
from transformers import TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=0)

In [27]:
# Turn both label vectors into single-column arrays of shape (n, 1), which
# matches the one logit per example that the model is configured to output.
y_train, y_test = (labels.reshape(-1, 1) for labels in (y_train, y_test))

In [28]:
# Load pretrained BERT with a freshly initialised classification head that
# outputs a single logit (num_labels=1).
# NOTE(review): the input ids come from the custom Tokenizer's vocabulary, not
# from the 'bert-base-uncased' tokenizer — confirm that reusing the pretrained
# embedding table with these ids is intended.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# The model emits raw logits, so the loss applies the sigmoid itself.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# The accuracy metric must threshold logits at 0.0 (equivalent to a probability
# of 0.5). The plain 'accuracy' string would threshold the logits at 0.5 and
# misreport accuracy. The name is kept as 'accuracy' so log keys are unchanged.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [30]:
# Number of examples per batch for both the training and validation datasets.
batch_size = 5

In [31]:
def _labelled_dataset(token_ids, labels):
    """Pair token ids (fed to the model as 'input_ids') with their labels."""
    return tf.data.Dataset.from_tensor_slices(({'input_ids': token_ids}, labels))


# Training data is reshuffled over the whole split; validation keeps its order.
# Both are batched and prefetched.
train_dataset = (
    _labelled_dataset(X_train, y_train)
    .shuffle(len(X_train))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    _labelled_dataset(X_test, y_test)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)


In [32]:

# Fine-tune for 9 epochs, evaluating on the held-out split after each epoch.
model.fit(train_dataset, epochs=9, validation_data=val_dataset)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x24889d33ee0>

In [None]:
# Score the held-out split. `predictions` was never defined in the notebook
# (it only existed as leftover kernel state), so compute it here. val_dataset
# is not shuffled, so the rows line up with y_test.
predictions = model.predict(val_dataset)
logits = predictions.logits

In [None]:
# Convert the raw logits to probabilities in [0, 1].
probabilities = tf.nn.sigmoid(logits).numpy()

In [None]:
# Label an example 1 when its probability reaches the threshold, else 0.
threshold = 0.5
predicted_labels = (probabilities >= threshold).astype(int)

In [None]:
# NOTE(review): with the 1000-row subset and test_size=0.2 used above, this
# should be (200, 1); a recorded output with far more rows comes from a
# different (full-dataset) run — re-run to refresh.
y_test.shape

(46789, 1)

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test,predicted_labels)


array([[20738,  2613],
       [ 1672, 21766]], dtype=int64)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out examples whose predicted label matches the true label.
accuracy_score(y_test,predicted_labels)

0.9084186454081088

In [None]:
# Example sentence for a single-text prediction.
q="Having no thoughts"

In [None]:
# Encode the example sentence and decode it again to inspect the round trip.
tokenizer = Tokenizer('vocab.json')

single_text = q

# Wrapping the text in a list gives a batch of one, i.e. a 2-D array of ids.
encoded_text = np.array(tokenizer.encode([single_text]))

decoded_text = tokenizer.decode(encoded_text)

print(decoded_text)
print(encoded_text)


100%|██████████| 1/1 [00:00<00:00, 941.69it/s]

['Having no negate thoughts']
[[   0  831 6624 6552 8884 9042    1    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2]]





In [None]:
input_ids = encoded_text
# All-ones mask: every position, including <PAD> positions, is marked as attended.
# NOTE(review): the training dataset supplies only 'input_ids' (no mask) —
# confirm that attending to padding is intended in both places.
attention_mask = np.ones_like(input_ids)

In [None]:
# Bundle the model inputs under the keyword names passed to the model call.
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}


In [None]:
# Forward pass in inference mode (training=False).
pred = model(inputs, training=False)


In [None]:
# Raw model output for the single example (overwrites the earlier `logits`).
logits = pred.logits


In [None]:
# Convert the logit to a probability in [0, 1].
prob = tf.nn.sigmoid(logits).numpy()

In [None]:
# Same decision rule as for the held-out split: label 1 when probability >= 0.5.
threshold = 0.5
pred_label = (prob >= threshold).astype(int)

In [None]:
# Show the predicted label for the example sentence.
pred_label

array([[1]])

In [None]:
# Save the fine-tuned model to a local directory so it can be reloaded with from_pretrained.
model.save_pretrained('Sucidal_classification_t/')

In [None]:

import tensorflow as tf

In [None]:
# model = TFBertForSequenceClassification.from_pretrained("Sucidal_classification_t/")

Some layers from the model checkpoint at Sucidal_classification_t/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at Sucidal_classification_t/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# q="Having happy days in my life"
# tokenizer = Tokenizer('vocab.json')

# single_text = q

# encoded_text = np.array(tokenizer.encode([single_text]))

# decoded_text = tokenizer.decode(encoded_text)

# print(decoded_text)
# print(encoded_text)

# input_ids = encoded_text
# attention_mask = np.ones_like(input_ids)
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# pred = model(inputs)

# logits = pred.logits

# prob = tf.nn.sigmoid(logits).numpy()
# threshold = 0.5
# pred_label = (prob >= threshold).astype(int)
# pred_label

100%|██████████| 1/1 [00:00<00:00, 1810.23it/s]

['Having happy days in my life']
[[   0  831 5003 3454 5405 6494 5963    1    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2    2    2    2    2    2    2    2    2    2    2    2    2
     2    2]]





array([[0]])