In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel, AutoTokenizer
import torch

In [64]:
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indolem/indobert-base-uncased")

('./vocab.txt',)

In [51]:
def chunk_token(tokens, chunksize = 512):    
    # split into chunks of 510 tokens, we also convert to list (default is tuple which is immutable)
    input_id_chunks = list(tokens['input_ids'][0].split(chunksize - 2))
    mask_chunks = list(tokens['attention_mask'][0].split(chunksize - 2))
    
    return input_id_chunks, mask_chunks

def clssep_padding(input_id_chunks, mask_chunks, chunksize = 512):
    # loop through each chunk
    for i in range(len(input_id_chunks)):
    
        # add CLS and SEP tokens to input IDs
        input_id_chunks[i] = torch.cat([
            torch.tensor([101]), input_id_chunks[i], torch.tensor([102])
        ])

        # add attention tokens to attention mask
        mask_chunks[i] = torch.cat([
            torch.tensor([1]), mask_chunks[i], torch.tensor([1])
        ])

        # get required padding length
        pad_len = chunksize - input_id_chunks[i].shape[0]
        # check if tensor length satisfies required chunk size
        if pad_len > 0:
            # if padding length is more than 0, we must add padding
            input_id_chunks[i] = torch.cat([
                input_id_chunks[i], torch.Tensor([0] * pad_len)
            ])
            mask_chunks[i] = torch.cat([
                mask_chunks[i], torch.Tensor([0] * pad_len)
            ])

    return input_id_chunks, mask_chunks
    

def tokenizing(sentences):
    tokens = tokenizer.encode_plus(
        txt, 
        add_special_tokens=False,
        return_tensors='pt'
    )

    input_id_chunks, mask_chunks = chunk_token(tokens)
    id_chunks, mask_chunks= clssep_padding(input_id_chunks, mask_chunks)

    return id_chunks, mask_chunks

tokenizing(txt)


([tensor([  101., 11999.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
              0.

In [5]:
df = pd.read_csv("Dataset/email_spam_indo.csv")

X= df["Pesan"].values
labels = df['Kategori'].values

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [7]:
df=pd.read_csv("Dataset/health.csv",sep=";")[["text","category"]]
X=df["text"].values
labels = df['category'].values

In [3]:
from typing import Any, Dict, List, Callable, Optional, Tuple, Union
import json
import torch
import transformers
import pandas as pd
from transformers import BertModel, BertTokenizer, DistilBertModel, DistilBertTokenizer
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import optim, nn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn import svm
#from sklearn.pipeline import FeatureUnion, Pipeline

from sklearn.utils.multiclass import unique_labels
from sklearn import metrics as sk_metrics

def split_random(train: float, val: float, test: float) -> str:
    if train + val + test != 1.0:
        raise ValueError("train + val + test  must equal 1")
    rand_num = np.random.rand()
    
    if rand_num  <= train:
        return "train"
    elif rand_num <= train + val:
        return "val"
    else:
        return "test"
    
class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        tokenizer,
        model,
        max_length: int = 60,
        embedding_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
    ):
        self.tokenizer = tokenizer
        self.model = model
        #self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.Tensor, torch.Tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_length,truncation=True
        )["input_ids"]
        
        # padding
        padded_text = tokenized_text + [0]*(self.max_length-len(tokenized_text))

        # Create an attention mask telling BERT to use all words and ignore padded values
        attention_mask = np.where(np.array(padded_text) != 0, 1, 0)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(padded_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.Tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self
    
def calculate_classification_metrics(
    y_true: np.array,
    y_pred: np.array,
    average: Optional[str] = None,
    return_df: bool = True,
) -> Union[Dict[str, float], pd.DataFrame]:
    """Computes f1, precision, recall, precision, kappa, accuracy, and support

    Args:
        y_true: The true labels
        y_pred: The predicted labels
        average: How to average multiclass results

    Returns:
        Either a dataframe of the performance metrics or a single dictionary
    """
    labels = unique_labels(y_true, y_pred)

    # get results
    precision, recall, f_score, support = sk_metrics.precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=average
    )

    kappa = sk_metrics.cohen_kappa_score(y_true, y_pred, labels=labels)
    accuracy = sk_metrics.accuracy_score(y_true, y_pred)

    # create a pandas DataFrame
    if return_df:
        results = pd.DataFrame(
            {
                "class": labels,
                "f_score": f_score,
                "precision": precision,
                "recall": recall,
                "support": support,
                "kappa": kappa,
                "accuracy": accuracy,
            }
        )
    else:
        results = {
            "f1": f_score,
            "precision": precision,
            "recall": recall,
            "kappa": kappa,
            "accuracy": accuracy,
        }

    return results

In [31]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


#classifier = svm.LinearSVC(C=1.0, class_weight="balanced")
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

dbt = BertTransformer(DistilBertTokenizer.from_pretrained("distilbert-base-uncased"),
                      DistilBertModel.from_pretrained("distilbert-base-uncased"),
                      embedding_func=lambda x: x[0][:, 0, :].squeeze())

In [73]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train_ = encoder.fit_transform(y_train)
y_test_ = encoder.transform(y_test)
y_train_

array([0, 0, 1, ..., 1, 1, 1])

In [34]:
model = Pipeline(
    [
        ("vectorizer", dbt),
        ("classifier", classifier),
    ]
)

model.fit(X_train, y_train)

In [36]:
preds = model.predict(X_test)
calculate_classification_metrics(preds, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,class,f_score,precision,recall,support,kappa,accuracy
0,Masalah Akses dan Keamanan Data,0.0,0.0,0.0,0,0.489362,0.625
1,Masalah Pengelolaan Informasi Vaksinasi,0.666667,0.5,1.0,2,0.489362,0.625
2,Masalah Terkait Check-in dan Riwayat Kunjungan,0.8,1.0,0.666667,3,0.489362,0.625
3,Masalah Terkait Pembelian Obat dan Check-in Lo...,0.5,1.0,0.333333,3,0.489362,0.625


In [91]:
preds = model.predict(X_test)
calculate_classification_metrics(preds, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,class,f_score,precision,recall,support,kappa,accuracy
0,Masalah Akses dan Keamanan Data,0.0,0.0,0.0,0,0.44186,0.625
1,Masalah Pengelolaan Informasi Vaksinasi,0.571429,0.5,0.666667,3,0.44186,0.625
2,Masalah Terkait Check-in dan Riwayat Kunjungan,0.666667,1.0,0.5,4,0.44186,0.625
3,Masalah Terkait Pembelian Obat dan Check-in Lo...,1.0,1.0,1.0,1,0.44186,0.625


In [92]:
y_test

array(['Masalah Pengelolaan Informasi Vaksinasi',
       'Masalah Pengelolaan Informasi Vaksinasi',
       'Masalah Pengelolaan Informasi Vaksinasi',
       'Masalah Terkait Check-in dan Riwayat Kunjungan',
       'Masalah Akses dan Keamanan Data',
       'Masalah Pengelolaan Informasi Vaksinasi',
       'Masalah Terkait Pembelian Obat dan Check-in Lokasi',
       'Masalah Terkait Check-in dan Riwayat Kunjungan'], dtype=object)

In [87]:
preds

array(['Masalah Terkait Check-in dan Riwayat Kunjungan',
       'Masalah Terkait Check-in dan Riwayat Kunjungan',
       'Masalah Pengelolaan Informasi Vaksinasi',
       'Masalah Terkait Check-in dan Riwayat Kunjungan',
       'Masalah Akses dan Keamanan Data',
       'Masalah Pengelolaan Informasi Vaksinasi',
       'Masalah Terkait Pembelian Obat dan Check-in Lokasi',
       'Masalah Terkait Check-in dan Riwayat Kunjungan'], dtype=object)

In [None]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [80]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [119]:
import re

def remove_selected_punctuation(text):
    # Match any punctuation except ! and ?
    return re.sub(r'[^\w\s!?]', '', text)

sample_text = "Hello, world! How are you doing today? Yes, today!"
clean_text = remove_selected_punctuation(sample_text)
print(clean_text)

def remove_html_tags(text):
    # Define a regular expression pattern for HTML tags
    clean_text = re.sub(r'<.*?>', '', text)
    return clean_text

# Example usage
sample_html = "<html><head><title>Test</title></head><body><p>This is a <b>bold</b> paragraph.</p></body></html>"
clean_text = remove_html_tags(sample_html)
print(clean_text)


Hello world! How are you doing today? Yes today!
TestThis is a bold paragraph.


In [144]:
import re

class DataCleaning:
    def __init__(self,text):
        self.text = text

    def low_cast_data(self):
        self.text = self.text.lower()

    def remove_punc(self):
        punc = r'[#$%&\'()*+,\-./:;<=>@\[\\\]^_`{|}~]'
        self.text = re.sub(punc, '', self.text)

    def extra_space(self):
        self.text = " ".join(self.text.split())

    def remove_html_tags(self):
        self.text = re.sub(r'<.*?>', '', self.text)

    def remove_urls(self):
        url_pattern = r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"
        self.text = re.sub(url_pattern, '', self.text)
    
    def clean(self):
        self.low_cast_data()
        self.remove_html_tags()
        self.remove_punc()
        self.extra_space()
        self.remove_urls()

In [153]:
test_1="""
    Elemen pentingnya adalah 
    -----
    Dear Mba CS Name,

    Berikut sy kirimkan PCP untuk TSA Solo.
    Tolong dibantu upload.
    Thank you.
    ------

    dan 

    ----
    Selamat Sore Bu Sales Name

    Terlampir kami kirimkan PCP bulan April 2024, mohon koreksinya.
    Terimakasih.


    Salam,

    CS Name
    ------
"""

test_2="""
<p><strong>Dear Mba Staff Name, : </strong></p><p><strong>Brand Name menginformasikan mengenai email tiket dengan detail sbb : </strong></p><table><tbody><tr><td><strong>Ticket Id</strong></td><td>:</td><td>NM20219999-999</td></tr><tr><td><strong>Priority</strong></td><td>:</td><td>NORMAL</td></tr><tr><td><strong>Status</strong></td><td>:</td><td>PENDING</td></tr><tr><td><strong>Inbound</strong></td><td>:</td><td style="width: 70%; word-wrap:break-word ;" >Bpk. Customer Name, end user area Jember, Jawa Tengah dgn No Hp 08123456789, <BR>melalui DM Instagram  Pak Customer Name bertanya sbb :<BR>  min mau tanya klo Cat Brand 2T itu warna isinya yg asli apa ya? Saya beli Brand di 3 toko yang berbeda, toko ke 1 dan ke 2 warna putih & Toko  ke 3  kok warnannya kuning ya ? #Mohon infonya. <BR>Foto terlampir, akan kami kirimkan by email.<BR>.<BR> Terima kasih.<BR></td></tr><tr><td><strong>Category</strong></td><td>:</td><td>MEMBER</td></tr><tr><td><strong>Type</strong></td><td>:</td><td>NON MEMBER</td></tr><tr><td><strong>View Detail</strong></td><td>:</td><td><a href="brand-name.co.id?paramId=NM20219999-999">View Inbound Detail</a></td></tr></tbody></table><p>&nbsp;</p><p>&nbsp;</p><p>Kami tunggu informasinya, terimakasih atas perhatiannya.</p><p>&nbsp;</p><p>Salam Hangat,</p><p>Brand Name</p>
"""

test_3="""
Dear tim,

Thank you.

On Mon, Nov 15, 2021 at 9:39 AM <client@email.id> wrote:

> Dear Ibu Client Name,
>
>
>
> *Terlampir kami sampaikan Logbook Case dari Medsos Per 12 November 2021,
> Mohon dapat diterima dengan baik.*
>
>
>
> Terima kasih,
>
>
>
>
> Salam Sukses
>
>
>
>
>
> From: client@email.id <client@email.id>
> Sent: 08 November 2023 11:18
> To: 'Client Name' <client@name.biz>
> Cc: 'CC Name' <cc@name.biz>
> Subject: [CLIENT] Customer Services 05 November 2023
>
>
>
> Dear Ibu Client Name,
>
>
>
> *Terlampir kami sampaikan Logbook Case dari Medsos Per 05 November 2021,
> Mohon dapat diterima dengan baik.*
>
>
>
> Terima kasih,
>
>
>
>
"""

In [154]:
coba = DataCleaning(test_3)
coba.text

"\nDear tim,\n\nThank you.\n\nOn Mon, Nov 15, 2021 at 9:39 AM <client@email.id> wrote:\n\n> Dear Ibu Client Name,\n>\n>\n>\n> *Terlampir kami sampaikan Logbook Case dari Medsos Per 12 November 2021,\n> Mohon dapat diterima dengan baik.*\n>\n>\n>\n> Terima kasih,\n>\n>\n>\n>\n> Salam Sukses\n>\n>\n>\n>\n>\n> From: client@email.id <client@email.id>\n> Sent: 08 November 2023 11:18\n> To: 'Client Name' <client@name.biz>\n> Cc: 'CC Name' <cc@name.biz>\n> Subject: [CLIENT] Customer Services 05 November 2023\n>\n>\n>\n> Dear Ibu Client Name,\n>\n>\n>\n> *Terlampir kami sampaikan Logbook Case dari Medsos Per 05 November 2021,\n> Mohon dapat diterima dengan baik.*\n>\n>\n>\n> Terima kasih,\n>\n>\n>\n>\n"

In [155]:
coba.clean()
coba.text

'dear tim thank you on mon nov 15 2021 at 939 am wrote dear ibu client name terlampir kami sampaikan logbook case dari medsos per 12 november 2021 mohon dapat diterima dengan baik terima kasih salam sukses from clientemailid sent 08 november 2023 1118 to client name cc cc name subject client customer services 05 november 2023 dear ibu client name terlampir kami sampaikan logbook case dari medsos per 05 november 2021 mohon dapat diterima dengan baik terima kasih'

In [82]:
#tokenized_train = tokenizer(X_train.tolist(), padding = True, truncation = True, return_tensors="pt")
#tokenized_val = tokenizer(X_test.tolist() , padding = True, truncation = True,  return_tensors="pt").to(device)
#
##move on device (GPU)
#tokenized_train = {k:torch.tensor(v).to(device) for k,v in tokenized_train.items()}
#tokenized_val = {k:torch.tensor(v).to(device) for k,v in tokenized_val.items()}

  tokenized_train = {k:torch.tensor(v).to(device) for k,v in tokenized_train.items()}
  tokenized_val = {k:torch.tensor(v).to(device) for k,v in tokenized_val.items()}


In [99]:
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

def batch_data(tokenizer, texts,max_length=512, batch_size=100):
    # Tokenize the text data
    tokenized = tokenizer(texts, max_length=max_length,padding=True, truncation=True, return_tensors="pt")
    
    # Create a dataset from tensors
    dataset = TensorDataset(tokenized['input_ids'], tokenized['attention_mask'], tokenized['token_type_ids'])
    
    # Create a DataLoader
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Create data loaders
train_loader = batch_data(tokenizer, X_train.tolist())
val_loader = batch_data(tokenizer, X_test.tolist())

# Processing in batches
def process_batches(data_loader, model, device):
    model.eval()
    outputs = []
    total_batches = len(data_loader)
    processed_batches = 0  # Counter to track processed batches
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids in tqdm(data_loader, desc="Processing batches"):
            inputs = {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'token_type_ids': token_type_ids
            }
            output = model(**inputs)
            # Accessing the last hidden state and moving it to CPU
            outputs.append(output.last_hidden_state.cpu())
            
            remaining_batches = total_batches - processed_batches
            print(f"Processed {processed_batches}/{total_batches} batches, {remaining_batches} remaining")
    # Concatenating the results across batches
    return torch.cat(outputs, dim=0)

# Process training and validation data
hidden_train = process_batches(train_loader, model, device)
hidden_val = process_batches(val_loader, model, device)


Processing batches:   5%|▍         | 1/22 [01:18<27:32, 78.67s/it]

Processed 0/22 batches, 22 remaining


Processing batches:   9%|▉         | 2/22 [02:44<27:38, 82.93s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  14%|█▎        | 3/22 [03:54<24:23, 77.03s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  18%|█▊        | 4/22 [05:02<22:04, 73.59s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  23%|██▎       | 5/22 [06:12<20:25, 72.07s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  27%|██▋       | 6/22 [07:25<19:21, 72.58s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  32%|███▏      | 7/22 [08:34<17:47, 71.16s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  36%|███▋      | 8/22 [09:44<16:31, 70.83s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  41%|████      | 9/22 [10:52<15:11, 70.12s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  45%|████▌     | 10/22 [12:03<14:04, 70.41s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  50%|█████     | 11/22 [13:12<12:48, 69.89s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  55%|█████▍    | 12/22 [14:25<11:47, 70.75s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  59%|█████▉    | 13/22 [15:43<10:56, 72.91s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  64%|██████▎   | 14/22 [16:51<09:32, 71.53s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  68%|██████▊   | 15/22 [18:05<08:26, 72.31s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  73%|███████▎  | 16/22 [19:24<07:25, 74.17s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  77%|███████▋  | 17/22 [20:47<06:24, 77.00s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  82%|████████▏ | 18/22 [22:06<05:10, 77.66s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  86%|████████▋ | 19/22 [23:25<03:54, 78.08s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  91%|█████████ | 20/22 [24:48<02:38, 79.47s/it]

Processed 0/22 batches, 22 remaining


Processing batches:  95%|█████████▌| 21/22 [26:10<01:20, 80.31s/it]

Processed 0/22 batches, 22 remaining


Processing batches: 100%|██████████| 22/22 [26:18<00:00, 71.76s/it]

Processed 0/22 batches, 22 remaining



Processing batches:  17%|█▋        | 1/6 [01:33<07:48, 93.63s/it]

Processed 0/6 batches, 6 remaining


Processing batches:  33%|███▎      | 2/6 [02:58<05:53, 88.42s/it]

Processed 0/6 batches, 6 remaining


Processing batches:  50%|█████     | 3/6 [04:27<04:26, 88.75s/it]

Processed 0/6 batches, 6 remaining


Processing batches:  67%|██████▋   | 4/6 [05:48<02:50, 85.50s/it]

Processed 0/6 batches, 6 remaining


Processing batches:  83%|████████▎ | 5/6 [07:13<01:25, 85.32s/it]

Processed 0/6 batches, 6 remaining


Processing batches: 100%|██████████| 6/6 [07:37<00:00, 76.28s/it]

Processed 0/6 batches, 6 remaining





In [90]:
with torch.no_grad():
  hidden_train = model(**tokenized_train) #dim : [batch_size(nr_sentences), tokens, emb_dim]
  hidden_val = model(**tokenized_val)

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 44870651904 bytes.

In [101]:
cls_train = hidden_train[:,0,:]
cls_val = hidden_val[:,0,:]

In [114]:
hidden_train

tensor([[[-0.6797,  2.5192,  0.7429,  ..., -0.3515, -0.2583, -0.0498],
         [ 0.0351, -0.2479, -0.2011,  ..., -1.4289,  0.2640, -0.3772],
         [-0.6866,  1.0888,  1.0105,  ...,  0.6559,  1.2218, -1.4541],
         ...,
         [-0.1078,  1.9288,  0.1102,  ..., -0.2184,  0.7896, -0.3703],
         [-0.1581,  1.6967, -0.3239,  ..., -0.1279,  0.5983, -0.3087],
         [-0.3139,  1.4930, -0.3406,  ..., -0.1692,  0.4193, -0.4457]],

        [[-0.1692,  2.2867,  0.7424,  ...,  0.7720,  0.5473,  0.7915],
         [-0.4439,  1.0658,  1.3666,  ...,  0.6507, -0.1035, -0.3604],
         [-1.5802,  1.1258, -0.4194,  ...,  0.1442,  0.3902,  2.5361],
         ...,
         [-1.3393,  1.6026,  1.7633,  ...,  1.6469,  0.5346, -0.3368],
         [-1.1220,  1.4857,  2.2326,  ...,  1.3308,  0.8098, -0.4000],
         [-1.2463,  1.3982,  2.0396,  ...,  1.3764,  0.5045, -0.2983]],

        [[-1.0845,  2.2819,  0.5672,  ..., -0.4812,  0.0136,  0.0623],
         [-1.2719,  1.9638,  0.2880,  ..., -0

In [115]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(cls_train,y_train)
rf.score(cls_val,y_test) 

ValueError: Found array with dim 3. RandomForestClassifier expected <= 2.

In [105]:
from xgboost import XGBClassifier

xg_ = XGBClassifier()
xg_.fit(hidden_train,y_train_)
xg_.score(cls_val,y_test_) 

0.9488636363636364

In [106]:
sv_ = svm.LinearSVC(C=1.0, class_weight="balanced")
sv_.fit(hidden_train,y_train_)
sv_.score(cls_val,y_test_) 



0.9602272727272727

In [None]:
preds = sv_.predict(cls_val)
calculate_classification_metrics(preds, y_test_)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,class,f_score,precision,recall,support,kappa,accuracy
0,0,0.0,0.0,0.0,0,0.794872,0.875
1,1,0.888889,1.0,0.8,5,0.794872,0.875
2,2,1.0,1.0,1.0,2,0.794872,0.875
3,3,1.0,1.0,1.0,1,0.794872,0.875


In [None]:
preds

array([1, 1, 1, 2, 1, 1, 3, 2])

In [None]:
y_test_

array([1, 1, 1, 2, 0, 1, 3, 2])

In [None]:
preds = rf.predict(cls_val)
calculate_classification_metrics(preds, y_test)

Unnamed: 0,class,f_score,precision,recall,support,kappa,accuracy
0,Masalah Akses dan Keamanan Data,0.0,0.0,0.0,2,0.319149,0.5
1,Masalah Pengelolaan Informasi Vaksinasi,0.333333,0.25,0.5,2,0.319149,0.5
2,Masalah Terkait Check-in dan Riwayat Kunjungan,0.8,1.0,0.666667,3,0.319149,0.5
3,Masalah Terkait Pembelian Obat dan Check-in Lo...,1.0,1.0,1.0,1,0.319149,0.5


In [44]:
#model
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import  LabelEncoder
from transformers import AutoTokenizer, AutoModel

from src.model import *
from src.preprocessing import *


tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

class Preprocessing:

    def __init__(self):
        df=pd.read_csv("Dataset/health.csv",sep=";")[["text","category"]]
        self.X=df["text"].values
        self.labels = df['category'].values

    #def clean_data(self):
    #    self.X = batch_clean(self.X)
    #    self.labels = batch_clean(self.labels)
    
    def split_data(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.labels, test_size=0.2, random_state=42)

    def tokenizing(self):
        self.train_loader = batch_tokenize_data(tokenizer, self.X_train.tolist())
        self.val_loader = batch_tokenize_data(tokenizer, self.X_test.tolist())

    def encoding(self):
        self.encoder = LabelEncoder()
        self.y_train_ = self.encoder.fit_transform(self.y_train)
        self.y_test_ = self.encoder.transform(self.y_test)
    
    def get_data(self):
        return self.train_loader, self.val_loader, self.y_train_, self.y_test_
    

class Modeling:
    models= {
        "rf": RandomForestClassifier(),
        "xgb": XGBClassifier(),
        "svm": LinearSVC()
    }

    def __init__(self,train_loader,val_loader,y_train_,y_test_,**kwargs):
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.y_train_ = y_train_
        self.y_test_ = y_test_

    def get_hidden_clf(self):
        self.cls_train = get_cls_hidden_state_batches(self.train_loader, model)[:,0,:]
        self.cls_val = get_cls_hidden_state_batches(self.val_loader, model)[:,0,:]
        print(self.cls_train)
        print(self.cls_val)

    def train_models(self):
        for key, model in self.models.items():
            model.fit(self.cls_train,self.y_train_)
    
    def evaluate_models(self):
        preds = {key: model.predict(self.cls_val) for key,model in self.models.items()}
        models_score = compare_model(preds,self.y_test_)
        print(models_score)

    def get_models():
        return self.models

def create_model():
    preprocess = Preprocessing()
    preprocess.split_data()
    preprocess.tokenizing()
    preprocess.encoding()

    modeling = Modeling(**vars(preprocess))
    print(vars(preprocess).keys())
    modeling.get_hidden_clf()
    modeling.train_models()

In [45]:
create_model()

dict_keys(['X', 'labels', 'X_train', 'X_test', 'y_train', 'y_test', 'train_loader', 'val_loader', 'encoder', 'y_train_', 'y_test_'])


Processing batches: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  3.58it/s]


tensor([[ 0.3553,  1.9224,  0.4365,  ..., -0.1288, -0.8967,  0.0188],
        [ 0.0423,  1.9153,  0.3772,  ..., -0.1001, -0.3121,  0.6772],
        [ 0.1365,  1.5206,  0.8567,  ...,  0.3454, -0.5670,  0.7800],
        ...,
        [ 0.5146,  1.9118,  1.2749,  ...,  0.7630, -1.0019,  0.4778],
        [ 0.3047,  1.3959,  0.7638,  ...,  0.1765, -1.0451,  0.1686],
        [ 1.2877,  1.7488,  0.4171,  ..., -0.7688, -0.6170,  0.2812]])
tensor([[ 0.2775,  1.8993,  1.1300,  ..., -0.0340, -1.1617,  0.3588],
        [ 0.7101,  1.8733,  0.1727,  ..., -0.2653, -0.2282,  0.6452],
        [ 1.0083,  2.2007,  1.0124,  ...,  0.5773, -0.9310,  0.3643],
        ...,
        [ 0.7505,  1.5821,  1.1316,  ...,  0.1237, -0.9783,  0.5782],
        [ 0.3919,  2.0519,  1.3482,  ...,  0.4941, -0.7749,  0.4853],
        [ 0.5701,  1.6949,  0.5827,  ...,  0.5630, -1.1196,  0.1240]])
