In [None]:
pip install evaluate

In [None]:
pip install -U datasets huggingface-hub

In [None]:
import pandas as pd

In [None]:
# Read the training split from the Kaggle input directory and show it.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/spam-e/dataset/train.csv")
train

In [None]:
# Read the held-out splits: `dev` first, then `test` (the two reads are independent).
dev = pd.read_csv(filepath_or_buffer="/kaggle/input/spam-e/dataset/dev.csv")
test = pd.read_csv(filepath_or_buffer="/kaggle/input/spam-e/dataset/test.csv")

In [None]:
# Build a single "<Label>_<SpamLabel>" string column on every split so the two
# annotation columns can be treated as one multi-class target.
for frame in (train, dev, test):
    label_text = frame["Label"].map(str)
    spam_text = frame["SpamLabel"].map(str)
    frame["combine_label"] = label_text + "_" + spam_text

In [None]:
# Preview the first rows of the training frame, including the new combine_label column.
train.head()

In [None]:
# Count the training rows per combined "Label_SpamLabel" value to see the class balance.
train["combine_label"].value_counts()

In [None]:
# Combined "Label_SpamLabel" string -> integer class id (ids follow the tuple order).
label2id = {combo: class_id for class_id, combo in enumerate(('0_0', '1_1', '1_2', '1_3'))}

In [None]:
# Translate each split's combined label string into its integer class id.
# A combination that is missing from label2id raises KeyError.
for frame in (train, test, dev):
    frame["id"] = frame["combine_label"].apply(label2id.__getitem__)

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers  import TrainingArguments, Trainer
import pandas as pd
import torch
import random as rd
from  transformers import TrainerCallback
from transformers import AutoModel, AutoModelForSequenceClassification , LongformerConfig

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the "xlm-roberta-base" checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="xlm-roberta-base")

In [None]:
# Upload the tokenizer to the "vietdata/vi-spam-detection" Hub repository.
# NOTE(review): presumably requires a Hub login with write access — confirm before re-running.
tokenizer.push_to_hub("vietdata/vi-spam-detection")

In [None]:
# Oversample minority classes: every class with fewer than `max_size` rows is
# topped up to `max_size` by drawing extra rows with replacement; classes that
# already have at least `max_size` rows get no extra rows.
max_size = 2000  # alternative: train['id'].value_counts().max()
SAMPLING_SEED = 42  # fixed seed so the duplicated rows are identical on every run
lst = [train]
for class_index, group in train.groupby('id'):
    n_extra = max(max_size - len(group), 0)
    lst.append(group.sample(n_extra, replace=True, random_state=SAMPLING_SEED))
frame_new = pd.concat(lst)
# Keep a handle on the frame as it was before oversampling.
# NOTE: re-running this cell rebinds `train_` to the already-oversampled frame.
train_ = train
train = frame_new

In [None]:
# Display the training frame as it stands after the oversampling cell.
train

In [None]:
class dataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Reads the "Comment" (text) and "id" (integer class) columns of ``df`` by
    position and tokenizes lazily in ``__getitem__``.

    Args:
        df: DataFrame with "Comment" and "id" columns.
        tok: tokenizer callable; must accept ``truncation`` / ``max_length``
            and return an object exposing ``input_ids``.
        split: free-form split name, stored but not otherwise used here.
    """

    # Upper bound on tokens per example, special tokens included.
    MAX_LENGTH = 512

    def __init__(self, df, tok, split="train"):
        self.df = df
        self.tok = tok
        self.split = split

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"labels", "input_ids", "text"}`` for the row at position ``idx``."""
        text = self.df["Comment"].iloc[idx]
        label = self.df["id"].iloc[idx]
        # Let the tokenizer truncate: slicing the id list afterwards cuts off the
        # closing special token on long inputs.
        input_ids = self.tok(text, truncation=True, max_length=self.MAX_LENGTH).input_ids
        return {"labels": label, "input_ids": input_ids, "text": text}

    @staticmethod
    def collate_fn(batch):
        """Pad a list of items to the longest sequence and build tensors.

        Args:
            batch: list of dicts with "labels" and "input_ids" (other keys,
                such as "text", are ignored).

        Returns:
            dict with tensors "labels", "input_ids" and "attention_mask";
            padded positions hold id 0 and mask 0.
        """
        labels = [item["labels"] for item in batch]
        sequences = [item["input_ids"] for item in batch]
        max_length = max(len(seq) for seq in sequences)
        # NOTE(review): padding uses id 0; the tokenizer's own pad id may differ.
        # Padded positions are masked out via attention_mask — confirm this is intended.
        return {
            "labels": torch.tensor(labels),
            "input_ids": torch.tensor(
                [seq + [0] * (max_length - len(seq)) for seq in sequences]
            ),
            "attention_mask": torch.tensor(
                [[1] * len(seq) + [0] * (max_length - len(seq)) for seq in sequences]
            ),
        }

# Datasets handed to the Trainer further down.
train_dataset = dataset(train, tokenizer)
# NOTE(review): the evaluation set is test and dev concatenated — confirm this is intended.
test_dataset = dataset(pd.concat([test, dev]), tokenizer, "test")

# Stand-alone loaders. Items hold variable-length id lists, so the dataset's
# own collate_fn is required (default collation cannot stack them).
# The evaluation loader is not shuffled so its order stays deterministic.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=dataset.collate_fn)

In [None]:
import evaluate
import numpy as np

# Load the F1 metric once; compute_metrics reuses this object instead of
# reloading it on every evaluation pass.
metric = evaluate.load("f1")


def compute_metrics(eval_preds):
    """Compute macro-averaged F1 from an evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; the class axis of ``logits`` is
            the last one.

    Returns:
        The result of ``metric.compute`` with ``average="macro"``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [None]:
import os

import wandb

# SECURITY: never commit an API key to the notebook. The key is read from the
# WANDB_API_KEY environment variable; when it is unset, None is passed and
# wandb falls back to its own credential lookup.
# The key that was previously hard-coded in this cell must be treated as
# leaked and revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
from transformers import AutoModel, AutoModelForSequenceClassification , LongformerConfig

# Human-readable class names, listed in class-id order (0..3).
_LABEL_NAMES = ('clean', 'fake_review_spam', 'branch_only_spam', 'non_review_spam')

# Start from the pretrained encoder with a fresh 4-way classification head
# (the model is not moved to a device here).
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=4,
)
# Store both directions of the name <-> id mapping on the config.
model.config.label2id = {name: class_id for class_id, name in enumerate(_LABEL_NAMES)}
model.config.id2label = {class_id: name for name, class_id in model.config.label2id.items()}

In [None]:
import re

# Freeze the even-numbered encoder layers and every parameter whose name
# contains "Norm"; parameters of the classifier head are never touched.
FROZEN_LAYERS = {0, 2, 4, 6, 8, 10}
# Parse the layer index out of the parameter name instead of testing whether a
# digit appears anywhere in it ("0" would also match "10", "2" would match "12").
# NOTE(review): assumes encoder parameters are named "...layer.<N>...." — confirm
# against model.named_parameters().
_LAYER_INDEX = re.compile(r"\.layer\.(\d+)\.")

for name, param in model.named_parameters():
    if "classifier" in name:
        continue
    layer_match = _LAYER_INDEX.search(name)
    if layer_match is not None and int(layer_match.group(1)) in FROZEN_LAYERS:
        param.requires_grad = False
    if "Norm" in name:
        param.requires_grad = False

In [None]:
# Training configuration. Evaluation and checkpointing share the same
# 5000-step cadence and only one checkpoint is kept on disk.
# NOTE(review): metric_for_best_model is not set, so the "best" checkpoint is
# presumably chosen by eval loss rather than the macro-F1 from compute_metrics
# — confirm this is intended.
training_args = TrainingArguments(
    output_dir='vietdata/vi-spam-detection',
    # --- optimisation ---
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluation / checkpointing ---
    evaluation_strategy="steps",
    eval_steps=5000,
    save_strategy="steps",
    save_steps=5000,
    save_total_limit=1,
    load_best_model_at_end=True,
    # --- logging ---
    logging_steps=2,
    # push_to_hub=True,
)

# The dataset class supplies the padding collator; items are tokenized lazily.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=dataset.collate_fn,
    compute_metrics=compute_metrics,
    # callbacks=[MyCallback]
)

In [None]:
import os

# Set TOKENIZERS_PARALLELISM to "false" in the process environment before training starts.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [None]:
# Run the fine-tuning loop with the arguments configured on `trainer`.
trainer.train()

In [None]:
# Upload the trained model through the Trainer.
# NOTE(review): presumably requires a Hub login with write access — confirm before re-running.
trainer.push_to_hub()

In [None]:
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It output then the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Load the published cross-encoder checkpoint.
# NOTE(review): this rebinds the notebook-level name `model`, which until this
# cell referred to the fine-tuned spam classifier.
model = CrossEncoder('vietdata/cross_tech_sbert')

# Sentence that every corpus entry is scored against.
query = 'A man is eating pasta.'

# Candidate sentences.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]

# One [query, candidate] pair per corpus sentence.
sentence_combinations = []
for corpus_sentence in corpus:
    sentence_combinations.append([query, corpus_sentence])

# Score every pair in a single call.
similarity_scores = model.predict(sentence_combinations)

# Corpus indices ordered from highest to lowest score.
sim_scores_argsort = np.argsort(similarity_scores)[::-1]

# Report the ranking, best match first.
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))

In [1]:
from transformers import pipeline

# Load the published checkpoint through the high-level pipeline API.
# NOTE(review): no task is passed, so it is presumably inferred from the model repo — confirm.
classifier = pipeline(model="vietdata/vi-spam-detection")
# Example call, left disabled:
# classifier("This movie is disgustingly good !")

# Second example call, left disabled:
# classifier("Director tried too much.")



Downloading config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [1]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer
import torch

In [2]:
# Load the published classifier for inference. Use the GPU when one is
# available and fall back to CPU otherwise, instead of failing on a
# hard-coded "cuda" device.
model = AutoModelForSequenceClassification.from_pretrained("vietdata/vi-spam-detection").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
# Tokenizer stored in the same Hub repository as the model.
tokenizer = AutoTokenizer.from_pretrained("vietdata/vi-spam-detection")

Downloading config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [3]:
# Switch the model to evaluation mode. The trailing semicolon suppresses the
# module repr that the cell would otherwise display (replaces the `print()` workaround).
model.eval();




In [4]:
# Show the class-id -> label-name mapping stored in the loaded model's config.
model.config.id2label

{0: 'clean',
 1: 'fake_review_spam',
 2: 'branch_only_spam',
 3: 'non_review_spam'}

In [5]:
def predict_batch(batch):
    """Classify a batch of texts with the notebook-level `model` and `tokenizer`.

    Args:
        batch: list of strings to classify.

    Returns:
        List of label names taken from ``model.config.id2label``, one per
        input text and in the same order. An empty batch returns ``[]``.
    """
    if len(batch) == 0:
        return []
    encoded = tokenizer(batch, max_length=512, truncation=True, padding=True, return_tensors="pt")
    # Follow whatever device the model lives on rather than assuming "cuda".
    device = model.device
    inputs = {key: tensor.to(device) for key, tensor in encoded.items()}
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits
    class_ids = torch.argmax(logits, dim=1).tolist()
    return [model.config.id2label[class_id] for class_id in class_ids]

In [11]:
import pandas as pd

# Read the spell-corrected app reviews that will be run through the classifier.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/ch-play-review/spell_corrected_data.csv")

In [12]:
from tqdm import tqdm

# Number of texts sent to the model per forward pass.
BATCH_SIZE = 16

# Classify the "spell_corrected_content" column batch by batch. Stepping the
# range by BATCH_SIZE yields exactly one iteration per non-empty batch, so no
# empty-batch `break` is needed and the progress bar always reaches 100%.
predictions = []
for start in tqdm(range(0, len(df), BATCH_SIZE)):
    texts = df["spell_corrected_content"].iloc[start:start + BATCH_SIZE].values.tolist()
    predictions.extend(predict_batch(texts))

100%|██████████| 5750/5750 [01:30<00:00, 63.39it/s]


In [13]:
# Attach the predicted label name of each row as a new "spam" column.
df["spam"] = predictions

In [14]:
# Save only the rows predicted as "clean". index=False keeps the row index out
# of the file; writing it is what produces an extra "Unnamed: 0"-style column
# each time the CSV is read back in.
df[df["spam"] == "clean"].to_csv("clean_data_v4.csv", index=False)

In [15]:
# Display the rows whose predicted label is "clean".
df.loc[df["spam"].eq("clean")]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,content,score,thumbsUpCount,Application,spell_corrected_content,spam
0,0,0,không làm được,1,1,Messenger,không làm được,clean
1,1,1,đăng xuất ra vô lại bị lỗi nhìu,1,1,Messenger,đăng xuất ra vô lại bị lỗi nhiều,clean
3,3,3,1 sao vì chặn mà vẫn gỡ được,1,0,Messenger,1 sào vì chăn mà vẫn gỡ được,clean
4,4,4,sản năm sao luân,5,1,Messenger,sạn năm sao luận,clean
6,6,6,mất ft hoài,1,1,Messenger,mắt ft hoài,clean
...,...,...,...,...,...,...,...,...
91993,91993,91993,ứng dụng tốt theo dõi và làm quen được nhiều b...,4,0,Instagram,ứng dụng tốt theo dõi và làm quen được nhiều b...,clean
91994,91994,91994,cập nhật không được ạ,4,0,Instagram,cập nhật không được ạ,clean
91996,91996,91996,chụp đẹp nhưng chưa có tính năng giây nha mong...,4,0,Instagram,chụp đẹp nhưng chưa có tính năng giấy nhà mong...,clean
91997,91997,91997,điện video bị lỗi,1,0,Instagram,điện video bị lỗi,clean
