In [None]:
!pip --quiet install datasets transformers[torch] evaluate

In [None]:
# Mount Google Drive at /content/drive. Later cells read the corpus archive from
# /content/drive/MyDrive and save the fine-tuned model back under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import glob
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
from transformers.pipelines.pt_utils import KeyDataset

## Obtaining Data

In [None]:
!unzip '/content/drive/MyDrive/op_spam_v1.4.zip'

Archive:  /content/drive/MyDrive/op_spam_v1.4.zip
replace op_spam_v1.4/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# Start from an empty frame that already carries the three columns used downstream.
review_columns = ['text', 'polarity', 'labels']
df = pd.DataFrame([], columns=review_columns)

In [None]:
# Corpus folders to read, as (glob pattern below the corpus root, polarity, labels).
#   polarity: 1. for positive_polarity folders, 0. for negative_polarity folders
#   labels:   0. for truthful folders,          1. for deceptive folders
# The truthful folders are matched with a wildcard because the literal pattern
# 'positive_polarity/truthful_from_Web' appears to match nothing: the recorded run
# produced only 1080 + 120 = 1200 rows, consistent with one of the four folders being skipped.
# NOTE(review): the v1.4 archive is believed to name that folder
# 'truthful_from_TripAdvisor' -- confirm against the extracted files.
REVIEW_SOURCES = [
  ('positive_polarity/truthful_from_*', 1., 0.),
  ('positive_polarity/deceptive_from_MTurk', 1., 1.),
  ('negative_polarity/truthful_from_*', 0., 0.),
  ('negative_polarity/deceptive_from_MTurk', 0., 1.),
]


def load_reviews(root='/content/op_spam_v1.4'):
  """Read every review file under `root` into one DataFrame.

  Args:
    root: directory that contains the `positive_polarity` and
      `negative_polarity` folders.

  Returns:
    A DataFrame with columns 'text', 'polarity' and 'labels', one row per
    `*/*.txt` file found below each entry of REVIEW_SOURCES, with a fresh
    0..n-1 index. Rows follow the order of REVIEW_SOURCES, and files within
    a folder are ordered by path.
  """
  import warnings  # local import keeps this cell self-contained

  records = []
  for pattern, polarity, label in REVIEW_SOURCES:
    # Sort the matches: glob order is filesystem-dependent, and the row order
    # feeds the seeded train/test split that follows.
    paths = sorted(glob.glob(f'{root}/{pattern}/*/*.txt'))
    if not paths:
      # An empty folder silently changes the class balance, so make it visible.
      warnings.warn(f'No review files matched {root}/{pattern}/*/*.txt')
    for path in paths:
      with open(path, 'r', encoding='utf-8') as f:
        records.append({'text': f.read(), 'polarity': polarity, 'labels': label})
  # Build the frame once instead of concatenating one row at a time.
  return pd.DataFrame(records, columns=['text', 'polarity', 'labels'])


df = load_reviews()

In [None]:
# Hold out 10% of the rows of df as df_test; the fixed random_state seeds the shuffle.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Wrap both pandas splits as Hugging Face datasets and cast the "labels" column
# to a two-class ClassLabel feature.
label_feature = datasets.ClassLabel(num_classes=2, names=['neg', 'pos'])
train_dataset = datasets.Dataset.from_pandas(df_train).cast_column("labels", label_feature)
test_dataset = datasets.Dataset.from_pandas(df_test).cast_column("labels", label_feature)

Casting the dataset:   0%|          | 0/1080 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

## Training Model

In [None]:
# Class-id <-> name mapping stored in the model config. In the loading cell,
# labels 0 is assigned to the truthful folders and 1 to the deceptive folders.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# The reverse lookup is derived from the forward one so the two cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Load the tokenizer and a two-class sequence-classification model from the same checkpoint.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of reviews for the classifier.

    Args:
        examples: batch mapping handed over by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.

    Returns:
        The tokenizer output for those texts, truncated but not padded.
    """
    # Padding is left to the DataCollatorWithPadding given to the Trainer, which pads
    # each batch to its own longest sequence. Padding every review to the maximum
    # length here would make that collator a no-op and waste compute. Tensors are not
    # requested either: the results are stored back into the dataset by `map`.
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Tokenize both splits in batches, replacing each dataset with its tokenized version.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/1080 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
# Load the 'f1' metric from the evaluate library; compute_metrics calls metric.compute on it.
metric = evaluate.load('f1')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max class of each row
        of logits compared against the labels.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_ids, references=references)
    return result

In [None]:
# Collator passed to the Trainer below; it is built around the same tokenizer used to encode the reviews.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Hyper-parameters for the fine-tuning run: 3 epochs, batches of 16, with an
# evaluation and a checkpoint at the end of every epoch.
# NOTE(review): newer transformers releases are believed to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
# NOTE(review): no `metric_for_best_model` is set, so the checkpoint restored by
# `load_best_model_at_end` is presumably picked by validation loss, not F1 -- confirm.
training_args = TrainingArguments(
    output_dir="scam_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Log the training loss once per epoch. The recorded run finishes after 204 steps
    # and its results table shows "No log" for the training loss in every epoch.
    logging_strategy="epoch",
    load_best_model_at_end=True,
    overwrite_output_dir=True
)

In [None]:
# Assemble the Trainer from the pieces built in the cells above.
trainer = Trainer(
    # what to train and how
    args=training_args,
    model=model,
    # how examples are encoded and batched
    data_collator=data_collator,
    tokenizer=tokenizer,
    # data: the held-out split doubles as the per-epoch evaluation set
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    # metric reported at each evaluation
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning; evaluation and checkpointing happen once per epoch as set in training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.289313,0.945055
2,No log,0.414178,0.930481
3,No log,0.37392,0.945055


TrainOutput(global_step=204, training_loss=0.1077884505776798, metrics={'train_runtime': 325.5557, 'train_samples_per_second': 9.952, 'train_steps_per_second': 0.627, 'total_flos': 852479819366400.0, 'train_loss': 0.1077884505776798, 'epoch': 3.0})

In [None]:
# Save the trained model to Drive; the inference cell below loads it from this same path.
trainer.save_model('/content/drive/MyDrive/VarunModel/')

## Inference on Yelp Dataset

In [None]:
# Only the test split of the Yelp review corpus is used for inference.
yelp_test = load_dataset('yelp_review_full', split='test')

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Classify every Yelp test review with the fine-tuned model saved on Drive.
# Use the first GPU when one is available and fall back to the CPU (-1) otherwise,
# instead of failing on a CPU-only runtime.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-classification", model='/content/drive/MyDrive/VarunModel/', device=device)

# padding=True pads to the longest sequence rather than to the model maximum, so
# short reviews no longer cost as much as the longest possible input.
predictions = pipe(KeyDataset(yelp_test, "text"), batch_size=32, truncation=True, padding=True)

# One {'label', 'score'} dict per review, collected in dataset order.
all_preds = list(tqdm(predictions, total=len(yelp_test)))

  0%|          | 33/50000 [00:02<49:44, 16.74it/s]  

[{'label': 'NEGATIVE', 'score': 0.9648427367210388}, {'label': 'NEGATIVE', 'score': 0.8985695838928223}, {'label': 'NEGATIVE', 'score': 0.9871547222137451}, {'label': 'POSITIVE', 'score': 0.9556305408477783}, {'label': 'POSITIVE', 'score': 0.56645667552948}, {'label': 'NEGATIVE', 'score': 0.7224077582359314}, {'label': 'NEGATIVE', 'score': 0.9839990735054016}, {'label': 'NEGATIVE', 'score': 0.9785626530647278}, {'label': 'NEGATIVE', 'score': 0.9183260202407837}, {'label': 'NEGATIVE', 'score': 0.9920582175254822}, {'label': 'NEGATIVE', 'score': 0.8995712399482727}, {'label': 'POSITIVE', 'score': 0.5864787101745605}, {'label': 'NEGATIVE', 'score': 0.9554505944252014}, {'label': 'NEGATIVE', 'score': 0.6479020118713379}, {'label': 'POSITIVE', 'score': 0.9894857406616211}, {'label': 'NEGATIVE', 'score': 0.9368233680725098}, {'label': 'POSITIVE', 'score': 0.913409948348999}, {'label': 'NEGATIVE', 'score': 0.987863302230835}, {'label': 'POSITIVE', 'score': 0.9907298684120178}, {'label': 'POSI

100%|██████████| 50000/50000 [29:24<00:00, 28.34it/s]


In [None]:
# Block forever (presumably to keep the runtime from going idle). Sleeping between
# wake-ups keeps the cell blocked without pinning a CPU core the way a bare
# `pass` loop does.
import time

while True:
  time.sleep(60)