In [None]:
# Install the two Hugging Face packages imported below: `transformers` and `evaluate`.
# NOTE(review): versions are not pinned, so a fresh environment may pull different releases.
!pip install transformers evaluate

In [None]:
import pandas as pd
from tqdm import tqdm

# Read the header-less, latin-1 encoded CSV and give its columns explicit names.
data = pd.read_csv(
    '/content/all-data.csv',
    header=None,
    encoding='latin-1',
)
data.columns = ["labels", "text"]
# Bare expression: the notebook renders the frame as this cell's output.
data

In [None]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [None]:
# Tokenizer and 3-class sequence classifier, both loaded from the same checkpoint.
# The notebook's id2label/label2id maps are NOT passed to the model here.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=3,
)

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [None]:
def preprocess_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Only ``truncation=True`` is requested; no padding argument is passed.
    """
    encoded = tokenizer(examples, truncation=True)
    return encoded


# Column values as plain Python lists.
texts = data["text"].tolist()
labels = data["labels"].tolist()

# 75/25 train/test split with a fixed seed.
# NOTE(review): the split is not stratified by label - confirm that is intended.
(text_train, text_test, labels_train, labels_test) = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.25,
)

tokenized_text_train = preprocess_function(text_train)
tokenized_text_test = preprocess_function(text_test)

print("train size:", len(labels_train))
print("test size:", len(labels_test))

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class is the
    arg-max along axis 1 of ``predictions``; the return value is whatever
    ``accuracy.compute`` returns for those predictions against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Class-name <-> integer-id lookup tables; id2label is the exact inverse of label2id.
# NOTE(review): presumably the ProsusAI/finbert checkpoint carries its own label order in
# its config - confirm this ordering agrees with it.
label2id = {"positive": 0, "neutral": 1, "negative": 2}
id2label = {class_id: name for name, class_id in label2id.items()}

class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class ids.

    ``encodings`` is stored as given and must support ``.items()``, with each value
    indexable by sample position. ``labels`` holds one label per sample; every label is
    translated through ``label2id`` once, at construction time (an unknown label raises
    ``KeyError``).
    """

    def __init__(self, encodings, labels, label2id):
        self.encodings = encodings
        encoded_labels = []
        for value in labels:
            encoded_labels.append(label2id[value])
        self.labels = encoded_labels

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the class id under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split, together with its string labels, in a BertDataset.
train_dataset = BertDataset(
    encodings=tokenized_text_train,
    labels=labels_train,
    label2id=label2id,
)
test_dataset = BertDataset(
    encodings=tokenized_text_test,
    labels=labels_test,
    label2id=label2id,
)

In [None]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Tokenizer and 3-class sequence classifier, both loaded from the same checkpoint.
# The notebook's id2label/label2id maps are NOT passed to the model here.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=3,
)

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

def preprocess_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Only ``truncation=True`` is requested; no padding argument is passed.
    """
    encoded = tokenizer(examples, truncation=True)
    return encoded


# Column values as plain Python lists.
texts = data["text"].tolist()
labels = data["labels"].tolist()

# 75/25 train/test split with a fixed seed.
# NOTE(review): the split is not stratified by label - confirm that is intended.
(text_train, text_test, labels_train, labels_test) = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.25,
)

tokenized_text_train = preprocess_function(text_train)
tokenized_text_test = preprocess_function(text_test)

print("train size:", len(labels_train))
print("test size:", len(labels_test))


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class is the
    arg-max along axis 1 of ``predictions``; the return value is whatever
    ``accuracy.compute`` returns for those predictions against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Class-name <-> integer-id lookup tables; id2label is the exact inverse of label2id.
# NOTE(review): presumably the ProsusAI/finbert checkpoint carries its own label order in
# its config - confirm this ordering agrees with it.
label2id = {"positive": 0, "neutral": 1, "negative": 2}
id2label = {class_id: name for name, class_id in label2id.items()}

class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class ids.

    ``encodings`` is stored as given and must support ``.items()``, with each value
    indexable by sample position. ``labels`` holds one label per sample; every label is
    translated through ``label2id`` once, at construction time (an unknown label raises
    ``KeyError``).
    """

    def __init__(self, encodings, labels, label2id):
        self.encodings = encodings
        encoded_labels = []
        for value in labels:
            encoded_labels.append(label2id[value])
        self.labels = encoded_labels

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the class id under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split, together with its string labels, in a BertDataset.
train_dataset = BertDataset(
    encodings=tokenized_text_train,
    labels=labels_train,
    label2id=label2id,
)
test_dataset = BertDataset(
    encodings=tokenized_text_test,
    labels=labels_test,
    label2id=label2id,
)

In [None]:
# Fine-tuning configuration for the FinBERT run: 2 epochs, batch size 16, lr 2e-5,
# with evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    # Model-specific directory so this run's checkpoint-* folders are not overwritten by
    # the other fine-tuning runs in this notebook.
    output_dir="my_awesome_model/finbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

# NOTE(review): the evaluation split is the test split, and load_best_model_at_end picks the
# checkpoint using evaluation results on that same split, so the test metrics reported later
# are not from untouched data. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

# Continue with the model object held by the trainer after training.
model = trainer.model

In [None]:
def _predict_labels(texts):
    """Return the predicted class name (via ``id2label``) for every string in ``texts``.

    Uses the notebook-level ``model`` and ``tokenizer``. Inference runs in eval mode and
    without gradient tracking. Each encoded input is moved to the model's device first,
    because the Trainer may have placed the model on a GPU while the tokenizer returns CPU
    tensors. Inputs are truncated, as they were when the training data was tokenized.
    """
    model.eval()
    device = model.device
    predicted = []
    with torch.no_grad():
        for text in tqdm(texts):
            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
            logits = model(**inputs).logits
            # One text per call, so the arg-max over the class axis yields a single id.
            predicted.append(id2label[logits.argmax(dim=-1).item()])
    return predicted


train_preds = _predict_labels(text_train)
test_preds = _predict_labels(text_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def _draw_confusion(axis, matrix, class_names, title):
    """Draw one annotated confusion-matrix heatmap on ``axis`` with ``class_names`` as ticks."""
    sns.heatmap(
        matrix, annot=True, fmt='d', cmap='Spectral',
        xticklabels=class_names, yticklabels=class_names, ax=axis,
    )
    axis.set_title(title)
    axis.set_ylabel("Actual Label")
    axis.set_xlabel("Predicted Label")


# Class order shared by the matrices and the tick labels. It must be passed to
# confusion_matrix explicitly: without `labels=` scikit-learn sorts the classes
# alphabetically, which does not match the order of label2id and mislabels the axes.
labels = list(label2id.keys())

train_conf_mat = confusion_matrix(labels_train, train_preds, labels=labels)
test_conf_mat = confusion_matrix(labels_test, test_preds, labels=labels)

fig, ax = plt.subplots(1, 2, figsize=(20, 8))

_draw_confusion(ax[0], train_conf_mat, labels, "Train Confusion Matrix")
_draw_confusion(ax[1], test_conf_mat, labels, "Test Confusion Matrix")

plt.show()

In [None]:
from sklearn.metrics import fbeta_score

# F-beta (beta=2) on both splits, micro-averaged over the classes.
# NOTE(review): for single-label multiclass predictions a micro-averaged F-score presumably
# equals plain accuracy - confirm that 'micro' is the intended averaging.
train_f2_score, test_f2_score = (
    fbeta_score(y_true, y_pred, beta=2, average='micro')
    for y_true, y_pred in ((labels_train, train_preds), (labels_test, test_preds))
)

print("Train F2 Score: ", train_f2_score)
print("Test F2 Score: ", test_f2_score)

In [None]:
from sklearn.metrics import classification_report

# Class order for the report rows. `labels=` must accompany `target_names=`: without it
# scikit-learn applies the display names to the alphabetically sorted classes, which does
# not match the order of label2id and attaches the wrong name to each row.
labels = list(label2id.keys())

train_report = classification_report(
    labels_train, train_preds, labels=labels, target_names=labels, digits=3,
)
test_report = classification_report(
    labels_test, test_preds, labels=labels, target_names=labels, digits=3,
)

print("Train Classification Report: ")
print(train_report)

print("\nTest Classification Report: ")
print(test_report)

In [None]:
# Same install command as the first cell of the notebook (versions unpinned).
!pip install transformers evaluate

In [None]:
import pandas as pd
from tqdm import tqdm

# Read the header-less, latin-1 encoded CSV again and give its columns explicit names.
data = pd.read_csv(
    '/content/all-data.csv',
    header=None,
    encoding='latin-1',
)
data.columns = ["labels", "text"]
# Bare expression: the notebook renders the frame as this cell's output.
data

In [None]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Tokenizer and 3-class sequence classifier, both loaded from the same checkpoint.
# The notebook's id2label/label2id maps are NOT passed to the model here.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

def preprocess_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Only ``truncation=True`` is requested; no padding argument is passed.
    """
    encoded = tokenizer(examples, truncation=True)
    return encoded


# Column values as plain Python lists.
texts = data["text"].tolist()
labels = data["labels"].tolist()

# 75/25 train/test split with a fixed seed.
# NOTE(review): the split is not stratified by label - confirm that is intended.
(text_train, text_test, labels_train, labels_test) = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.25,
)

tokenized_text_train = preprocess_function(text_train)
tokenized_text_test = preprocess_function(text_test)

print("train size:", len(labels_train))
print("test size:", len(labels_test))


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class is the
    arg-max along axis 1 of ``predictions``; the return value is whatever
    ``accuracy.compute`` returns for those predictions against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Class-name <-> integer-id lookup tables; id2label is the exact inverse of label2id.
label2id = {"positive": 0, "neutral": 1, "negative": 2}
id2label = {class_id: name for name, class_id in label2id.items()}

class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class ids.

    ``encodings`` is stored as given and must support ``.items()``, with each value
    indexable by sample position. ``labels`` holds one label per sample; every label is
    translated through ``label2id`` once, at construction time (an unknown label raises
    ``KeyError``).
    """

    def __init__(self, encodings, labels, label2id):
        self.encodings = encodings
        encoded_labels = []
        for value in labels:
            encoded_labels.append(label2id[value])
        self.labels = encoded_labels

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the class id under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split, together with its string labels, in a BertDataset.
train_dataset = BertDataset(
    encodings=tokenized_text_train,
    labels=labels_train,
    label2id=label2id,
)
test_dataset = BertDataset(
    encodings=tokenized_text_test,
    labels=labels_test,
    label2id=label2id,
)

In [None]:
# Fine-tuning configuration for the DistilBERT run: 2 epochs, batch size 16, lr 2e-5,
# with evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    # Model-specific directory so this run's checkpoint-* folders are not overwritten by
    # the other fine-tuning runs in this notebook.
    output_dir="my_awesome_model/distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

# NOTE(review): the evaluation split is the test split, and load_best_model_at_end picks the
# checkpoint using evaluation results on that same split, so the test metrics reported later
# are not from untouched data. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

# Continue with the model object held by the trainer after training.
model = trainer.model

In [None]:
def _predict_labels(texts):
    """Return the predicted class name (via ``id2label``) for every string in ``texts``.

    Uses the notebook-level ``model`` and ``tokenizer``. Inference runs in eval mode and
    without gradient tracking. Each encoded input is moved to the model's device first,
    because the Trainer may have placed the model on a GPU while the tokenizer returns CPU
    tensors. Inputs are truncated, as they were when the training data was tokenized.
    """
    model.eval()
    device = model.device
    predicted = []
    with torch.no_grad():
        for text in tqdm(texts):
            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
            logits = model(**inputs).logits
            # One text per call, so the arg-max over the class axis yields a single id.
            predicted.append(id2label[logits.argmax(dim=-1).item()])
    return predicted


train_preds = _predict_labels(text_train)
test_preds = _predict_labels(text_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def _draw_confusion(axis, matrix, class_names, title):
    """Draw one annotated confusion-matrix heatmap on ``axis`` with ``class_names`` as ticks."""
    sns.heatmap(
        matrix, annot=True, fmt='d', cmap='Spectral',
        xticklabels=class_names, yticklabels=class_names, ax=axis,
    )
    axis.set_title(title)
    axis.set_ylabel("Actual Label")
    axis.set_xlabel("Predicted Label")


# Class order shared by the matrices and the tick labels. It must be passed to
# confusion_matrix explicitly: without `labels=` scikit-learn sorts the classes
# alphabetically, which does not match the order of label2id and mislabels the axes.
labels = list(label2id.keys())

train_conf_mat = confusion_matrix(labels_train, train_preds, labels=labels)
test_conf_mat = confusion_matrix(labels_test, test_preds, labels=labels)

fig, ax = plt.subplots(1, 2, figsize=(20, 8))

_draw_confusion(ax[0], train_conf_mat, labels, "Train Confusion Matrix")
_draw_confusion(ax[1], test_conf_mat, labels, "Test Confusion Matrix")

plt.show()


In [None]:
from sklearn.metrics import fbeta_score

# F-beta (beta=2) on both splits, micro-averaged over the classes.
# NOTE(review): for single-label multiclass predictions a micro-averaged F-score presumably
# equals plain accuracy - confirm that 'micro' is the intended averaging.
train_f2_score, test_f2_score = (
    fbeta_score(y_true, y_pred, beta=2, average='micro')
    for y_true, y_pred in ((labels_train, train_preds), (labels_test, test_preds))
)

print("Train F2 Score: ", train_f2_score)
print("Test F2 Score: ", test_f2_score)

In [None]:
from sklearn.metrics import classification_report

# Class order for the report rows. `labels=` must accompany `target_names=`: without it
# scikit-learn applies the display names to the alphabetically sorted classes, which does
# not match the order of label2id and attaches the wrong name to each row.
labels = list(label2id.keys())

train_report = classification_report(
    labels_train, train_preds, labels=labels, target_names=labels, digits=3,
)
test_report = classification_report(
    labels_test, test_preds, labels=labels, target_names=labels, digits=3,
)

print("Train Classification Report: ")
print(train_report)

print("\nTest Classification Report: ")
print(test_report)

In [None]:
# ConvBERT (YituTech/conv-bert-base)
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Tokenizer and 3-class sequence classifier, both loaded from the same checkpoint.
# The notebook's id2label/label2id maps are NOT passed to the model here.
tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "YituTech/conv-bert-base",
    num_labels=3,
)

# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

def preprocess_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Only ``truncation=True`` is requested; no padding argument is passed.
    """
    encoded = tokenizer(examples, truncation=True)
    return encoded


# Column values as plain Python lists.
texts = data["text"].tolist()
labels = data["labels"].tolist()

# 75/25 train/test split with a fixed seed.
# NOTE(review): the split is not stratified by label - confirm that is intended.
(text_train, text_test, labels_train, labels_test) = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.25,
)

tokenized_text_train = preprocess_function(text_train)
tokenized_text_test = preprocess_function(text_test)

print("train size:", len(labels_train))
print("test size:", len(labels_test))


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class is the
    arg-max along axis 1 of ``predictions``; the return value is whatever
    ``accuracy.compute`` returns for those predictions against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Class-name <-> integer-id lookup tables; id2label is the exact inverse of label2id.
label2id = {"positive": 0, "neutral": 1, "negative": 2}
id2label = {class_id: name for name, class_id in label2id.items()}

class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class ids.

    ``encodings`` is stored as given and must support ``.items()``, with each value
    indexable by sample position. ``labels`` holds one label per sample; every label is
    translated through ``label2id`` once, at construction time (an unknown label raises
    ``KeyError``).
    """

    def __init__(self, encodings, labels, label2id):
        self.encodings = encodings
        encoded_labels = []
        for value in labels:
            encoded_labels.append(label2id[value])
        self.labels = encoded_labels

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the class id under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split, together with its string labels, in a BertDataset.
train_dataset = BertDataset(
    encodings=tokenized_text_train,
    labels=labels_train,
    label2id=label2id,
)
test_dataset = BertDataset(
    encodings=tokenized_text_test,
    labels=labels_test,
    label2id=label2id,
)

In [None]:
# Fine-tuning configuration for the ConvBERT run: 2 epochs, batch size 16, lr 2e-5,
# with evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    # Model-specific directory so this run's checkpoint-* folders are not overwritten by
    # the other fine-tuning runs in this notebook.
    output_dir="my_awesome_model/convbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

# NOTE(review): the evaluation split is the test split, and load_best_model_at_end picks the
# checkpoint using evaluation results on that same split, so the test metrics reported later
# are not from untouched data. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

# Continue with the model object held by the trainer after training.
model = trainer.model

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def _predict_labels(texts):
    """Return the predicted class name (via ``id2label``) for every string in ``texts``.

    Uses the notebook-level ``model`` and ``tokenizer``. Inference runs in eval mode and
    without gradient tracking. Each encoded input is moved to the model's device first,
    because the Trainer may have placed the model on a GPU while the tokenizer returns CPU
    tensors. Inputs are truncated, as they were when the training data was tokenized.
    """
    model.eval()
    device = model.device
    predicted = []
    with torch.no_grad():
        for text in tqdm(texts):
            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
            logits = model(**inputs).logits
            # One text per call, so the arg-max over the class axis yields a single id.
            predicted.append(id2label[logits.argmax(dim=-1).item()])
    return predicted


def _draw_confusion(axis, matrix, class_names, title):
    """Draw one annotated confusion-matrix heatmap on ``axis`` with ``class_names`` as ticks."""
    sns.heatmap(
        matrix, annot=True, fmt='d', cmap='Spectral',
        xticklabels=class_names, yticklabels=class_names, ax=axis,
    )
    axis.set_title(title)
    axis.set_ylabel("Actual Label")
    axis.set_xlabel("Predicted Label")


# Predictions must come from the model trained in this section. Without this step the
# metrics below would silently reuse train_preds/test_preds left over from an earlier model.
train_preds = _predict_labels(text_train)
test_preds = _predict_labels(text_test)

# Class order shared by the matrices and the tick labels. It must be passed to
# confusion_matrix explicitly: without `labels=` scikit-learn sorts the classes
# alphabetically, which does not match the order of label2id and mislabels the axes.
labels = list(label2id.keys())

train_conf_mat = confusion_matrix(labels_train, train_preds, labels=labels)
test_conf_mat = confusion_matrix(labels_test, test_preds, labels=labels)

fig, ax = plt.subplots(1, 2, figsize=(20, 8))

_draw_confusion(ax[0], train_conf_mat, labels, "Train Confusion Matrix")
_draw_confusion(ax[1], test_conf_mat, labels, "Test Confusion Matrix")

plt.show()

In [None]:
from sklearn.metrics import fbeta_score

# F-beta (beta=2) on both splits, micro-averaged over the classes.
# NOTE(review): for single-label multiclass predictions a micro-averaged F-score presumably
# equals plain accuracy - confirm that 'micro' is the intended averaging.
train_f2_score, test_f2_score = (
    fbeta_score(y_true, y_pred, beta=2, average='micro')
    for y_true, y_pred in ((labels_train, train_preds), (labels_test, test_preds))
)

print("Train F2 Score: ", train_f2_score)
print("Test F2 Score: ", test_f2_score)

In [None]:
from sklearn.metrics import classification_report

# Class order for the report rows. `labels=` must accompany `target_names=`: without it
# scikit-learn applies the display names to the alphabetically sorted classes, which does
# not match the order of label2id and attaches the wrong name to each row.
labels = list(label2id.keys())

train_report = classification_report(
    labels_train, train_preds, labels=labels, target_names=labels, digits=3,
)
test_report = classification_report(
    labels_test, test_preds, labels=labels, target_names=labels, digits=3,
)

print("Train Classification Report: ")
print(train_report)

print("\nTest Classification Report: ")
print(test_report)