In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
import joblib

from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report

from tqdm.auto import tqdm

ModuleNotFoundError: No module named 'seaborn'

In [None]:
!pip install pytorch_lightning

In [None]:
path_input = 'financial_phrasebank_arabic.csv'
df = pd.read_csv(path_input, lineterminator='\n')


def _drop_after_marker(sentence, marker='ترجمة'):
    """Return the text of ``sentence`` before the first ``marker``; unchanged if absent."""
    if marker in sentence:
        return sentence.split(marker)[0]
    return sentence


# Remove the extra sentence from each row
df['sentence'] = df['sentence'].apply(_drop_after_marker)

df.head()

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
train, val = train_test_split(df, random_state=42, test_size=0.1)

# Persist both partitions (without the index column) for the data module to read back.
for split_frame, split_file in ((train, "train.csv"), (val, "val.csv")):
    split_frame.to_csv(split_file, index=False)

In [None]:
class ArabicDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per access and pairs it with its label.

    Args:
        data: Table with a ``"sentence"`` column (text) and a ``"label"`` column
            (values convertible to ``torch.long``).
        max_len: Length every example is padded or truncated to.
        model_name: Hugging Face checkpoint whose tokenizer is loaded. Defaults to
            the AraBERT v2 checkpoint that used to be hard-coded here.
    """

    def __init__(self,data,max_len,model_name='aubmindlab/bert-base-arabertv2'):
        super().__init__()
        self.labels = data["label"].values
        self.texts = data["sentence"].values
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self,idx):
        """Return ``{"inputs": {...tokenizer tensors...}, "labels": LongTensor}`` for row ``idx``."""
        # Collapse every run of whitespace (including newlines) into a single space.
        text = " ".join(self.texts[idx].split())
        label = self.labels[idx]
        encoded = self.tokenizer(text,padding='max_length',
                                 max_length=self.max_len,truncation=True,return_tensors="pt")
        # Index 0 selects the single encoded example from each returned tensor.
        # Forwarding whatever keys the tokenizer produced (input_ids,
        # token_type_ids, attention_mask for BERT) keeps this working for
        # tokenizers that do not emit token_type_ids.
        return {
            "inputs": {key: tensor[0] for key, tensor in encoded.items()},
            "labels": torch.tensor(label,dtype=torch.long)
        }

In [None]:
class ArabicDataModule(pl.LightningDataModule):
    """Lightning data module that builds ``ArabicDataset`` objects from two CSV files.

    Args:
        train_path: CSV file read for the training split.
        val_path: CSV file read for the validation split (also served as the test split).
        batch_size: Batch size used by every dataloader.
        max_len: Sequence length forwarded to ``ArabicDataset``.
    """

    def __init__(self,train_path,val_path,batch_size=12,max_len=100):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size
        self.max_len = max_len

    def setup(self,stage=None):
        """Read both CSV files and wrap each in an ``ArabicDataset``; ``stage`` is not used."""
        train_frame = pd.read_csv(self.train_path)
        val_frame = pd.read_csv(self.val_path)
        self.train_dataset = ArabicDataset(data=train_frame, max_len=self.max_len)
        self.val_dataset = ArabicDataset(data=val_frame, max_len=self.max_len)

    def _make_loader(self, dataset, shuffle):
        """Build a ``DataLoader`` over ``dataset`` with the configured batch size."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the validation dataset (no separate test split is kept)."""
        return self._make_loader(self.val_dataset, shuffle=False)

In [None]:
n_classes = 3
class ArabicBertModel(pl.LightningModule):
    """Pretrained encoder followed by a single linear classification head.

    Args:
        model_name: Hugging Face checkpoint loaded through ``AutoModel``.
            Defaults to the AraBERT v2 checkpoint that used to be hard-coded here.
        num_classes: Number of output logits; defaults to the module-level ``n_classes``.
    """

    def __init__(self, model_name="aubmindlab/bert-base-arabertv2", num_classes=n_classes):
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of assuming 768, so the
        # head still matches if a checkpoint of a different size is loaded.
        self.fc = nn.Linear(self.bert_model.config.hidden_size, num_classes)

    def forward(self,inputs):
        """Run the encoder on the tokenizer tensors in ``inputs`` and return class logits."""
        out = self.bert_model(**inputs)
        # NOTE(review): for BERT-style encoders element 1 of the output is the
        # pooled sentence representation (`pooler_output`), not the per-token
        # hidden states — confirm if a different architecture is plugged in.
        pooled_output = out[1]
        return self.fc(pooled_output)

    def configure_optimizers(self):
        """Optimise every parameter (encoder and head) with AdamW at a fixed learning rate."""
        return optim.AdamW(self.parameters(), lr=0.0001)

    def criterion(self,output,target):
        """Cross-entropy between logits ``output`` and integer class ids ``target``."""
        return nn.CrossEntropyLoss()(output,target)

    #TODO: adding metrics
    def training_step(self,batch,batch_idx):
        """Compute and log ``train_loss`` for one batch; the loss is returned for backprop."""
        x,y = batch["inputs"],batch["labels"]
        out = self(x)
        loss = self.criterion(out,y)
        metrics = {"train_loss": loss}
        self.log_dict(metrics)
        return loss

    def validation_step(self,batch,batch_idx):
        """Compute and log ``val_loss`` for one batch; returns the logged metrics dict."""
        x, y = batch["inputs"],batch["labels"]
        out = self(x)
        loss = self.criterion(out,y)
        metrics = {"val_loss": loss}
        self.log_dict(metrics)
        return metrics

In [None]:

# Seed Python, NumPy and PyTorch so head initialisation and batch shuffling are repeatable.
pl.seed_everything(42)

dm = ArabicDataModule(train_path="train.csv",
                val_path = "val.csv",
                batch_size=128, max_len=70)

model = ArabicBertModel()
# Early stopping is left disabled. The model logs only `train_loss` and `val_loss`,
# so a callback such as EarlyStopping(monitor="val_f1") would not find its metric;
# monitor "val_loss" if early stopping is enabled.
trainer = pl.Trainer(max_epochs=10, default_root_dir='.')
trainer.fit(model,dm)

In [None]:
# Serialise the whole model object (not only its state_dict) to disk.
torch.save(obj=model, f='arabert_financialnews_sentimentanalysis.pth')

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): this unpickles a complete model object, so only load files you trust.
# Newer PyTorch releases default torch.load to weights_only=True, which rejects fully
# pickled modules — confirm the installed version and pass weights_only=False if needed.
model = torch.load('arabert_financialnews_sentimentanalysis.pth',  map_location=device)
model.to(device)
model.eval()

test_dataloader = dm.test_dataloader()

batch_preds = []
batch_labels = []

# Iterating through tqdm advances and closes the progress bar automatically.
for batch in tqdm(test_dataloader, desc="evaluating"):
    x,y = batch["inputs"],batch["labels"]
    inp = {k: v.to(device) for k, v in x.items()}

    with torch.no_grad():
        outputs = model(inp)

    # Move each batch to the CPU straight away so results do not pile up on the device.
    batch_preds.append(torch.argmax(outputs, dim=1).cpu())
    batch_labels.append(y.cpu())

preds = torch.cat(batch_preds)
real_values = torch.cat(batch_labels)

# target_names must be strings (integers raise TypeError when the report is formatted).
# Passing `labels` keeps the report at three rows even if a class is absent from the split.
class_ids = [0, 1, 2]
print(classification_report(real_values.numpy(), preds.numpy(),
                            labels=class_ids,
                            target_names=[str(class_id) for class_id in class_ids]))

In [4]:
# Read the news table and the table of predicted labels from the data/ folder.
news_csv, predicted_labels_csv = 'data/news_data.csv', 'data/news_data_predicted_labels.csv'
df = pd.read_csv(news_csv)
labels = pd.read_csv(predicted_labels_csv)

In [9]:
# Preview the first five rows of the predicted-labels table.
labels.head(n=5)

Unnamed: 0,Predicted_Labels
0,1
1,2
2,2
3,1
4,1


In [10]:
# combine columns of df and labels into one dataframe
# Column assignment aligns on the index, so a row-count mismatch would silently
# produce NaN labels or drop predictions; fail loudly instead.
if len(df) != len(labels):
    raise ValueError(
        f"row count mismatch: df has {len(df)} rows but labels has {len(labels)}"
    )
df['label'] = labels['Predicted_Labels']

df.head()

Unnamed: 0,Title,Date,Tag,label
0,"خسائر ""معدنية"" ترتفع 7.8% إلى 7.56 مليون ريال....",8 أغسطس,نتائج أعمال الشركات,1
1,"""أنابيب"" تفوز بعقد من ""أرامكو السعودية"" بقيمة ...",8 أغسطس,أخبار الشركة,2
2,"""البحري"" تمدد مذكرة مع ""مجموعة عجلان"" لبحث إنش...",8 أغسطس,استثمارات الشركات,2
3,"لجنة المنازعات الزكوية تلزم ""جبل عمر"" بتسديد 3...",8 أغسطس,ضرائب,1
4,"""أليانز"" تتحول للربحية بـ 13.16 مليون ريال خلا...",8 أغسطس,نتائج أعمال الشركات,1


In [11]:
# Write df (now including the `label` column) to data/news_data.csv without the index;
# this is the same path the news table was read from, so the file is overwritten.
df.to_csv(path_or_buf='data/news_data.csv', index=False)