In [1]:
import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
from torch.nn.functional import softmax

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os
import shutil
import zipfile

from IPython.display import HTML
from IPython.display import FileLink

2024-04-19 03:42:11.438879: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-19 03:42:11.439006: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-19 03:42:11.588216: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
train_data = pd.read_csv("/kaggle/input/bert-credibility-data/full_data_train.csv")
test_data = pd.read_csv("/kaggle/input/bert-credibility-data/full_data_test.csv")
dev_data = pd.read_csv("/kaggle/input/bert-credibility-data/full_data_dev.csv")
dev_data.head(5)

Unnamed: 0,title,text,label
0,"""""hillary clinton planning trump russian agent...",""""" hillary clinton has been caught in yet anot...",0
1,"""""poor, displaced and anxious in north carolin...","""""kinston, n. c. — at a community college t...",1
2,"""""winning: gallup reports u.s. job creation in...","""""gallup's march job creation index hit a reco...",1
3,"""""paul ryan: 'no evidence' of sessions colludi...","""""house speaker paul ryan pushed back against ...",1
4,"""""trump targets tech's h-1b visa hiring tool""""","""""san francisco - president trump's relationsh...",1


In [3]:
full_df = pd.concat([train_data, dev_data, test_data], axis=0)
full_df.head(5)
full_df.shape[0]

24187

In [4]:
financial_terms = [
    "asset", "liability", "equity", "income", "expense", "revenue", "profit",
    "cash", "debt", "credit", "capital", "tax", "dividend", "bond", "stock", "option", 
    "mutual", "etf", "futures", "index", "yield", "return", "risk", "volatility", 
    "liquidity", "market", "exchange", "broker", "investor", "analyst", "portfolio", 
    "allocation", "diversification", "benchmark", "performance", "price", "value", 
    "growth", "capitalization", "ipo", "m&a", "divestiture", "restructuring", 
    "acquisition", "merger", "derivative", "strike", "expiration", 
    "premium", "hedging", "leverage", "margin", "collateral", "interest", "rate", 
    "federal", "central", "monetary", "fiscal", "inflation", "deflation", "recession", 
    "depression", "expansion", "boom", "gdp", "cpi", "ppi", "unemployment", 
    "labor", "wage", "salary", "compensation", "retirement", "pension", "annuity", 
    "social", "healthcare", "insurance", "underwriting", "claims", "premiums", 
    "deductible", "coverage", "reinsurance", "catastrophe", "underlying", "actuarial", 
    "solvency", "regulation"
]

# Define filtering function
def filter_text(row):
    for word in financial_terms:
        if word in row['title'] or word in row['text']:
            return True
    return False

# Apply filtering function to create mask
mask = full_df.apply(filter_text, axis=1)

# Filter out rows using the mask
filtered_df = full_df[mask]

filtered_df.head(10)

Unnamed: 0,title,text,label
0,"""""hillary abruptly stops at rally, couldn't hi...","""""hillary abruptly stops at rally, couldn't hi...",0
1,"""""royal troon golf club votes to allow female ...","""""london — there are no longer any golf clu...",1
2,"""""scientists shocked as they find melanesians ...","""""hints of an unidentified, extinct human spec...",0
3,"""""the scope of the orlando carnage - the new y...","""""these locations are never random. these targ...",1
4,"""""trump signs resolution overturning obama pla...","""""president donald trump signed a resolution t...",1
5,"""""amazon giving away free budget phones.""""","""""in a new move to pull market share away from...",0
6,"""""september new homes sales rise——-back to 199...","""""september new homes sales rise back to 1992 ...",0
7,"""""the smothers brothers and the birth of tv bu...","""""fifty years ago right about now, two unassum...",1
10,"""""why you should drink carrot juice daily? how...","""""2 cups carrots, roughly chopped 1 tbsp fresh...",0
12,"""""report: susan rice ordered 'spreadsheets' of...","""""president barack obama's national security a...",1


In [5]:
filtered_df.shape[0]

20260

In [7]:
uncredibilitied_stock_data.head(5)

Unnamed: 0,text,published,title,site,-10,-9,-8,-7,-6,-5,...,4,5,6,7,8,9,10,percent_change,change,credibility
0,17 Hours Ago | 02:56 \nEmerging markets soared...,2018-01-03,Emerging markets are set for an even bigger ra...,cnbc.com,,,2680.5,2682.620117,2687.540039,2673.610107,...,,2747.709961,2751.290039,2748.22998,2767.560059,2786.23999,,-0.004012,0.0,0.85
1,Cramer reflects on how Trump's actions are fue...,2018-01-02,Cramer reflects on how Trump's actions are fue...,cnbc.com,,,,2680.5,2682.620117,2687.540039,...,,,2747.709961,2751.290039,2748.22998,2767.560059,2786.23999,-0.006358,0.0,0.85
2,"Published: Jan 2, 2018 5:59 p.m. ET Share \nFe...",2018-01-02,The Wall Street Journal: Peter Thiel’s VC firm...,wsj.com,,,,2680.5,2682.620117,2687.540039,...,,,2747.709961,2751.290039,2748.22998,2767.560059,2786.23999,-0.006358,0.0,0.8
3,By Reuters 8:23 AM EST \nTelevision host Hoda ...,2018-01-02,Hoda Kotb Will Replace Matt Lauer on NBC’s ‘To...,fortune.com,,,,2680.5,2682.620117,2687.540039,...,,,2747.709961,2751.290039,2748.22998,2767.560059,2786.23999,-0.006358,0.0,0.88
4,"January 3, 2018 / 12:54 AM / Updated 14 hours ...",2018-01-03,U.S. judge orders government to release Iraqis...,reuters.com,,,2680.5,2682.620117,2687.540039,2673.610107,...,,2747.709961,2751.290039,2748.22998,2767.560059,2786.23999,,-0.004012,0.0,1.0


In [8]:
train_df, temp_df = train_test_split(filtered_df, test_size=0.3, random_state=42)
dev_df, test_df = train_test_split(filtered_df, test_size=0.33, random_state=42)

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_df["text"] = tokenizer.cls_token + train_df['title'] + tokenizer.sep_token + train_df['text'] + tokenizer.sep_token
train_df = train_df.drop(['title'], axis=1)

dev_df["text"] = tokenizer.cls_token + dev_df['title'] + tokenizer.sep_token + dev_df['text'] + tokenizer.sep_token
dev_df = dev_df.drop(['title'], axis=1)

test_df["text"] = tokenizer.cls_token + test_df['title'] + tokenizer.sep_token + test_df['text'] + tokenizer.sep_token
test_df = test_df.drop(['title'], axis=1)

train_df.head()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Unnamed: 0,text,label
3646,"[CLS]""""megan fox on 'ninja turtles' and female...",1
15430,"[CLS]""""those fleeing mosul face summary execut...",0
8526,"[CLS]""""raptors escape past, ousting the heat i...",1
19095,"[CLS]""""shock: woman's stolen car returns with ...",0
10618,"[CLS]""""the crisis of the european union is irr...",0


In [10]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']

        # Tokenize text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [11]:
train_dataset = CustomDataset(train_df, tokenizer, 512)
dev_dataset = CustomDataset(dev_df, tokenizer, 512)
test_dataset = CustomDataset(test_df, tokenizer, 512)

In [12]:
def get_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate metrics using scikit-learn
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [13]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir='/kaggle/working/models',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/kaggle/working/logs',            # directory for storing
    logging_steps=10,                # log training loss every n steps
    evaluation_strategy="epoch",     # evaluate model at the end of each epoch
    save_strategy="epoch",             # save model checkpoint at the end of each epoch
    save_total_limit=3,              # Limit the total number of saved models
    save_steps=500,
)

credibility_trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments
    train_dataset=train_dataset,     # training dataset
    eval_dataset=dev_dataset,        # evaluation dataset
    tokenizer=tokenizer,             # tokenizer for encoding input data
    compute_metrics=get_metrics
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
credibility_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3313,0.277159,0.915942,0.915901,0.915942,0.915918
2,0.2032,0.246181,0.919699,0.919655,0.919699,0.919672
3,0.1735,0.210715,0.923457,0.92339,0.923457,0.923396


TrainOutput(global_step=2661, training_loss=0.28352810901074155, metrics={'train_runtime': 4533.8515, 'train_samples_per_second': 9.384, 'train_steps_per_second': 0.587, 'total_flos': 5635957943218176.0, 'train_loss': 0.28352810901074155, 'epoch': 3.0})

In [16]:
def zip_and_move_folder(source_folder, zip_name, destination_folder):
    # Ensure source_folder exists
    if not os.path.exists(source_folder):
        print(f"Error: Folder '{source_folder}' not found.")
        return

    # Ensure destination_folder exists
    if not os.path.exists(destination_folder):
        print(f"Error: Destination folder '{destination_folder}' not found.")
        return

    # Ensure zip_name has a .zip extension
    if not zip_name.endswith('.zip'):
        zip_name += '.zip'

    # Zip the source_folder
    zip_path = os.path.join(destination_folder, zip_name)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(source_folder):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, source_folder))

    # Move the zipped folder to destination_folder
    shutil.move(zip_path, os.path.join(destination_folder, zip_name))

    print(f"Folder '{source_folder}' zipped as '{zip_name}' and moved to '{destination_folder}'.")

source_folder = "/kaggle/working/models/checkpoint-1774"
zip_name = "model.zip"
destination_folder = '/kaggle/working'

zip_and_move_folder(source_folder, zip_name, destination_folder)

Folder '/kaggle/working/models/checkpoint-1774' zipped as 'model.zip' and moved to '/kaggle/working'.


In [17]:
test_dataset = CustomDataset(test_df, tokenizer, 512)

results = credibility_trainer.evaluate(eval_dataset=test_dataset)

results_df = pd.DataFrame(results, index=[0])
results_df

Unnamed: 0,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.246985,0.919533,0.919479,0.919533,0.919492,304.1941,21.979,0.345,3.0


In [40]:
uncredibilitied_stock_data = pd.read_csv("/kaggle/input/news-and-stock-data/news_and_stocks_with_domain.csv")
uncredibilitied_stock_data = uncredibilitied_stock_data[['title', 'text']]
uncredibilitied_stock_data = uncredibilitied_stock_data.dropna()
uncredibilitied_stock_data.head(5)

Unnamed: 0,title,text
0,Emerging markets are set for an even bigger ra...,17 Hours Ago | 02:56 \nEmerging markets soared...
1,Cramer reflects on how Trump's actions are fue...,Cramer reflects on how Trump's actions are fue...
2,The Wall Street Journal: Peter Thiel’s VC firm...,"Published: Jan 2, 2018 5:59 p.m. ET Share \nFe..."
3,Hoda Kotb Will Replace Matt Lauer on NBC’s ‘To...,By Reuters 8:23 AM EST \nTelevision host Hoda ...
4,U.S. judge orders government to release Iraqis...,"January 3, 2018 / 12:54 AM / Updated 14 hours ..."


In [41]:
stock_data = uncredibilitied_stock_data.copy()
print(stock_data.columns)
stock_data["text"] = stock_data["text"].str.lower()
stock_data["title"] = stock_data['title'].str.lower()

stock_data["text"] = tokenizer.cls_token + stock_data['title'] + tokenizer.sep_token + stock_data['text'] + tokenizer.sep_token
stock_data = stock_data["text"]
stock_data.head(5)

Index(['title', 'text'], dtype='object')


0    [CLS]emerging markets are set for an even bigg...
1    [CLS]cramer reflects on how trump's actions ar...
2    [CLS]the wall street journal: peter thiel’s vc...
3    [CLS]hoda kotb will replace matt lauer on nbc’...
4    [CLS]u.s. judge orders government to release i...
Name: text, dtype: object

In [43]:
data_to_predict = stock_data.tolist()
model = model.to('cuda')

In [44]:
probs = []
batch_size = 4
for i in range(0, len(data_to_predict), batch_size):
    batch_data = data_to_predict[i:i+batch_size]

    # Tokenize batch_data here using your tokenizer
    tokenized_data = tokenizer(batch_data, truncation=True, padding=True, return_tensors="pt")

    with torch.no_grad():
        # Process tokenized_data here
        outputs = model(**tokenized_data.to('cuda'))  # Assuming model is on the appropriate device
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        
        probs.append(probabilities)

combined_probabilities = []

# Iterate through probs and concatenate every batch_size tensors
for i in range(0, len(probs), batch_size):
    # Extract a batch of tensors
    batch_tensors = probs[i:i + batch_size]
    
    # Concatenate the batch tensors along dim=0 (assuming they have the same shape)
    concatenated_tensor = torch.cat(batch_tensors, dim=0)
    
    # Append the concatenated tensor to the combined list
    combined_probabilities.append(concatenated_tensor)

class1_probs = []

# Process each batch
for probs_batch in combined_probabilities:
    # Extract probabilities of class 1 (index 1)
    class1_probs_batch = probs_batch[:, 1]  # Assuming class 1 is in the second column
    class1_probs.extend(class1_probs_batch.tolist())  # Convert to list and extend the main list

In [45]:
uncredibilitied_stock_data['credibility'] = class1_probs
uncredibilitied_stock_data.head()

Unnamed: 0,title,text,credibility
0,Emerging markets are set for an even bigger ra...,17 Hours Ago | 02:56 \nEmerging markets soared...,0.004278
1,Cramer reflects on how Trump's actions are fue...,Cramer reflects on how Trump's actions are fue...,0.005587
2,The Wall Street Journal: Peter Thiel’s VC firm...,"Published: Jan 2, 2018 5:59 p.m. ET Share \nFe...",0.003425
3,Hoda Kotb Will Replace Matt Lauer on NBC’s ‘To...,By Reuters 8:23 AM EST \nTelevision host Hoda ...,0.004633
4,U.S. judge orders government to release Iraqis...,"January 3, 2018 / 12:54 AM / Updated 14 hours ...",0.073293


In [46]:
uncredibilitied_stock_data.to_csv("/kaggle/working/credibility_labeled_stock_data.csv", index=False)

In [47]:
sum(class1_probs) / len(class1_probs)

0.07541116943581183