In [22]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd

torch.backends.cudnn.deterministic = True

In [23]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2

In [24]:
df = pd.read_csv('./crawling/cleaned_file.csv')
df

Unnamed: 0,topic,title,source,article_link,article,sentiments
0,Microsoft,The Top 7 Tech Stocks to Buy in March 2024,InvestorPlace,https://investorplace.com/2024/03/the-top-7-te...,Microsoft (MSFT) is still growing at a fast pa...,positive
1,Microsoft,Microsoft-owned LinkedIn makes this 'first-eve...,Times of India,https://timesofindia.indiatimes.com/gadgets-ne...,Microsoft-owned LinkedIn generated $1.7 billio...,positive
2,Microsoft,Microsoft debuts Copilot for finance pros,Computerworld,https://www.computerworld.com/article/3714142/...,Microsoft has launched a Copilot assistant for...,neutral
3,Microsoft,Microsoft's Path to Becoming the Largest Compa...,The Motley Fool,https://www.fool.com/investing/2024/03/06/micr...,Microsoft's Path to Becoming the Largest Compa...,positive
4,Microsoft,Top Analyst Brad Reback Weighs in on Microsoft...,Tipranks,https://www.tipranks.com/news/top-analyst-brad...,Non-deal roadshows (NDRs) allow institutional ...,positive
...,...,...,...,...,...,...
10856,Crowdstrike Holdings,CrowdStrike (CRWD) Q4 Earnings Report Preview:...,Yahoo Finance,https://finance.yahoo.com/news/crowdstrike-crw...,"In this article, CrowdStrike (NASDAQ:CRWD) wil...",positive
10857,Crowdstrike Holdings,CRWD Stock Earnings: CrowdStrike Holdings Beat...,InvestorPlace,https://investorplace.com/earning-results/2024...,CrowdStrike Holdings (NASDAQ: CRWD) just repor...,neutral
10858,Crowdstrike Holdings,"Insider Selling: CrowdStrike Holdings, Inc. (N...",Defense World,https://www.defenseworld.net/2024/03/24/inside...,"CrowdStrike Holdings, Inc. (NASDAQ:CRWD) Presi...",negative
10859,Crowdstrike Holdings,CrowdStrike (NASDAQ:CRWD) Shares Down 0.2% on ...,MarketBeat,https://www.marketbeat.com/instant-alerts/nasd...,"CrowdStrike Holdings, Inc. (NASDAQ:CRWD) share...",negative


In [25]:
columns_drop = ['title','topic', 'source', 'article_link' ]
df.drop(columns=columns_drop, inplace=True)

In [32]:
df.to_csv('twofortraining.csv',index = False)

In [33]:
import torchtext.data as data

TEXT = data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en_core_web_sm'
)

LABEL = data.LabelField(dtype=torch.long)

In [34]:
fields = [('TEXT_COLUMN_NAME', TEXT), ('LABEL_COLUMN_NAME', LABEL)]

dataset = data.TabularDataset(
    path='twofortraining.csv', format='csv',
    skip_header=True, fields=fields)

In [29]:
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 8689
Num Test: 2172


In [30]:
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')

Num Train: 7386
Num Validation: 1303


In [31]:
print(vars(train_data.examples[0]))


{'TEXT_COLUMN_NAME': ['Monster', 'Beverage'], 'LABEL_COLUMN_NAME': 'Why Monster Beverage (MNST) is a Top Momentum Stock for the Long-Term'}
