### Implementation of CBOW 

In [22]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
import numpy as np
import pandas as pd

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [23]:
from datasets import load_dataset

financial_news = load_dataset('financial_phrasebank', 'sentences_50agree')

In [24]:
financial_news

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})

In [25]:
data = pd.DataFrame(financial_news['train'])
data.head()

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2


In [48]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

def cleaningText(text):
    """
    Text Cleaning:
        - Remove Punctuation
        - Remove Numbers
        - Tokenize Text
        - Stem Text
        - Remove Stopwords
    """ 
    text = re.sub("[^a-zA-Z]", " ", text) # Remove Punctuation
    text = re.sub("[0-9]+", "", text) # Remove Numbers
    text = [ word.lower() for word in word_tokenize(text) if word not in stopwords.words('english') ]
    return " ".join(text)

In [49]:
data['clean_sentence'] = data['sentence'].apply(cleaningText)

In [50]:
data.head()

Unnamed: 0,sentence,label,clean_sentence
0,"According to Gran , the company has no plans t...",1,according gran company plans move production r...
1,Technopolis plans to develop in stages an area...,1,technopolis plans develop stages area less squ...
2,The international electronic industry company ...,0,the international electronic industry company ...
3,With the new production plant the company woul...,2,with new production plant company would increa...
4,According to the company 's updated strategy f...,2,according company updated strategy years baswa...


In [51]:
import pandas as pd
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

In [52]:
bows_counter = CountVectorizer( analyzer = 'word',            # Word level vectorizer
                                lowercase = True,             # Lower case the text
                                ngram_range = (1, 1),         # Create 1 n-grams
                                tokenizer = word_tokenize,   # Use this tokenizer
                                stop_words = 'english',
                                token_pattern = None )     # remove english stopwords

In [53]:
bows_counter.fit(data.clean_sentence)
features = bows_counter.transform(data.clean_sentence).toarray()

In [54]:
features_df = pd.DataFrame(features, columns=bows_counter.get_feature_names_out())
features_df

Unnamed: 0,aaland,aalborg,aalto,aaltonen,aaron,aava,aazhang,ab,abb,abbott,...,zero,zgody,zinc,zip,zloty,zoltan,zone,zoo,zte,zu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4842,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
features_df.sum(axis=0).to_frame().T.T.sort_values(by=0, ascending=False)

Unnamed: 0,0
eur,1415
company,851
mn,599
said,545
finnish,524
sales,455
million,441
net,413
profit,410
year,395
