<a href="https://colab.research.google.com/github/Trung0Minh/AIO2023-MODULE-6/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis for Financial News

In [42]:
import torch
import torch.nn as nn

seed = 1
torch.manual_seed(seed)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
!pip install unidecode
import unidecode

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [43]:
dataset_path = 'all-data.csv'
headers = ['sentiment', 'content']
df = pd.read_csv(
    dataset_path,
    names=headers,
    encoding='ISO-8859-1'
)
df

Unnamed: 0,sentiment,content
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [44]:
classes = {class_name: idx for idx, class_name in enumerate(df['sentiment'].unique())}
df['sentiment'] = df['sentiment'] .apply(lambda x: classes[x])
df

Unnamed: 0,sentiment,content
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...
...,...,...
4841,1,LONDON MarketWatch -- Share prices ended lower...
4842,0,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,1,Operating profit fell to EUR 35.4 mn from EUR ...
4844,1,Net sales of the Paper segment decreased to EU...


In [45]:
classes

{'neutral': 0, 'negative': 1, 'positive': 2}

In [46]:
english_stopwords = stopwords.words('english')
stemmer = PorterStemmer()

In [47]:
len(english_stopwords)

198

In [48]:
def text_normalize(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = text.strip()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in english_stopwords])
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

In [49]:
df['content'] = df['content'].apply(text_normalize)
df

Unnamed: 0,sentiment,content
0,0,accord gran compani plan move product russia a...
1,0,technopoli plan develop stage area less 100000...
2,1,intern electron industri compani elcoteq laid ...
3,2,new product plant compani would increas capac ...
4,2,accord compani updat strategi year 20092012 ba...
...,...,...
4841,1,london marketwatch share price end lower londo...
4842,0,rinkuskiai beer sale fell 65 per cent 416 mill...
4843,1,oper profit fell eur 354 mn eur 688 mn 2007 in...
4844,1,net sale paper segment decreas eur 2216 mn sec...


In [66]:
vocab = []
for sentence in df['content'].tolist():
    tokens = sentence.split()
    for token in tokens:
        if token not in vocab:
            vocab.append(token)

vocab.append('UNK')
vocab.append('PAD')
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

int

In [68]:
vocab[-2:]

['UNK', 'PAD']

In [51]:
word_to_idx

{'accord': 0,
 'gran': 1,
 'compani': 2,
 'plan': 3,
 'move': 4,
 'product': 5,
 'russia': 6,
 'although': 7,
 'grow': 8,
 'technopoli': 9,
 'develop': 10,
 'stage': 11,
 'area': 12,
 'less': 13,
 '100000': 14,
 'squar': 15,
 'meter': 16,
 'order': 17,
 'host': 18,
 'work': 19,
 'comput': 20,
 'technolog': 21,
 'telecommun': 22,
 'statement': 23,
 'said': 24,
 'intern': 25,
 'electron': 26,
 'industri': 27,
 'elcoteq': 28,
 'laid': 29,
 'ten': 30,
 'employe': 31,
 'tallinn': 32,
 'facil': 33,
 'contrari': 34,
 'earlier': 35,
 'layoff': 36,
 'contract': 37,
 'rank': 38,
 'offic': 39,
 'worker': 40,
 'daili': 41,
 'postime': 42,
 'report': 43,
 'new': 44,
 'plant': 45,
 'would': 46,
 'increas': 47,
 'capac': 48,
 'meet': 49,
 'expect': 50,
 'demand': 51,
 'improv': 52,
 'use': 53,
 'raw': 54,
 'materi': 55,
 'therefor': 56,
 'profit': 57,
 'updat': 58,
 'strategi': 59,
 'year': 60,
 '20092012': 61,
 'baswar': 62,
 'target': 63,
 'longterm': 64,
 'net': 65,
 'sale': 66,
 'growth': 67,
 'r

In [52]:
def transform(text, word_to_idx, max_seq_len):
    tokens = []
    for w in text.split():
        try:
            w_id = word_to_idx[w]
        except:
            w_id = word_to_idx['UNK']
        tokens.append(w_id)

    if len(tokens) < max_seq_len:
        tokens += [word_to_idx['PAD']] * (max_seq_len - len(tokens))
    elif len(tokens) > max_seq_len:
        tokens = tokens[:max_seq_len]
    return tokens

In [53]:
df['content'].tolist()

['accord gran compani plan move product russia although compani grow',
 'technopoli plan develop stage area less 100000 squar meter order host compani work comput technolog telecommun statement said',
 'intern electron industri compani elcoteq laid ten employe tallinn facil contrari earlier layoff compani contract rank offic worker daili postime report',
 'new product plant compani would increas capac meet expect increas demand would improv use raw materi therefor increas product profit',
 'accord compani updat strategi year 20092012 baswar target longterm net sale growth rang 20 40 oper profit margin 10 20 net sale',
 'financ aspocomp growth aspocomp aggress pursu growth strategi increasingli focus technolog demand hdi print circuit board pcb',
 'last quarter 2010 componenta net sale doubl eur131m eur76m period year earlier move zero pretax profit pretax loss eur7m',
 'third quarter 2010 net sale increas 52 eur 2055 mn oper profit 349 eur 235 mn',
 'oper profit rose eur 131 mn eur 87 

In [54]:
val_size = 0.2
test_size = 0.125
is_shuffle = True
texts = df['content'].tolist()
labels = df['sentiment'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    random_state=seed,
    shuffle=is_shuffle)

In [86]:
class FinancialNews(Dataset):
    def __init__(self, X, y, word_to_idx, max_seq_len, transform=None):
        self.texts = X
        self.labels = y
        self.word_to_idx = word_to_idx
        self.max_seq_len = max_seq_len
        self.transform = transform

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        if self.transform:
            text = self.transform(text, self.word_to_idx, self.max_seq_len)
        text = torch.tensor(text)

        return text, label

In [87]:
max_seq_len = 32

train_dataset = FinancialNews(
    X_train, y_train, word_to_idx, max_seq_len, transform=transform
)

val_dataset = FinancialNews(
    X_val, y_val, word_to_idx, max_seq_len, transform=transform
)

test_dataset = FinancialNews(
    X_test, y_test, word_to_idx, max_seq_len, transform=transform
)

In [88]:
train_dataset.__len__()

3391

In [89]:
train_batch_size = 128
test_batch_size = 8

train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=test_batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

In [90]:
X, y = next(iter(train_dataloader))
y[0]

tensor(0)

In [104]:
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_classes, dropout_prob):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, n_classes)

    def forward(self, x):
        x = self.embedding(x)
        x, hn = self.rnn(x)
        x = x[:, -1, :]
        x = self.norm(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [105]:
vocab_size = len(vocab)
vocab_size

8908

In [106]:
n_classes = len(list(classes.keys()))

embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    n_layers=n_layers,
    n_classes=n_classes,
    dropout_prob=dropout_prob
).to(device)

In [107]:
lr = 1e-4
epochs = 50

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Hourly Temperature Forecasting

In [2]:
import torch
import torch.nn as nn

seed = 1
torch.manual_seed(seed)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader

In [3]:
dataset_filepath = 'weatherHistory.csv'
df = pd.read_csv(dataset_filepath)
df

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...,...
12295,2007-01-05 07:00:00.000 +0100,Overcast,rain,2.466667,-0.122222,0.99,9.0965,202.0,5.5384,0.0,1018.42,Mostly cloudy until night.
12296,2007-01-05 08:00:00.000 +0100,Partly Cloudy,rain,2.316667,-0.377778,0.99,9.3863,204.0,4.2987,0.0,1018.41,Mostly cloudy until night.
12297,2007-01-05 09:00:00.000 +0100,Clear,rain,2.966667,0.905556,0.99,7.5187,177.0,4.7495,0.0,1019.08,Mostly cloudy until night.
12298,2007-01-05 10:00:00.000 +0100,Partly Cloudy,rain,5.927778,3.522222,0.87,11.3344,243.0,9.6278,0.0,1019.53,Mostly cloudy until night.


In [5]:
univariate_df = df['Temperature (C)']
univariate_df.index = df['Formatted Date']
univariate_df

Unnamed: 0_level_0,Temperature (C)
Formatted Date,Unnamed: 1_level_1
2006-04-01 00:00:00.000 +0200,9.472222
2006-04-01 01:00:00.000 +0200,9.355556
2006-04-01 02:00:00.000 +0200,9.377778
2006-04-01 03:00:00.000 +0200,8.288889
2006-04-01 04:00:00.000 +0200,8.755556
...,...
2007-01-05 07:00:00.000 +0100,2.466667
2007-01-05 08:00:00.000 +0100,2.316667
2007-01-05 09:00:00.000 +0100,2.966667
2007-01-05 10:00:00.000 +0100,5.927778


In [6]:
input_size = 6
label_size = 1
offset = 1

def slicing_window(df, df_start_idx, df_end_idx, input_size, label_size, offset):
    features = []
    labels = []

    window_size = input_size + offset

    if df_end_idx is None:
        df_end_idx = len(df) - window_size

    for idx in range(df_start_idx, df_end_idx):
        feature_end_idx = idx + input_size
        label_start_idx = idx + window_size - label_size

        feature = df[idx:feature_end_idx]
        label = df[label_start_idx:idx + window_size]

        features.append(feature)
        labels.append(label)

    features = np.expand_dims(np.array(features), -1)
    labels = np.array(labels)

    return features, labels

In [7]:
dataset_length = len(univariate_df)
train_size = 0.7
val_size = 0.2
train_end_idx = int(train_size * dataset_length)
val_end_idx = int(val_size * dataset_length) + train_end_idx

X_train, y_train = slicing_window(
    univariate_df,
    df_start_idx=0,
    df_end_idx=train_end_idx,
    input_size=input_size,
    label_size=label_size,
    offset=offset
)

X_val, y_val = slicing_window(
    univariate_df,
    df_start_idx=train_end_idx,
    df_end_idx=val_end_idx,
    input_size=input_size,
    label_size=label_size,
    offset=offset
)

X_test, y_test = slicing_window(
    univariate_df,
    df_start_idx=val_end_idx,
    df_end_idx=None,
    input_size=input_size,
    label_size=label_size,
    offset=offset
)

In [31]:
class WeatherForecast(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X = self.X[idx]
        y = self.y[idx]

        if self.transform:
            X = self.transform(X)

        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        return X, y

In [32]:
train_dataset = WeatherForecast(
    X_train, y_train
)

val_dataset = WeatherForecast(
    X_val, y_val
)

test_dataset = WeatherForecast(
    X_test, y_test
)

train_batch_size = 128
test_batch_size = 8

train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

In [33]:
class WeatherForecastor(nn.Module):
    def __init__(self, embedding_dim, hidden_size, n_layers, dropout_prob):
        super().__init__()
        self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x, hn = self.rnn(x)
        x = x[:, -1, :]
        x = self.norm(x)
        x = self.dropout(x)
        x = self.fc(x)

        return x

In [34]:
embedding_dim = 1
hidden_size = 8
n_layers = 3
dropout_prob = 0.2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = WeatherForecastor(
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    n_layers=n_layers,
    dropout_prob=dropout_prob
).to(device)

In [35]:
lr = 1e-3
epochs = 50

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Training


In [108]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    losses = []
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            loss = criterion(y_pred, y)
            losses.append(loss.item())
            _, predicted = torch.max(y_pred.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
    loss = sum(losses) / len(losses)
    acc = correct / total
    return loss, acc

In [109]:
def fit(model, train_dataloader, val_dataloader, criterion, optimizer, device, epochs):
    train_losses, val_losses = [], []
    for epoch in range(epochs):
        model.train()
        batch_train_loss = []
        for idx, (X, y) in enumerate(train_dataloader):
            X, y = X.to(device), y.to(device)

            optimizer.zero_grad()

            y_pred = model(X)

            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            batch_train_loss.append(loss)
        train_loss = sum(batch_train_loss) / len(batch_train_loss)
        train_losses.append(train_loss)

        val_loss, val_acc = evaluate(model, val_dataloader, criterion, device)
        val_losses.append(val_loss)

        print(f"Epoch: {epoch + 1} | Train loss: {train_loss} | Val loss: {val_loss}")

    return train_losses, val_losses

In [110]:
train_losses, val_losses = fit(model, train_dataloader, val_dataloader, criterion, optimizer, device, epochs)

Epoch: 1 | Train loss: 0.9961243867874146 | Val loss: 0.9301248060875251
Epoch: 2 | Train loss: 0.933564305305481 | Val loss: 0.9289460504641298
Epoch: 3 | Train loss: 0.9350951313972473 | Val loss: 0.9284534830538953
Epoch: 4 | Train loss: 0.9293509125709534 | Val loss: 0.9284494704887515
Epoch: 5 | Train loss: 0.933385968208313 | Val loss: 0.9288952130763257
Epoch: 6 | Train loss: 0.9344950318336487 | Val loss: 0.9289251526848215
Epoch: 7 | Train loss: 0.9319535493850708 | Val loss: 0.9285263814887063
Epoch: 8 | Train loss: 0.9265627264976501 | Val loss: 0.9286262759419738
Epoch: 9 | Train loss: 0.9308967590332031 | Val loss: 0.9286045295293214
Epoch: 10 | Train loss: 0.9310756921768188 | Val loss: 0.9283192191944748
Epoch: 11 | Train loss: 0.9282813668251038 | Val loss: 0.928244088028298
Epoch: 12 | Train loss: 0.9306309223175049 | Val loss: 0.9283115086985416
Epoch: 13 | Train loss: 0.9279549717903137 | Val loss: 0.9281729322965028
Epoch: 14 | Train loss: 0.9346045255661011 | Val l

In [111]:
val_loss, val_acc = evaluate(
    model,
    val_dataloader,
    criterion,
    device
)

test_loss, test_acc = evaluate(
    model,
    test_dataloader,
    criterion,
    device
)

print('Evaluation on val/test dataset')
print('Val accuracy: ', val_loss)
print('Test accuracy: ', test_loss)

Evaluation on val/test dataset
Val accuracy:  0.8998825757718477
Test accuracy:  0.8847850399916289
