##Team Member:
### Made Oka Resia Wedamerta (m.wedamerta@innopolis.university)
### Mahmoud Mousatat (m.mousatat@innopolis.univeristy)

# News Sentiment Analysis For Company

##1D-CNN


### Implementation of deep learning model

This model's design was chosen because it is simple to use and allows for quick calculation since convolutions may be used in place of recurrence. Convolutions appear to be able to capture a wide range of themes, based on the relative strength of this model; however, the model is constrained in that it only gets to view the text once. Adding an attention mechanism with recurrence after the convolutions, which would enable the model to query specific elements of the headline/body after getting a broad summary from CNN, might be a viable enhancement to this model.


In [None]:
!pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate
from keras.models import Model
from keras.utils import to_categorical


In [None]:
# Load the data into a Pandas dataframe.
df_content = pd.read_csv('train_bodies.csv')
df_headline = pd.read_csv('train_stances.csv')

df = pd.merge(df_content, df_headline, on="Body ID")

In [None]:
import json

MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['Headline'].values)
word_index = tokenizer.word_index
X_headline = tokenizer.texts_to_sequences(df['Headline'].values)
X_headline = pad_sequences(X_headline, maxlen=MAX_SEQUENCE_LENGTH)

# Save the tokenizer to a file
with open('tokenizer_head.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['articleBody'].values)
word_index = tokenizer.word_index
X_article = tokenizer.texts_to_sequences(df['articleBody'].values)
X_article = pad_sequences(X_article, maxlen=MAX_SEQUENCE_LENGTH)

with open('tokenizer_body.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())

Y = pd.get_dummies(df['Stance']).values


In [None]:
df['Stance']

0        unrelated
1        unrelated
2        unrelated
3        unrelated
4        unrelated
           ...    
49967        agree
49968        agree
49969        agree
49970        agree
49971        agree
Name: Stance, Length: 49972, dtype: object

In [None]:
Y[49971]

array([0, 0, 0, 1], dtype=uint8)

In [None]:
VALIDATION_SPLIT = 0.2

indices = np.arange(X_headline.shape[0])
np.random.shuffle(indices)
X_headline = X_headline[indices]
X_article = X_article[indices]
Y = Y[indices]
nb_validation_samples = int(VALIDATION_SPLIT * X_headline.shape[0])

x_headline_train = X_headline[:-nb_validation_samples]
x_article_train = X_article[:-nb_validation_samples]
y_train = Y[:-nb_validation_samples]
x_headline_test = X_headline[-nb_validation_samples:]
x_article_test = X_article[-nb_validation_samples:]
y_test = Y[-nb_validation_samples:]


In [None]:
EMBEDDING_DIM = 100
N_FILTERS = 128
FILTER_SIZE = 5
POOL_SIZE = 4
EPOCHS = 5
BATCH_SIZE = 32

input_headline = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_headline = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_headline)
conv_headline = Conv1D(N_FILTERS, FILTER_SIZE, activation='relu')(embedding_headline)
pool_headline = MaxPooling1D(POOL_SIZE)(conv_headline)
flatten_headline = Flatten()(pool_headline)

input_article = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_article = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_article)
conv_article = Conv1D(N_FILTERS, FILTER_SIZE, activation='relu')(embedding_article)
pool_article = MaxPooling1D(POOL_SIZE)(conv_article)
flatten_article = Flatten()(pool_article)

merged = concatenate([flatten_headline, flatten_article])
dropout_layer = Dropout(0.5)(merged)
outputs = Dense(Y.shape[1], activation='softmax')(dropout_layer)

model = Model(inputs=[input_headline, input_article], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

history = model.fit(x=[x_headline_train, x_article_train], y=y_train, validation_data=([x_headline_test, x_article_test], y_test), epochs=EPOCHS, batch_size=BATCH_SIZE)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 250)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 250)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 250, 100)     5000000     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 250, 100)     5000000     ['input_2[0][0]']                
                                                                                              

In [None]:
score, acc = model.evaluate([x_headline_test, x_article_test], y_test, batch_size=BATCH_SIZE)
print('Test score:', score)
print('Test accuracy:', acc)


Test score: 0.4720973074436188
Test accuracy: 0.828096866607666


In [None]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/cnn_pretrain')



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r "/content/saved_model" "/content/drive/MyDrive/newsforcompany_pretrain/cnn/cnn_pretrain"

In [2]:
import tensorflow as tf
from tensorflow import keras
new_model = tf.keras.models.load_model('/content/drive/MyDrive/newsforcompany_pretrain/cnn/cnn_pretrain')

# Check its architecture
new_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 250)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 250)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 250, 100)             5000000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 250, 100)             5000000   ['input_2[0][0]']             
                                                                                              

In [None]:
# # Evaluate the restored model
# loss, acc = new_model.evaluate(test_images, test_labels, verbose=2)
# print('Restored model, accuracy: {:5.2f}%'.format(100 * acc))

# print(new_model.predict(test_images).shape)

In [4]:
new_headline = "SoftBank CEO says artificial general intelligence will come within 10 years."
company_profile = "Google LLC is an American multinational technology company focusing on artificial intelligence, online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics."

In [8]:
import json
from keras.preprocessing.text import tokenizer_from_json

MAX_SEQUENCE_LENGTH = 250
# Load the tokenizer from file
with open('/content/drive/MyDrive/newsforcompany_pretrain/cnn/cnn_tokenizer/tokenizer_headline.json', 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()
    tokenizer = tokenizer_from_json(tokenizer_json)
# Convert the new data to sequences of integers
new_headline_seq = tokenizer.texts_to_sequences([new_headline])[0]
# Pad the sequences to have the same length
new_headline_seq = pad_sequences([new_headline_seq], maxlen=MAX_SEQUENCE_LENGTH)

with open('/content/drive/MyDrive/newsforcompany_pretrain/cnn/cnn_tokenizer/tokenizer_profile.json', 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()
    tokenizer = tokenizer_from_json(tokenizer_json)
company_profile_seq = tokenizer.texts_to_sequences([company_profile])[0]
company_profile_seq = pad_sequences([company_profile_seq], maxlen=MAX_SEQUENCE_LENGTH)

# Preprocess the new data (assume new_headline and new_article are already preprocessed)
new_data = [new_headline_seq, company_profile_seq]
# Make predictions
y_pred = new_model.predict(new_data)

# Get the predicted class
predicted_class = np.argmax(y_pred)

# Get the predicted probability for each class
predicted_probabilities = y_pred[0]

# Print the predicted class and probability for each class
ids_to_labels = {0:"unrelated", 1:"related_neutral", 2:"related_negative", 3:"related_positive"}
predicted_class = [ids_to_labels[i] for i in predictions]
print("Predicted class: ", predicted_class)
print("Predicted probabilities: ", predicted_probabilities)


Predicted class:  3
Predicted probabilities:  [0.04436179 0.06169718 0.3093505  0.58459055]


## BERT Implementation

BERT is a advanced pre-trained word embedding model based on transformer encoded architecture. We utilize BERT as a sentence encoder, which can accurately get the context representation of a sentence. BERT removes the unidirectional constraint using a mask language model (MLM). It randomly masks some of the tokens from the input and predicts the original vocabulary id of the masked word based only. MLM has increased the capability of BERT to outperforms as compared to previous embedding methods. It is a deeply bidirectional system that is capable of handling the unlabelled text by jointly conditioning on both left and right context in all layers. In this research, we have extracted embeddings for a sentence or a set of words or pooling the sequence of hidden-states for the whole input sequence. A deep bidirectional model is more powerful than a shallow left-to-right and right-to-left model.

In [10]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
Colle

In [11]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score


In [None]:
# Load the data into a Pandas dataframe.
df_content = pd.read_csv('train_bodies.csv')
df_headline = pd.read_csv('train_stances.csv')


In [None]:
df = pd.merge(df_content, df_headline, on="Body ID")

In [None]:
df

Unnamed: 0,Body ID,articleBody,Headline,Stance
0,0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,0,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,0,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,0,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated
4,0,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated
...,...,...,...,...
49967,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets tipped more than $2,00...",agree
49968,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets $2,000 tip",agree
49969,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Luckiest Pizza Delivery Guy Ever Gets $2,000 Tip",agree
49970,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...",Ann Arbor pizza delivery driver surprised with...,agree


In [None]:
feature = df[['articleBody', 'Headline']]
feature

Unnamed: 0,articleBody,Headline
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun..."
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...
...,...,...
49967,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets tipped more than $2,00..."
49968,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets $2,000 tip"
49969,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Luckiest Pizza Delivery Guy Ever Gets $2,000 Tip"
49970,"ANN ARBOR, Mich. – A pizza delivery man in Mic...",Ann Arbor pizza delivery driver surprised with...


In [None]:
data_classes = ["unrelated", "discuss","disagree","agree"]
label = df['Stance'].apply(data_classes.index)

In [None]:
#Test datasset
# Load the data into a Pandas dataframe.
df_content_test = pd.read_csv('test_bodies.csv')
df_headline_test = pd.read_csv('true_test_stances.csv')

df_test = pd.merge(df_content_test, df_headline_test, on="Body ID")

feature_test = df_test[['articleBody', 'Headline']]

data_classes_test = ["unrelated", "discuss","disagree","agree"]
labels_test = df_test['Stance'].apply(data_classes_test.index)


In [None]:
# Split the data into train and test sets.
train_text, val_text, train_labels, val_labels = train_test_split(feature.iloc[:40000], label.iloc[:40000], test_size=0.2, random_state=42)

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the text inputs.
train_tokens = tokenizer.batch_encode_plus(
    train_text.values.tolist(),
    max_length = 512,
    padding=True,
    truncation=True
)

test_tokens = tokenizer.batch_encode_plus(
    feature_test.values.tolist(),
    max_length = 512,
    padding=True,
    truncation=True
)

# Convert the tokenized inputs to PyTorch tensors.
train_seq = torch.tensor(train_tokens['input_ids'])
train_mask = torch.tensor(train_tokens['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

test_seq = torch.tensor(test_tokens['input_ids'])
test_mask = torch.tensor(test_tokens['attention_mask'])
test_y = torch.tensor(labels_test.values.tolist())

# Create a DataLoader for our training set.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)

# Create a DataLoader for our test set.
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)


In [None]:
# Load the pre-trained BERT model for sequence classification.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 4,
    output_attentions = False,
    output_hidden_states = False
)

# Define the optimizer and learning rate scheduler.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
!nvidia-smi

Sun Apr 16 21:53:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Set the model to the device (GPU or CPU).
model.to(device)

# Train the model.
for epoch in range(epochs):
    print('\nEpoch {}/{}'.format(epoch + 1, epochs))
    print('-' * 10)

    # Set the model to training mode.
    model.train()

    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Unpack the batch.
        batch_seq, batch_mask, batch_y = batch

        # Set the batch to the device.
        batch_seq = batch_seq.to(device)
        batch_mask = batch_mask.to(device)
        batch_y = batch_y.to(device)

        # Zero out any gradients.
        model.zero_grad()

        # Perform a forward pass.
        outputs = model(batch_seq, attention_mask=batch_mask, labels=batch_y)
        loss = outputs[0]
        total_loss += loss.item()

        # Perform a backward pass and optimize.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print('Average loss: {:.2f}'.format(avg_loss))




Epoch 1/3
----------
Average loss: 0.28

Epoch 2/3
----------
Average loss: 0.11

Epoch 3/3
----------
Average loss: 0.05


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Finally, we'll evaluate the model on the test set:

# Set the model to evaluation mode.
model.eval()

# Create lists to store the predictions and true labels.
preds = []
true_labels = []

# Evaluate the model on the test set.
for batch in test_dataloader:
    # Unpack the batch.
    batch_seq, batch_mask, batch_y = batch

    # Set the batch to the device.
    batch_seq = batch_seq.to(device)
    batch_mask = batch_mask.to(device)
    batch_y = batch_y.to(device)

    # Disable gradient calculations.
    with torch.no_grad():
        # Perform a forward pass.
        outputs = model(batch_seq, attention_mask=batch_mask)
        logits = outputs[0]
        probs = torch.softmax(logits, dim=1)
        preds.extend(torch.argmax(probs, dim=1).tolist())
        true_labels.extend(batch_y.tolist())

# Calculate the F1 score and accuracy on the test set.
f1 = f1_score(true_labels, preds, average='weighted')
acc = accuracy_score(true_labels, preds)

print('F1 score: {:.2f}'.format(f1))
print('Accuracy: {:.2f}'.format(acc))


F1 score: 0.90
Accuracy: 0.91


In [None]:
model.save_pretrained('pretrained_model/')

In [None]:
!cp -r "/content/pretrained_model" "/content/drive/MyDrive/newsforcompany_pretrain/bert"

In [12]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
unique_labels = set()

tester = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/newsforcompany_pretrain/bert')

ids_to_labels = {0:"unrelated", 1:"related_neutral", 2:"related_negative", 3:"related_positive"}

data = {'CompanyProfile':["Google LLC is an American multinational technology company focusing on artificial intelligence, online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics."], 'Headline':["SoftBank CEO says artificial general intelligence will come within 10 years."]}
df_one = pd.DataFrame(data)
def evaluate_one_text(model, sentence):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer.batch_encode_plus(sentence.values.tolist(), padding='max_length', max_length = 512, truncation=True)

    # Convert the tokenized inputs to PyTorch tensors.
    input_id = torch.tensor(text['input_ids']).to(device)
    mask = torch.tensor(text['attention_mask']).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

evaluate_one_text(tester, df_one)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

                                      CompanyProfile  \
0  Google LLC is an American multinational techno...   

                                            Headline  
0  SoftBank CEO says artificial general intellige...  
['related_positive']
