In [1]:
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import pipeline


import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# bert-base-uncased
# distilbert-base-uncased
# roberta-base

def BERT_score(model):
    model_name = model
    if model == 'roberta-base':
        tokenizer = RobertaTokenizer.from_pretrained(model_name, num_labels=1)
        model = RobertaForSequenceClassification.from_pretrained(model_name)
        classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

    else:
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

    return classifier

In [None]:
# bert-base-uncased
classifier1 = BERT_score('bert-base-uncased')
df1 = pd.read_csv('./crawler/google_news_crawling.csv')
df1['sentiment_label'] = df1['title'].apply(lambda x: classifier1(x)[0]['label'])
df1['sentiment_score'] = df1['title'].apply(lambda x: classifier1(x)[0]['score'])

In [4]:
# distilbert-base-uncased
classifier2 = BERT_score('distilbert-base-uncased')
df2 = pd.read_csv('../crawler/google_news_crawling.csv')
df2['sentiment_label'] = df2['title'].apply(lambda x: classifier2(x)[0]['label'])
df2['sentiment_score'] = df2['title'].apply(lambda x: classifier2(x)[0]['score'])

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

In [None]:
# roberta-base
classifier3 = BERT_score('roberta-base')
df3 = pd.read_csv('../crawler/google_news_crawling.csv')
df3['sentiment_label'] = df3['title'].apply(lambda x: classifier3(x)[0]['label'])
df3['sentiment_score'] = df3['title'].apply(lambda x: classifier3(x)[0]['score'])

In [None]:
df1['sentiment_score'].plot.hist(xlim=(0, 1), bins=10, title='BERT-base-uncased')
plt.savefig('../figure/BERT-base-uncased.png')

In [None]:
df2['sentiment_score'].plot.hist(xlim=(0, 1), bins=10, title='distilbert-base-uncased')
plt.savefig('../figure/distilbert-base-uncased.png')

In [None]:
df3['sentiment_score'].plot.hist(xlim=(0, 1), bins=10, title='roberta-base')
plt.savefig('../figure/roberta-base.png')

# Distil-BERT Select

In [None]:
df2['sentiment_score'].mean()

In [None]:
from sklearn.preprocessing import MinMaxScaler

df2['sentiment_score_scaled'] = MinMaxScaler().fit_transform(df2['sentiment_score'].values.reshape(-1, 1))

In [None]:
df2['sentiment_score_scaled'].plot(kind='hist', bins=10, title='distilbert-base-uncased')
plt.savefig('../figure/distilbert-base-uncased_scaled.png')

In [None]:
noise_temp = df2[['Date', 'sentiment_score_scaled']]

In [None]:
noise_aggregated = noise_temp.groupby('Date')['sentiment_score_scaled'].apply(list).reset_index()

In [None]:
max_length = 10
expanded_cols = {
    f'sentiment_{i+1}': pd.Series([x[i] if i < len(x) else 0 for x in noise_aggregated['sentiment_score_scaled']])
    for i in range(max_length)
}


noise_expanded = pd.DataFrame(expanded_cols)
noise_expanded.insert(0, 'Date', noise_aggregated['Date'])

noise_expanded

In [None]:
noise_expanded['Date'] = pd.to_datetime(noise_expanded['Date'], format='%Y%m%d').astype(str)

In [None]:
temp_stock = pd.read_csv('../data/AAPL.csv')

In [None]:
noise_date_info = temp_stock[['Date']].copy()

In [None]:
noise_data = pd.merge(noise_date_info, noise_expanded, left_on='Date', right_on='Date', how='left')

In [None]:
noise_data.isna().sum()

In [None]:
noise_data.fillna(0, inplace=True)

In [None]:
noise_data.to_csv('../data/noise_data.csv', index=False)

In [None]:
noise_data