## Importing necessary libraries

In [1]:
import pandas as pd
from transformers import pipeline
from multiprocessing.dummy import Pool as ThreadPool

from tqdm import tqdm

## **Tweet dataset 1**

[Kaggle Link](https://www.kaggle.com/datasets/slythe/apple-twitter-sentiment-crowdflower)

In [2]:
df = pd.read_csv("socials/Apple-Twitter-Sentiment-DFE.csv",  encoding='latin1')

In [3]:
df.head(2)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,date,id,query,sentiment_gold,text
0,623495513,True,golden,10,,3,0.6264,Mon Dec 01 19:30:03 +0000 2014,5.4e+17,#AAPL OR @Apple,3\nnot_relevant,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,623495514,True,golden,12,,3,0.8129,Mon Dec 01 19:43:51 +0000 2014,5.4e+17,#AAPL OR @Apple,3\n1,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...


In [4]:
df = df[['date', 'text']]

In [5]:
df

Unnamed: 0,date,text
0,Mon Dec 01 19:30:03 +0000 2014,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,Mon Dec 01 19:43:51 +0000 2014,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,Mon Dec 01 19:50:28 +0000 2014,My cat only chews @apple cords. Such an #Apple...
3,Mon Dec 01 20:26:34 +0000 2014,I agree with @jimcramer that the #IndividualIn...
4,Mon Dec 01 20:29:33 +0000 2014,Nobody expects the Spanish Inquisition #AAPL
...,...,...
3881,Tue Dec 09 22:08:53 +0000 2014,(Via FC) Apple Is Warming Up To Social Media -...
3882,Tue Dec 09 22:18:27 +0000 2014,RT @MMLXIV: there is no avocado emoji may I as...
3883,Tue Dec 09 23:45:59 +0000 2014,@marcbulandr I could not agree more. Between @...
3884,Wed Dec 10 00:48:10 +0000 2014,My iPhone 5's photos are no longer downloading...


In [6]:
df['date'] = pd.to_datetime(df['date'], format='%a %b %d %H:%M:%S %z %Y').dt.tz_convert(None)

In [7]:
df

Unnamed: 0,date,text
0,2014-12-01 19:30:03,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,2014-12-01 19:43:51,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,2014-12-01 19:50:28,My cat only chews @apple cords. Such an #Apple...
3,2014-12-01 20:26:34,I agree with @jimcramer that the #IndividualIn...
4,2014-12-01 20:29:33,Nobody expects the Spanish Inquisition #AAPL
...,...,...
3881,2014-12-09 22:08:53,(Via FC) Apple Is Warming Up To Social Media -...
3882,2014-12-09 22:18:27,RT @MMLXIV: there is no avocado emoji may I as...
3883,2014-12-09 23:45:59,@marcbulandr I could not agree more. Between @...
3884,2014-12-10 00:48:10,My iPhone 5's photos are no longer downloading...


In [8]:
df['date'].dt.date.nunique()

10

In [9]:
df

Unnamed: 0,date,text
0,2014-12-01 19:30:03,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,2014-12-01 19:43:51,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,2014-12-01 19:50:28,My cat only chews @apple cords. Such an #Apple...
3,2014-12-01 20:26:34,I agree with @jimcramer that the #IndividualIn...
4,2014-12-01 20:29:33,Nobody expects the Spanish Inquisition #AAPL
...,...,...
3881,2014-12-09 22:08:53,(Via FC) Apple Is Warming Up To Social Media -...
3882,2014-12-09 22:18:27,RT @MMLXIV: there is no avocado emoji may I as...
3883,2014-12-09 23:45:59,@marcbulandr I could not agree more. Between @...
3884,2014-12-10 00:48:10,My iPhone 5's photos are no longer downloading...


## **Tweet Dataset 2**

[Kaggle Link](https://www.kaggle.com/datasets/omermetinn/tweets-about-the-top-companies-from-2015-to-2020?select=Tweet.csv)

In [10]:
df2 = pd.read_csv("socials/Tweet.csv")

In [11]:
df2.head(2)

Unnamed: 0,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num
0,550441509175443456,VisualStockRSRC,1420070457,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1
1,550441672312512512,KeralaGuy77,1420070496,Insanity of today weirdo massive selling. $aap...,0,0,0


In [12]:
# Convert to datetime format
df2['date'] = pd.to_datetime(df2['post_date'], unit='s')

# Include only date and headline column
df2 = df2[["date", "body"]]

# Renaming body column as text
df2.rename(columns={"body": "text"}, inplace=True)

In [13]:
# include only 2015-2019 dataset
df2 = df2[df2['date'].dt.year <= 2019]

In [14]:
# Removing Symbols

df2['text'] = df2['text'].str.replace(r'[\$#@]', '', regex=True).str.lower()

In [15]:
# Filtering only apple data

keywords = r'\b(aapl|apple|iphone)\b'

df2 = df2[df2['text'].str.contains(keywords, case=False, na=False, regex=True)]

df2.shape

  df2 = df2[df2['text'].str.contains(keywords, case=False, na=False, regex=True)]


(1488176, 2)

In [16]:
df2.head(2)

Unnamed: 0,date,text
0,2015-01-01 00:00:57,"lx21 made 10,008 on aapl -check it out! http:..."
1,2015-01-01 00:01:36,insanity of today weirdo massive selling. aapl...


In [17]:
df2 = df2.reset_index(drop=True)
df2.tail(2)

Unnamed: 0,date,text
3717962,2019-12-31 23:55:37,i don't discriminate. i own both aapl and msft.
3717963,2019-12-31 23:55:53,"aapl patent 10,522,475 vertical interconnects ..."


In [18]:
df2['date'].dt.date.nunique()

1826

## **Reddit Dataset - 3**

[Kaggle Link](https://www.kaggle.com/datasets/pavellexyr/five-years-of-aapl-on-reddit?select=five-years-of-aapl-on-reddit-comments.csv)

In [10]:
df3 = pd.read_csv("socials/five-years-of-aapl-on-reddit-comments.csv")

In [11]:
df3.head(2)

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score
0,comment,hitr97r,2qjfk,stocks,False,1635724579,https://old.reddit.com/r/stocks/comments/qjvo1...,I own all 3. Don't sell AAPL.,0.0,1
1,comment,hitq83x,2qjfk,stocks,False,1635724042,https://old.reddit.com/r/stocks/comments/qj07j...,I believe TSLA want to be like AAPL: part hard...,-0.25,2


In [12]:
# Convert to datetime format
df3['date'] = pd.to_datetime(df3['created_utc'], unit='s')

# Include only date and headline column
df3 = df3[["date", "body"]]

# Renaming body column as text
df3.rename(columns={"body": "text"}, inplace=True)

In [13]:
# include only 2016-2019 dataset
df3 = df3[df3['date'].dt.year <= 2019]

In [14]:
df3.reset_index(drop=True)
df3.tail(2)

Unnamed: 0,date,text
297531,2016-11-01 03:00:35,Microsoft's growth opportunity has nothing to ...
297532,2016-11-01 02:00:05,I'm a landman. AAPL-RPL. You can counter (and...


In [15]:
df3['date'].dt.date.nunique()

1155

# **FinBERT Transformer model for social sentiment analysis**

In [16]:
import torch

# Use GPU if available, otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda:0


In [17]:
# LOAD FINBERT MODEL

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

finbert = pipeline("sentiment-analysis", model=model, device=device, tokenizer=tokenizer, truncation=True, max_length=512)

Device set to use cuda:0


In [18]:
# === Process in batches ===
def batch_inference(texts, model_pipeline, batch_size=32):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch = texts[i:i + batch_size]
        batch_results = model_pipeline(batch)
        results.extend(batch_results)
    return results

In [19]:
# For dataset 1
texts = df['text'].astype(str).tolist()

results = batch_inference(texts, finbert, batch_size=32)

# If you want to split labels and scores
labels = [r['label'] for r in results]
scores = [r['score'] for r in results]

df['sentiment_label'] = labels
df['social_sentiment'] = scores

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset  2.09s/it]
Processing batches: 100%|████████████████████████████████████████████████████████████| 122/122 [02:47<00:00,  1.37s/it]


In [None]:
# For dataset 2
texts = df2['text'].astype(str).tolist()

results = batch_inference(texts, finbert, batch_size=32)

# If you want to split labels and scores
labels = [r['label'] for r in results]
scores = [r['score'] for r in results]

df2['sentiment_label'] = labels
df2['social_sentiment'] = scores

Processing batches:  11%|██████▏                                               | 5298/46506 [54:18<16:48:05,  1.47s/it]

In [20]:
# For dataset 3
texts = df3['text'].astype(str).tolist()

results = batch_inference(texts, finbert, batch_size=32)

# If you want to split labels and scores
labels = [r['label'] for r in results]
scores = [r['score'] for r in results]

df3['sentiment_label'] = labels
df3['social_sentiment'] = scores

Processing batches: 100%|██████████████████████████████████████████████████████████| 1810/1810 [43:53<00:00,  1.45s/it]


## **Merging Social Trend Datasets**

In [21]:
final = pd.concat([df, df3])

In [23]:
final.shape

(61776, 4)

In [24]:
final = final.sort_values(by='date', ascending=True).reset_index(drop=True)

In [25]:
final['sentiment_label'].value_counts()

sentiment_label
Neutral     46574
Positive     8603
Negative     6599
Name: count, dtype: int64

In [26]:
final.to_csv("Datasets/finbert_socials_extracted_sentiment.csv")

In [27]:
final['date_only'] = final['date'].dt.date

In [28]:
final.head(2)

Unnamed: 0,date,text,sentiment_label,social_sentiment,date_only
0,2014-12-01 19:30:03,#AAPL:The 10 best Steve Jobs emails ever...htt...,Neutral,0.977165,2014-12-01
1,2014-12-01 19:43:51,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,Neutral,0.975126,2014-12-01


In [29]:
daily_avg_socials = final.groupby('date_only')['social_sentiment'].mean().reset_index()

In [30]:
daily_avg_socials.shape

(1165, 2)

In [32]:
daily_avg_socials.rename(columns={'date_only' : 'Date'}, inplace=True)

In [33]:
daily_avg_socials

Unnamed: 0,Date,social_sentiment
0,2014-12-01,0.964340
1,2014-12-02,0.978057
2,2014-12-03,0.972873
3,2014-12-04,0.980858
4,2014-12-05,0.980136
...,...,...
1160,2019-12-27,0.953350
1161,2019-12-28,0.973798
1162,2019-12-29,0.953637
1163,2019-12-30,0.944577


In [34]:
daily_avg_socials.to_csv("Datasets/finbert_social_sentiment.csv")