In [1]:
# !pip install textblob

# !pip install spacy
# !python -m textblob.download_corpora
# !python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path

In [3]:
#Read data from Company_Tweet.csv
Company_Tweet_df = pd.read_csv(Path("Resources/Company_Tweet.csv"))
Company_Tweet_df = Company_Tweet_df.set_index('tweet_id')
Company_Tweet_df.sample()

Unnamed: 0_level_0,ticker_symbol
tweet_id,Unnamed: 1_level_1
608720439712489472,AAPL


In [4]:
#Read data from Tweet.csv
tweet_df = pd.read_csv(Path("Resources/Tweet.csv"))
tweet_df = tweet_df.set_index('tweet_id')
tweet_df.sample()

Unnamed: 0_level_0,writer,post_date,body,comment_num,retweet_num,like_num
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
608967557559820288,Zercatto,1434024154,Apple Pay to Link to Loyalty Cards and Roll Ou...,0,0,0


In [5]:
#merge to df into one
Company_tweet_result = tweet_df.merge(Company_Tweet_df, left_index=True, right_index=True)

In [6]:
#select relavent ticker and column.
selected_df = Company_tweet_result[Company_tweet_result['ticker_symbol'].isin(['AAPL', 'TSLA', 'AMZN'])]
selected_df = selected_df[['ticker_symbol', 'post_date','body']]
selected_df.sample()

Unnamed: 0_level_0,ticker_symbol,post_date,body
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
565231231843856384,AAPL,1423596601,$AAPL - GreenRedSignal-AAPL -


In [7]:
# define Start date & end date to use for boolean indexing to filter the rows
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'

In [8]:
#convert date into standard format
selected_df['post_date'] = pd.to_datetime(selected_df['post_date'], unit='s')

#filter the data based on post date ranging between start and end date
#rename columns to date, hashtags, text to make the dataframe consistent for concatenation
stock_tweet_df = selected_df[(selected_df['post_date'] >= start_date) & (selected_df['post_date'] <= end_date)]
stock_tweet_df = stock_tweet_df[['ticker_symbol', 'post_date', 'body']]
stock_tweet_df.columns = ['hashtags', 'date', 'text']
stock_tweet_df = stock_tweet_df.set_index('date')
stock_tweet_df.dropna(inplace = True)
stock_tweet_df.head()

Unnamed: 0_level_0,hashtags,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:30,TSLA,!! 8 Hours Left !!The picture you see here is...
2019-01-01 00:03:05,TSLA,$3750 would be the minimum discount in any cas...
2019-01-01 00:03:20,TSLA,There are FUDking analysts like those from gol...
2019-01-01 00:04:35,TSLA,I really hope they report ‘unbelievable’ numbe...
2019-01-01 00:07:33,TSLA,#TESLA : $TSLA Dec-31 Update #StockMarket #Tec...


In [9]:
# Read data from Ethereum_tweets.csv 
ethereum_tweets = pd.read_csv(
    Path("Resources/Ethereum_tweets.csv",
    index_col='date', 
    parse_dates=True, 
    infer_datetime_format=True
)).dropna()
ethereum_tweets['hashtags'] = 'ETH'
ethereum_tweets = ethereum_tweets[["date", "hashtags", "text"]]
ethereum_tweets.head(5)

Unnamed: 0,date,hashtags,text
1,11/15/2021 7:26:35,ETH,"Adopted a Hypocat #053, “Crazy Rich Cat” 🤑💵💵💵🤩..."
7,11/15/2021 7:25:36,ETH,"I've just minted ""Waterfall graph "" collectibl..."
9,11/15/2021 7:25:14,ETH,You can do Cloud Mining using this site https:...
13,11/15/2021 7:24:48,ETH,@michaeljburry @elonmusk @BernieSanders Just #...
15,11/15/2021 7:24:36,ETH,Top 10 coins by 3-months Twitter Volume \n\n$b...


In [10]:
btc_tweet = pd.read_csv(Path("Resources/Bitcoin_tweets.csv"),usecols = ["date", "hashtags", "text"], 
                           index_col=["date"],
                            parse_dates=["date"])
btc_tweet = btc_tweet[(btc_tweet.index <= end_date) |(btc_tweet.index >= start_date)]
btc_tweet.head(5)

  btc_tweet = pd.read_csv(Path("Resources/Bitcoin_tweets.csv"),usecols = ["date", "hashtags", "text"],


Unnamed: 0_level_0,text,hashtags
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin']
2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']"
2021-02-10 23:54:48,"Guys evening, I have read this article about B...",
2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']"
2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC']


In [11]:
#clean the tweets data by dropping null values and fetching tweets specific to bitcoin  
tags = ['bitcoin','btc','crypto']  
hashtags = '|'.join(tags)
btc_tweet.dropna(inplace = True)
btc_tweet_df = btc_tweet[btc_tweet['hashtags'].str.lower().str.contains(hashtags)]

#replace the hashtags values with 'BTC'
btc_tweet_df['hashtags'] = 'BTC'
btc_tweet_df = btc_tweet_df[['hashtags', 'text']]
btc_tweet_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  btc_tweet_df['hashtags'] = 'BTC'


Unnamed: 0_level_0,hashtags,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-10 23:59:04,BTC,Blue Ridge Bank shares halted by NYSE after #b...
2021-02-10 23:58:48,BTC,"😎 Today, that's this #Thursday, we will do a ""..."
2021-02-10 23:54:33,BTC,$BTC A big chance in a billion! Price: \487264...
2021-02-10 23:54:06,BTC,This network is secured by 9 508 nodes as of t...
2021-02-10 23:53:30,BTC,💹 Trade #Crypto on #Binance \n\n📌 Enjoy #Cashb...


In [82]:
import datetime

# Read data from Emusk_2021_tweets.csv
elon_tweets = pd.read_csv('Resources/Emusk_2021_tweets.csv',
                          infer_datetime_format=True,
                          parse_dates=True,
                         usecols = ['Datetime','Text'])
elon_tweets['hashtags'] = 'DOGE'
elon_tweets.columns = ['date', 'text', 'hashtags']

#convert date column into format of yyyy-mm-dd hh:mm:ss
elon_tweets['date'] = elon_tweets['date']+':00'

#rename columns and display sample
elon_tweets = elon_tweets[['date', 'hashtags', 'text']]#.set_index('date')
elon_tweets.head()

Unnamed: 0,date,hashtags,text
0,01/01/2021 00:58:00,DOGE,"@PPathole Dojo isn’t needed, but will make sel..."
1,02/01/2021 03:20:00,DOGE,@comma_ai Tesla Full Self-Driving will work at...
2,02/01/2021 12:23:00,DOGE,"@newscientist Um, we have giant fusion reactor..."
3,02/01/2021 14:51:00,DOGE,So proud of the Tesla team for achieving this ...
4,02/01/2021 14:59:00,DOGE,@flcnhvy Tesla is responsible for 2/3 of all t...


In [83]:
#create function to resolve issue with datetime and series and convert the column values in required format
def date_format(date_col)
for i in np.arange(len(elon_tweets['date'])):
    elon_tweets['date'][i] = datetime.datetime.strptime(elon_tweets['date'][i],"%d/%m/%Y %H:%M:%S")
    elon_tweets['date'][i].strftime("%Y-%m-%d %H:%M:%S")

#set index & display df
elon_tweets.set_index('date')
elon_tweets.head()

Unnamed: 0,date,hashtags,text
0,2021-01-01 00:58:00,DOGE,"@PPathole Dojo isn’t needed, but will make sel..."
1,2021-01-02 03:20:00,DOGE,@comma_ai Tesla Full Self-Driving will work at...
2,2021-01-02 12:23:00,DOGE,"@newscientist Um, we have giant fusion reactor..."
3,2021-01-02 14:51:00,DOGE,So proud of the Tesla team for achieving this ...
4,2021-01-02 14:59:00,DOGE,@flcnhvy Tesla is responsible for 2/3 of all t...


In [84]:
#exporting stocks and btc_tweets dataframe records to csv files as the raw files are heavy 
tweet_df.to_csv("Resources/btc_tweets.csv", header=True, index=True)
stock_tweet_df.to_csv('Resources/stock_tweet_data.csv', header=True, index=True)

In [None]:
stock_tweet_df = pd.read_csv(
    Path("Resources/stock_tweet_data.csv"),
    index_col='date', 
)
stock_tweet_df.sample()

In [92]:
# Import doge prices csv
doge_prices = pd.read_csv('Resources/dogecoin_prices_2021.csv', 
    infer_datetime_format=True, 
    parse_dates=True)

# display dataframe
doge_prices.head()

Unnamed: 0,open_time,price
0,01/01/2021 00:00,0.004672
1,01/01/2021 00:01,0.004673
2,01/01/2021 00:02,0.004686
3,01/01/2021 00:03,0.004671
4,01/01/2021 00:04,0.004676


In [94]:
#fetching closing prices for Doge
# doge_prices = doge_prices[doge_prices['open_time'].str.contains('23:59')]
# doge_prices.columns = ['date', 'price']

# display dataframe
doge_prices.set_index('date')
doge_prices.head()

Unnamed: 0,date,price
1439,01/01/2021 23:59,0.005665
2879,02/01/2021 23:59,0.01058
4319,03/01/2021 23:59,0.00981
5759,04/01/2021 23:59,0.009771
7199,05/01/2021 23:59,0.009994


## Forget how to concat the df @shweta can you help? But i saved the origin csv so i read the saved csv.

In [None]:
sentiment_analysis_df = pd.read_csv(
    Path("Resources/sentiment_analysis_df.csv"))
sentiment_analysis_df.sample()

#### Use sample data to test the code

In [None]:
sample_sentiment_analysis_df=sentiment_analysis_df.sample(300)

In [None]:
#get rid of links and hashtags
sample_sentiment_analysis_df["text"] = sample_sentiment_analysis_df["text"].apply(lambda x : ' '.join([s for s in x.split(' ') if s.find('@') == -1 and s.find('www') == -1 and s.find('https') == -1]))

#get rid of non-ascii characters
sample_sentiment_analysis_df = sample_sentiment_analysis_df.replace(r'\W+', ' ', regex=True)


# get rid of tweet duplicates to not bias the sentiment analysis
sample_sentiment_analysis_df = sample_sentiment_analysis_df.drop_duplicates(subset=['text'])
sample_sentiment_analysis_df

In [None]:
# Tokenize the body text
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Sample text
sample_sentiment_analysis_df['tokenized_body'] = sample_sentiment_analysis_df['text'].apply(word_tokenize)

# Tokenize
sample_sentiment_analysis_df.sample()

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stopwords.words('english')]

sample_sentiment_analysis_df['tokenized_body'] = sample_sentiment_analysis_df['tokenized_body'].apply(remove_stopwords)

sample_sentiment_analysis_df.sample()

### In this code, we're using SpaCy for tokenization and part-of-speech tagging, and TextBlob for sentiment analysis. We define a custom analyze_sentiment function that takes a text input, processes it with SpaCy and TextBlob, and returns the sentiment label and polarity score.

In [None]:
import spacy
from textblob import TextBlob

nlp = spacy.load('en_core_web_sm')

def analyze_sentiment(text):
    doc = nlp(text)
    blob = TextBlob(text)
    
    polarity = blob.sentiment.polarity
    
    if polarity > 0:
        sentiment = 'positive'
    elif polarity < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    
    return sentiment, polarity

# Assuming df is your DataFrame with 'text' column
sample_sentiment_analysis_df['sentiment'], sample_sentiment_analysis_df['polarity'] = zip(*sample_sentiment_analysis_df['text'].apply(analyze_sentiment))

df_sentiment = sample_sentiment_analysis_df.sort_values('polarity').reset_index(drop=True)
df_sentiment.sample()


In [None]:
df_sentiment["sentiment"].value_counts()

In [None]:
positive_count = df_sentiment[df_sentiment['polarity'] > 0]['polarity'].count()
negative_count = df_sentiment[df_sentiment['polarity'] < 0]['polarity'].count()
neutral_count = df_sentiment[df_sentiment['polarity'] == 0]['polarity'].count()

print("Positive count:", positive_count)
print("Negative count:", negative_count)
print("Neutral count:", neutral_count)

In [None]:
# Let us extract the most common words found in both positive and negative positive reviews
df_neg = df_sentiment[df_sentiment['polarity'] < 0]
df_pos = df_sentiment[df_sentiment['polarity'] > 0]

In [None]:
# Amount of positive and negative reviews we have been inferring from our data,
# Let's a have general idea about the opinion of the public regarding tweets:
print("Negative reviews", len(df_neg))
print("Positive reiews", len(df_pos))

In [None]:
from collections import Counter

# Extracting the most common words found in both positive and negative positive reviews:
positive_words = pd.DataFrame([dict(Counter(' '.join(df_pos['text'].values.tolist()).split(' ')))]).T.sort_values(0, ascending=False)[0:100].index

negative_words = pd.DataFrame([dict(Counter(' '.join(df_neg['text'].values.tolist()).split(' ')))]).T.sort_values(0, ascending=False)[0:100].index

In [None]:
display("Most common words in POSITIVE tweets on ETH:",positive_words)
display("Most common words in NEGATIVE tweets on ETH:",negative_words)

## Another way to clean the data
#### Before analyzing the content of the tweets, we are first going to preprocess our data even more. There are several preprocessing strategies we are going to:

#Lemmatize each word
#Delete extra characters
#Remove stop words

In [None]:
import spacy
from nltk.tokenize import RegexpTokenizer
import re
from nltk.corpus import stopwords

# Load the English language model
nlp = spacy.load("en_core_web_sm")

def preprocess(sentence, stemming=False, lemmatizing=False):
    global counter
    counter += 1
    if counter % 100 == 0:
        pass
        # print(counter)

    sentence = str(sentence)
    tokenizer = RegexpTokenizer(r'\w+')

    sentence = sentence.lower()
    sentence = sentence.replace('{html}', "")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '', cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokens = tokenizer.tokenize(rem_num)

    filtered_words = [w for w in tokens if len(w) > 2 and w not in stopwords.words('english')]

    if lemmatizing:
        doc = nlp(" ".join(filtered_words))
        lemma_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space and not token.is_stop]
        return " ".join(lemma_words)

    if stemming:
        stemmer = PorterStemmer()
        stem_words = [stemmer.stem(w) for w in filtered_words]
        return " ".join(stem_words)

    return " ".join(filtered_words)

# Example usage
df_sentiment['text'] = df_sentiment['text'].apply(lambda x: preprocess(x, stemming=False, lemmatizing=True))
df_sentiment

### Stock price data cleaning

In [None]:
#import tsla stock price and filter the date
tsla_csv = Path("TSLA.csv")
tsla_df = pd.read_csv(tsla_csv, index_col="Date", parse_dates=True)
tsla_df.sort_index()
tsla_df['Ticker'] = 'TSLA'
tsla_df = tsla_df[['Ticker', 'Close']]
tsla_df.index = pd.to_datetime(tsla_df.index)
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'
tsla_df = tsla_df[(tsla_df.index >= start_date) & (tsla_df.index <= end_date)]
tsla_df.head()

In [None]:
#import amzn stock price and filter the date
amzn_csv = Path("AMZN.csv")
amzn_df = pd.read_csv(amzn_csv, index_col="Date", parse_dates=True)
amzn_df.sort_index()
amzn_df['Ticker'] = 'AMZN'
amzn_df = amzn_df[['Ticker', 'Close']]
amzn_df.index = pd.to_datetime(amzn_df.index)
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'
amzn_df = amzn_df[(amzn_df.index >= start_date) & (amzn_df.index <= end_date)]
amzn_df.head()

In [None]:
#import aapl stock price and filter the date
aapl_csv = Path("AAPL.csv")
aapl_df = pd.read_csv(aapl_csv, index_col="Date", parse_dates=True)
aapl_df.sort_index()
aapl_df['Ticker'] = 'AAPL'
aapl_df = aapl_df[['Ticker', 'Close']]
aapl_df.index = pd.to_datetime(aapl_df.index)
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'
aapl_df = aapl_df[(aapl_df.index >= start_date) & (aapl_df.index <= end_date)]
aapl_df.head()