In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path

In [None]:
# Load Company_Tweet.csv and index the rows by tweet_id.
company_tweet_csv = Path("Company_Tweet.csv")
Company_Tweet_df = pd.read_csv(company_tweet_csv).set_index('tweet_id')
Company_Tweet_df.head()

In [None]:
# Load Tweet.csv and index the rows by tweet_id.
tweet_csv = Path("Tweet.csv")
tweet_df = pd.read_csv(tweet_csv).set_index('tweet_id')
tweet_df.head()

In [None]:
# Merge the two frames into one, matching rows on their shared index.
Company_tweet_result = pd.merge(
    tweet_df,
    Company_Tweet_df,
    left_index=True,
    right_index=True,
)

In [None]:
# Select the relevant tickers and columns.
# A single .loc selection followed by .copy() makes selected_df an independent
# frame, so later column assignments on it do not write into a slice of
# Company_tweet_result (pandas' SettingWithCopyWarning).
ticker_mask = Company_tweet_result['ticker_symbol'].isin(['AAPL', 'TSLA', 'TWTR'])
selected_df = Company_tweet_result.loc[ticker_mask, ['ticker_symbol', 'post_date', 'body']].copy()
selected_df.sample()

In [None]:
# Convert post_date from Unix epoch seconds to pandas datetimes.
# assign() returns a new frame instead of writing a column into what may be a
# slice of another frame, which avoids pandas' SettingWithCopyWarning.
selected_df = selected_df.assign(
    post_date=pd.to_datetime(selected_df['post_date'], unit='s')
)
selected_df.sample()

In [None]:
# Keep only the tweets posted inside the analysis window (bounds inclusive).
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'

in_window = (selected_df['post_date'] >= start_date) & (selected_df['post_date'] <= end_date)
# .copy() so that columns added to stock_tweet_df later are written to an
# independent frame rather than to a slice of selected_df.
stock_tweet_df = selected_df.loc[in_window].copy()

stock_tweet_df.head()

In [None]:
!pip install nltk


In [None]:
# Tokenize the body text
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
# Newer NLTK releases look up the 'punkt_tab' resource for word_tokenize,
# so fetch it as well.
nltk.download('punkt_tab')

# Split every tweet body into word tokens. assign() builds a new frame rather
# than writing a column into what may be a slice of another frame.
stock_tweet_df = stock_tweet_df.assign(
    tokenized_body=stock_tweet_df['body'].apply(word_tokenize)
)

# Preview a few rows instead of printing the whole frame.
stock_tweet_df.head()

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

# English stop words, loaded from NLTK on first use and reused afterwards.
_ENGLISH_STOPWORDS = None


def remove_stopwords(tokens, stop_words=None):
    """Return the tokens whose lower-cased form is not a stop word.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Optional collection of lower-case stop words. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        list: The tokens that are kept, in their original order and casing.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        # Load the corpus once and keep it as a set: calling
        # stopwords.words('english') inside the comprehension re-read the list
        # and scanned it linearly for every single token.
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [word for word in tokens if word.lower() not in stop_words]

# Drop stop words from every token list. assign() builds a new frame rather
# than writing a column into what may be a slice of another frame.
stock_tweet_df = stock_tweet_df.assign(
    tokenized_body_no_stopwords=stock_tweet_df['tokenized_body'].apply(remove_stopwords)
)

# Preview a few rows instead of printing the whole frame.
stock_tweet_df.head()

In [None]:
# Import the AAPL stock prices and keep the closing price inside the date window.
aapl_csv = Path("AAPL.csv")
aapl_df = pd.read_csv(aapl_csv, index_col="Date", parse_dates=True)
# sort_index() returns a new frame, so its result has to be assigned; the bare
# call left the rows in file order.
aapl_df = aapl_df.sort_index()
aapl_df['Ticker'] = 'AAPL'
aapl_df = aapl_df[['Ticker', 'Close']]
# No-op when read_csv already parsed the index; converts it otherwise.
aapl_df.index = pd.to_datetime(aapl_df.index)
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'
aapl_in_window = (aapl_df.index >= start_date) & (aapl_df.index <= end_date)
aapl_df = aapl_df[aapl_in_window]
aapl_df.head()

In [None]:
# Import the TSLA stock prices and keep the closing price inside the date window.
tsla_csv = Path("TSLA.csv")
tsla_df = pd.read_csv(tsla_csv, index_col="Date", parse_dates=True)
# sort_index() returns a new frame, so its result has to be assigned; the bare
# call left the rows in file order.
tsla_df = tsla_df.sort_index()
tsla_df['Ticker'] = 'TSLA'
tsla_df = tsla_df[['Ticker', 'Close']]
# No-op when read_csv already parsed the index; converts it otherwise.
tsla_df.index = pd.to_datetime(tsla_df.index)
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'
tsla_in_window = (tsla_df.index >= start_date) & (tsla_df.index <= end_date)
tsla_df = tsla_df[tsla_in_window]
tsla_df.head()

In [1]:
# Import the TWER stock prices and keep the closing price inside the date window.
twer_csv = Path("TWER.csv")
twer_df = pd.read_csv(twer_csv, index_col="Date", parse_dates=True)
# sort_index() returns a new frame, so its result has to be assigned; the bare
# call left the rows in file order.
twer_df = twer_df.sort_index()
# NOTE(review): the tweet filter earlier in this notebook uses the symbol
# 'TWTR'; confirm whether this label should match it before joining on ticker.
twer_df['Ticker'] = 'TWER'
# Select from twer_df itself. This line used to read tsla_df, which replaced
# the data loaded above with the TSLA prices.
twer_df = twer_df[['Ticker', 'Close']]
# No-op when read_csv already parsed the index; converts it otherwise.
twer_df.index = pd.to_datetime(twer_df.index)
start_date = '2019-01-01 00:00:00'
end_date = '2022-12-31 23:59:59'
twer_in_window = (twer_df.index >= start_date) & (twer_df.index <= end_date)
twer_df = twer_df[twer_in_window]
twer_df.head()

NameError: name 'Path' is not defined

In [None]:
# Load the Ethereum tweets and drop rows with missing values.
# The read_csv options used to sit inside the Path(...) call, so pandas never
# received them (and Python 3.12 deprecates passing keyword arguments to Path).
# 'date' is parsed here but kept as a regular column, because the following
# cell selects it by name before turning it into the index.
ethereum_tweets = pd.read_csv(
    Path("Ethereum_tweets.csv"),
    parse_dates=["date"],
).dropna()

ethereum_tweets.head()


In [None]:
# Narrow the frame to the date, text and hashtags columns, indexed by date.
columns_to_keep = ["date", "text", "hashtags"]
ethereum_tweets = ethereum_tweets.loc[:, columns_to_keep].set_index("date")
ethereum_tweets


In [None]:
def _drop_mentions_and_links(text):
    """Return `text` without the space-separated tokens containing '@', 'www' or 'http'."""
    return ' '.join(
        token for token in text.split(' ')
        if '@' not in token and 'www' not in token and 'http' not in token
    )


# Get rid of @-mentions and links. Matching on 'http' (instead of only
# 'https') also removes plain http:// URLs.
ethereum_tweets["text"] = ethereum_tweets["text"].apply(_drop_mentions_and_links)

# Collapse every run of non-word characters (punctuation, symbols, whitespace)
# into a single space. replace() is called on the whole frame, so this applies
# to all of its string values, not only to the "text" column.
ethereum_tweets = ethereum_tweets.replace(r'\W+', ' ', regex=True)


# get rid of tweet duplicates to not bias the sentiment analysis
ethereum_tweets = ethereum_tweets.drop_duplicates(subset=['text'])
ethereum_tweets

In [None]:
!pip install textblob

!pip install spacy
!python -m textblob.download_corpora
!python -m spacy download en_core_web_sm

In [None]:
# In this code, TextBlob provides the sentiment analysis. We define a custom analyze_sentiment function that takes a text input and returns a sentiment label together with the TextBlob polarity score. The spaCy model is loaded as well, but its output does not affect the label or the score.

In [None]:
import spacy
from textblob import TextBlob

nlp = spacy.load('en_core_web_sm')

def analyze_sentiment(text):
    """Classify `text` by the sign of its TextBlob polarity score.

    Args:
        text: The text to score; it is passed directly to ``TextBlob``.

    Returns:
        tuple: ``(sentiment, polarity)`` where ``polarity`` is
        ``TextBlob(text).sentiment.polarity`` and ``sentiment`` is
        ``'positive'`` when the polarity is above zero, ``'negative'`` when it
        is below zero and ``'neutral'`` otherwise.
    """
    # The spaCy parse that used to run here was never read, so it only added a
    # full pipeline pass per call; the result comes from TextBlob alone.
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0:
        sentiment = 'positive'
    elif polarity < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    return sentiment, polarity

# Score every tweet in ethereum_tweets['text'] once, then split the
# (label, polarity) pairs into two columns. Building the columns from a list
# (rather than unpacking zip(*...)) also works when the frame has no rows,
# where the two-target unpacking raises a ValueError.
scored_tweets = [analyze_sentiment(text) for text in ethereum_tweets['text']]
ethereum_tweets['sentiment'] = [label for label, _ in scored_tweets]
ethereum_tweets['polarity'] = np.asarray([score for _, score in scored_tweets], dtype=float)

# Order the tweets from lowest to highest polarity; reset_index(drop=True)
# discards the previous index.
df_sentiment = ethereum_tweets.sort_values('polarity').reset_index(drop=True)
df_sentiment


In [None]:
# Number of tweets per sentiment label.
sentiment_labels = df_sentiment["sentiment"]
sentiment_labels.value_counts()

In [None]:
# Count the tweets on each side of zero polarity.
polarity_scores = df_sentiment['polarity']
positive_count = polarity_scores[polarity_scores > 0].count()
negative_count = polarity_scores[polarity_scores < 0].count()
neutral_count = polarity_scores[polarity_scores == 0].count()

print("Positive count:", positive_count)
print("Negative count:", negative_count)
print("Neutral count:", neutral_count)

In [None]:
# Before analyzing the content of the tweets, we first preprocess the data further. We are going to apply the following preprocessing steps:

#Lemmatize each word
#Delete extra characters
#Remove stop words

In [None]:
import spacy
from nltk.tokenize import RegexpTokenizer
import re
from nltk.corpus import stopwords

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Patterns are compiled once here instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile('[0-9]+')
_WORD_RE = re.compile(r'\w+')

# English stop words, loaded from NLTK on first use and reused afterwards.
_PREPROCESS_STOP_WORDS = None


def preprocess(sentence, stemming=False, lemmatizing=False, stop_words=None):
    """Normalise one piece of text into a space-joined string of cleaned words.

    The input is converted to ``str`` and lower-cased; the literal ``{html}``,
    ``<...>`` tags, ``http...`` URLs and digits are removed; the rest is split
    into ``\\w+`` tokens; tokens of two characters or fewer and stop words are
    dropped.

    Args:
        sentence: The value to clean; any object is accepted and passed
            through ``str()``.
        stemming: When true (and ``lemmatizing`` is false), return the Porter
            stems of the remaining words.
        lemmatizing: When true, run the remaining words through the
            module-level spaCy pipeline ``nlp`` and return the lemmas of the
            tokens that are not punctuation, whitespace or spaCy stop words.
            Takes precedence over ``stemming``.
        stop_words: Optional collection of lower-case stop words. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        str: The cleaned words joined by single spaces.
    """
    # The previous version incremented a global `counter` that was never
    # initialised, so the very first call raised NameError; its only consumer
    # was a commented-out progress print, so the counter is gone.
    global _PREPROCESS_STOP_WORDS
    if stop_words is None:
        # Load the corpus once and keep it as a set: it used to be re-read and
        # scanned linearly for every single token.
        if _PREPROCESS_STOP_WORDS is None:
            _PREPROCESS_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_STOP_WORDS

    text = str(sentence).lower().replace('{html}', "")
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # Same tokens as RegexpTokenizer(r'\w+').tokenize(text), without building
    # a tokenizer object on every call.
    tokens = _WORD_RE.findall(text)

    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]

    if lemmatizing:
        doc = nlp(" ".join(filtered_words))
        lemma_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space and not token.is_stop]
        return " ".join(lemma_words)

    if stemming:
        # Imported here because nothing else in the notebook brings
        # PorterStemmer into scope; this branch used to raise NameError.
        from nltk.stem import PorterStemmer

        stemmer = PorterStemmer()
        stem_words = [stemmer.stem(w) for w in filtered_words]
        return " ".join(stem_words)

    return " ".join(filtered_words)

# Clean and lemmatize every tweet, overwriting the text column in place.
cleaned_text = df_sentiment['text'].apply(preprocess, stemming=False, lemmatizing=True)
df_sentiment['text'] = cleaned_text
df_sentiment

In [None]:
# Let us extract the most common words found in both the positive and the negative tweets

In [None]:
# Split the scored tweets by the sign of their polarity; rows with a polarity
# of exactly zero end up in neither frame.
is_negative = df_sentiment['polarity'].lt(0)
is_positive = df_sentiment['polarity'].gt(0)
df_neg = df_sentiment.loc[is_negative]
df_pos = df_sentiment.loc[is_positive]

In [None]:
# Number of negative and positive tweets inferred from the data; this gives a
# general idea of the public's opinion in the Ethereum tweets.
print("Negative reviews", len(df_neg))
# Label typo ("reiews") fixed so both printed lines read consistently.
print("Positive reviews", len(df_pos))

In [None]:
from collections import Counter


def most_common_words(texts, top_n=100):
    """Return the `top_n` most frequent whitespace-separated words in `texts`.

    Args:
        texts: Iterable of strings.
        top_n: Maximum number of words to return.

    Returns:
        pandas.Index: The words, ordered from most to least frequent.
    """
    # str.split() without an argument drops empty tokens. Joining the rows and
    # calling split(' ') produced an empty-string "word" for every row whose
    # text was empty, and that could show up among the most common words.
    word_counts = Counter(word for text in texts for word in text.split())
    return pd.Index([word for word, _ in word_counts.most_common(top_n)])


# Extracting the most common words found in the positive and in the negative tweets:
positive_words = most_common_words(df_pos['text'])

negative_words = most_common_words(df_neg['text'])

In [None]:
# Show each heading followed by its list of most common words.
word_listings = (
    ("Most common words in POSITIVE tweets on ETH:", positive_words),
    ("Most common words in NEGATIVE tweets on ETH:", negative_words),
)
for heading, common_words in word_listings:
    display(heading, common_words)