In [1]:
# Libraries used by the cells below.
import pandas as pd
import re
import nltk
import numpy as np
# Fetch the NLTK stop-word corpus; this call runs every time the cell is executed.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# NOTE(review): `datetime` is not referenced in the cells visible here - confirm it is still needed.
from datetime import datetime
# Fetch the lexicon needed by the VADER SentimentIntensityAnalyzer imported below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# NOTE(review): `multiprocessing` is not referenced in the cells visible here - confirm it is still needed.
import multiprocessing

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\tomma\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
# Load the scraped tweets and index them by their parsed 'Datetime' column.
tweets_csv = "Bitcoin_from_2021-04-04_23-59-21_to_2021-04-04_00-00-00.csv"
df = pd.read_csv(tweets_csv).set_index('Datetime')
# The CSV stores timestamps as text; convert the index to real datetimes.
df.index = pd.to_datetime(df.index)

In [14]:
### Data Cleaning definition
# Every pattern below matches text that is replaced by a space before scoring.
# Mentions: \w also covers the underscores allowed in handles, so '@some_user'
# is removed as a whole instead of leaving 'user' behind.
pat1 = r'@\w+'
# URLs: consume everything up to the next whitespace so that '-', '?', '=',
# '_' or '%' inside a link do not leave fragments that look like words.
pat2 = r'https?://\S+'
combined_pat = r'|'.join((pat1, pat2))
pat3 = r'[^a-zA-Z]'  # any remaining character that is not an ASCII letter
# Order matters: mentions and URLs are tried before the single-character
# fallback, otherwise '@' and ':' would be stripped first and the rest kept.
combined_pat2 = r'|'.join((combined_pat, pat3))

In [15]:
### Starting the cleaning
# For every tweet: blank out mentions/URLs/non-letters, lower-case, drop English
# stop words and stem what is left. The result is one cleaned string per row of
# df, in row order.
ps = PorterStemmer()
# Build the stop-word set once; rebuilding it for every word dominated the runtime.
stop_words = set(stopwords.words('english'))
total_tweets = len(df)
cleaned_tweets = []
# Iterate over the raw values so the lookup never goes through the datetime
# index (integer keys on a non-integer index are a deprecated positional fallback).
for position, raw_text in enumerate(df['Text'].to_numpy(), start=1):
    if position % 10000 == 0:
        print("Tweets %d of %d has been processed" % (position, total_tweets))
    # A missing text is read from CSV as NaN (a float) and would crash re.sub;
    # treat it as an empty tweet so the output keeps one entry per row.
    text = '' if pd.isna(raw_text) else str(raw_text)
    words = re.sub(combined_pat2, ' ', text).lower().split()
    cleaned_tweets.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [16]:
# Instantiate the VADER sentiment intensity analyzer; its `polarity_scores`
# method is applied to every cleaned tweet in the next cell.
vader = SentimentIntensityAnalyzer()

In [17]:
# Store the cleaned text next to the raw tweets, then score each cleaned tweet.
# NOTE(review): the text scored here is stemmed and stripped of punctuation;
# confirm that this is the intended input for the VADER lexicon.
df['cleaned_tweets'] = np.asarray(cleaned_tweets)
# One dict of polarity scores per tweet, in row order.
scores = [vader.polarity_scores(cleaned_text) for cleaned_text in df['cleaned_tweets']]

In [18]:
# Convert the 'scores' list of dicts into a DataFrame that shares df's index
scores_df = pd.DataFrame(scores, index=df.index)
# Attach the score columns to the tweets row by row (positionally).
# df.join(scores_df) matches on index labels, so tweets that share a timestamp
# would be joined many-to-many: rows get duplicated and paired with the scores
# of other tweets, which inflates every sum computed afterwards.
overlapping = scores_df.columns.intersection(df.columns)
if len(overlapping) > 0:
    # Keep the failure mode of the former join instead of silently overwriting.
    raise ValueError("columns overlap but no suffix specified: %s" % list(overlapping))
parsed_and_scored_news = df.assign(
    **{name: scores_df[name].to_numpy() for name in scores_df.columns}
)

In [19]:
# Keep only the compound sentiment score; the datetime index is carried along.
compact_df = parsed_and_scored_news[['compound']].copy()

In [20]:
# Turn the index of compact_df into one 'YYYY-MM-DD' string per row.
timestamps = pd.Series(pd.to_datetime(compact_df.index, errors='coerce'))
# Missing/unparseable timestamps become NaT; map them to a far-future sentinel
# day so they end up in their own, easily recognisable group.
# The filled result is assigned explicitly: calling fillna(..., inplace=True) on
# a column selected from the frame is chained assignment and does not update
# the frame when pandas Copy-on-Write is active.
timestamps = timestamps.fillna("2099-01-01 00:00:00")
extended_dates = pd.DataFrame({'timestamp': timestamps})
# Vectorised formatting instead of a Python-level strftime call per row.
dates = extended_dates['timestamp'].dt.strftime('%Y-%m-%d')

In [21]:
# Tag every compound score with its calendar day, then total the scores per day.
# `dates` is attached positionally (as a plain array), not aligned by index.
compact_df = compact_df[['compound']].assign(date=np.asarray(dates))
final_df = compact_df.groupby(by=['date'])['compound'].sum()

In [22]:
# First and last day present in the aggregated result; both are used to build
# the name of the exported file.
start_day, end_day = final_df.index[0], final_df.index[-1]
print(start_day, end_day)

2021-04-04 2021-04-04


In [12]:
# Write the per-day compound totals to a CSV named after the covered date range.
output_csv = f"BTCTI_from_{start_day}_to_{end_day}.csv"
final_df.to_csv(output_csv)