In [15]:
import datetime
import functools
import re
import string

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob


In [17]:
@functools.lru_cache(maxsize=1)
def _sentiment_resources():
    """Build the NLTK helpers shared by every tweet, once per kernel session.

    Returns:
        tuple: ``(stop_words, stemmer, lemmatizer, vader)`` where
        ``stop_words`` is a frozenset of the English stop-word list.
    """
    return (
        frozenset(nltk.corpus.stopwords.words('english')),
        nltk.PorterStemmer(),
        nltk.WordNetLemmatizer(),
        SentimentIntensityAnalyzer(),
    )


def tweet_sentiment_analysis(tweet):
    """Clean one tweet and score it with TextBlob and VADER.

    Cleaning steps, in order: strip URLs, replace CRLF sequences with a
    space, strip ASCII punctuation, lower-case, tokenize, drop English stop
    words, stem, lemmatize, and re-join the tokens with single spaces.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        pandas.DataFrame: A single-row frame (index 0) with the columns
        ``originalTweets``, ``cleanedTweets``, ``sentiment``
        ("negative" / "positive" / "neutral", decided by comparing VADER's
        ``neg`` and ``pos`` scores), ``polarity``, ``subjectivity``
        (TextBlob), ``neg``, ``neu``, ``pos``, ``compound`` (VADER),
        ``text_len`` (characters in the cleaned text) and
        ``text_word_count`` (whitespace-separated words in the cleaned text).
    """
    stop_words, stemmer, lemmatizer, vader = _sentiment_resources()

    # URLs have to go first: once punctuation is stripped, "://" and "." are
    # gone and a URL can no longer be recognised.
    text = re.sub(r'(?:www\.\S+|https?://\S+)', ' ', tweet)
    text = re.sub("\r\n", " ", text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # The stop-word list is lower-case, so normalise case before filtering.
    text = text.lower()

    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    cleaned = " ".join(lemmatizer.lemmatize(word) for word in tokens)

    blob_sentiment = TextBlob(cleaned).sentiment
    score = vader.polarity_scores(cleaned)
    neg, pos = score['neg'], score['pos']

    if neg > pos:
        sentiment = "negative"
    elif pos > neg:
        sentiment = "positive"
    else:
        sentiment = "neutral"

    return pd.DataFrame([{
        "originalTweets": tweet,
        "cleanedTweets": cleaned,
        "sentiment": sentiment,
        "polarity": blob_sentiment.polarity,
        "subjectivity": blob_sentiment.subjectivity,
        "neg": neg,
        "neu": score['neu'],
        "pos": pos,
        "compound": score['compound'],
        "text_len": len(cleaned),
        "text_word_count": len(cleaned.split()),
    }])


In [18]:
# Load the two input tables: the tweet dump and the ADAUSDT table.
twitter_data = pd.read_csv("twitter_tweets_final_1.csv")
ADA_data = pd.read_csv("../preparedData/ADAUSDT.csv")

# Keep the open/close time columns at hand, exactly as read from the CSV.
openTime, closeTime = ADA_data["open_time"], ADA_data["close_time"]


In [19]:
CANDLE_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
TWEET_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"


def _to_utc_timestamp(text, fmt):
    """Parse ``text`` with ``fmt`` and return POSIX seconds, reading it as UTC.

    A naive ``datetime.timestamp()`` would interpret the value in the
    machine's local timezone, which makes the result host-dependent and
    distorts hours around daylight-saving transitions.
    """
    parsed = datetime.datetime.strptime(text, fmt)
    return parsed.replace(tzinfo=datetime.timezone.utc).timestamp()


# Convert the open/close times to POSIX timestamps. Reading straight from
# ADA_data (rather than from openTime/closeTime themselves) keeps this cell
# re-runnable.
# NOTE(review): assumes open_time/close_time are UTC wall-clock strings, like
# the "Z"-suffixed tweet timestamps - confirm against the data source.
openTime = [_to_utc_timestamp(time, CANDLE_TIME_FORMAT)
            for time in ADA_data["open_time"]]
closeTime = [_to_utc_timestamp(time, CANDLE_TIME_FORMAT)
             for time in ADA_data["close_time"]]

# Convert the tweets' Timestamp column in place. Skip the conversion when the
# column is already numeric, i.e. this cell has been run before.
if not pd.api.types.is_numeric_dtype(twitter_data.Timestamp):
    twitter_data.Timestamp = [_to_utc_timestamp(time, TWEET_TIME_FORMAT)
                              for time in twitter_data.Timestamp]

In [20]:
# Per-tweet columns returned by tweet_sentiment_analysis that are summed over
# every tweet falling inside one open/close window.
SUMMED_COLUMNS = ['polarity', 'subjectivity', 'neg', 'neu', 'pos',
                  'compound', 'text_len', 'text_word_count']

# For each row of ADA_data, count the tweets posted strictly between its open
# and close time and add up their sentiment scores.
for i in range(len(openTime)):
    in_window = (
        (twitter_data.Timestamp > openTime[i])
        &
        (twitter_data.Timestamp < closeTime[i])
    )
    tweets = twitter_data[in_window]
    tweet_count = len(tweets)

    # Start every total at 0 so rows without tweets are written as zeros.
    totals = dict.fromkeys(SUMMED_COLUMNS, 0)

    # Iterate over the values directly (Series.iteritems() no longer exists in
    # pandas 2.x). Rows with missing text are skipped for scoring because the
    # cleaner expects a string; they still count towards tweet_count.
    for text in tweets["Text"].dropna():
        scored = tweet_sentiment_analysis(text)
        for column in SUMMED_COLUMNS:
            # The result is a one-row frame; take the scalar, not the Series.
            totals[column] += scored[column].iloc[0]

    # Adding new data in new columns of tweets in ADA_data
    ADA_data.loc[i, 'tweet_count'] = tweet_count
    for column, total in totals.items():
        ADA_data.loc[i, column] = total
        

In [21]:
# Write ADA_data out as a CSV file: header row included, row index omitted.
output_path = "../preparedData/ADAUSDT_twitter_sentiment.csv"
ADA_data.to_csv(output_path, header=True, index=False)