In [1]:
import pandas as pd
import os
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pre_processing import clean
from pre_processing import token_stop_pos
from pre_processing import lemmatize

[nltk_data] Downloading package punkt to /Users/rupika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rupika/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rupika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rupika/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Create Lemma of a word/ POS_data

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

In [3]:
# Returns Vader Sentiment(compound, pos, neu, neg) Scores given the sentence.
analyzer = SentimentIntensityAnalyzer()
def get_vaderSentimentScores(title):
    vs = analyzer.polarity_scores(title) 
    return vs['compound'], vs['pos'], vs['neu'], vs['neg']

In [4]:
# Creates a new DataFrame with columns 'tweet_id','Vader Sentiment Score', 'pos', 'neu', 'neg' given a Dataframe with lemma words with tweet_id
def vadersentimentanalysis(df):
    vs_scores = []
    for index, row in df.iterrows():
        lemma = row['Lemma']
        tweet_id = row['tweet_id']
        result = [tweet_id]
        compound, pos, neu, neg  = get_vaderSentimentScores(lemma)
        result.append(compound)
        result.append(pos)
        result.append(neu)
        result.append(neg)
        vs_scores.append(result)
    score_df = pd.DataFrame(vs_scores, columns = ['tweet_id','Vader Sentiment Score', 'pos', 'neu', 'neg'])
    return score_df
    

In [5]:
# Setting Global variables
total_rows = 0

In [6]:
# Function to set Global Variable total_rows
def set_total_rows(rows):
    global total_rows
    total_rows = rows

In [7]:
# Function to fet Global Variable total_rows
def get_total_rows():
    return total_rows

In [8]:
# Function to get indexe range given number of rows by splitting them into rnage of 100000
def get_index_range():
    total_rows = get_total_rows()
    indexes = list(range(0,total_rows,100000))
    indexes.append(total_rows)
    return indexes

In [9]:
stock_tweets = pd.read_csv('./data/stocks_tweet.csv')
stock_tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,date,writer,post_date,body,comment_num,retweet_num,like_num
0,550441509175443456,AAPL,2014-12-31,VisualStockRSRC,1420070457,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1
1,550441672312512512,AAPL,2014-12-31,KeralaGuy77,1420070496,Insanity of today weirdo massive selling. $aap...,0,0,0
2,550441732014223360,AMZN,2014-12-31,DozenStocks,1420070510,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0
3,550442977802207232,TSLA,2014-12-31,ShowDreamCar,1420070807,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1
4,550443807834402816,AAPL,2014-12-31,i_Know_First,1420071005,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1


In [10]:
# Cleans the tweets
stock_tweets['cleaned_tweet'] = stock_tweets['body'].apply(clean)
stock_tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,date,writer,post_date,body,comment_num,retweet_num,like_num,cleaned_tweet
0,550441509175443456,AAPL,2014-12-31,VisualStockRSRC,1420070457,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1,lx made on AAPL Check it out http profit ly Mn...
1,550441672312512512,AAPL,2014-12-31,KeralaGuy77,1420070496,Insanity of today weirdo massive selling. $aap...,0,0,0,Insanity of today weirdo massive selling aapl ...
2,550441732014223360,AMZN,2014-12-31,DozenStocks,1420070510,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0,S P Stocks Performance HD LOW SBUX TGT DVN IBM...
3,550442977802207232,TSLA,2014-12-31,ShowDreamCar,1420070807,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1,GM TSLA Volkswagen Pushes Record Recall Tally...
4,550443807834402816,AAPL,2014-12-31,i_Know_First,1420071005,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1,Swing Trading Up To Return In Days http ow ly ...


In [11]:
stock_tweets['ticker_symbol'].value_counts()

AAPL     1425013
TSLA     1096868
AMZN      718715
GOOG      392569
MSFT      375711
GOOGL     327569
Name: ticker_symbol, dtype: int64

In [12]:
# Since the stock_tweets is huge file to make preprocessing easy
# we are splitting them into smaller files of size 100000 rows for a given company

def create_company_tweet_files(company_name):
    display(print('Inside the create_company_tweet_files'))
    display(print(f'Reading {company_name} tweets from stock_tweets'))
    company_tweet = stock_tweets.loc[stock_tweets['ticker_symbol'] == company_name]
    set_total_rows(company_tweet.shape[0])
    total_rows = get_total_rows()
    display(print(f'Total {company_name} tweets size is : {total_rows}'))
    indexes = get_index_range()
    display(print(f'Splitting {company_name} tweets as follows'))
    indexes
    for i in range(len(indexes)-1):
        display(print(f'inside the range {i} in {indexes[i]} - {indexes[i+1]}'))
        file_name = './data/' + company_name + '_tweet_' + str(i) + '.csv'
        company_tweet.iloc[indexes[i]:indexes[i+1]].to_csv(file_name)
        display(print(f'created filename : {file_name}'))

In [13]:
# Set the company_name for which the score files need to be created
# For our analysis we used 'AMZN' and 'TSLA'
company_name = 'AMZN'
# company_name = 'TSLA'

In [14]:
create_company_tweet_files(company_name)

Inside the create_company_tweet_files


None

Reading AMZN tweets from stock_tweets


None

Total AMZN tweets size is : 718715


None

Splitting AMZN tweets as follows


None

inside the range 0 in 0 - 100000


None

created filename : ./data/AMZN_tweet_0.csv


None

inside the range 1 in 100000 - 200000


None

created filename : ./data/AMZN_tweet_1.csv


None

inside the range 2 in 200000 - 300000


None

created filename : ./data/AMZN_tweet_2.csv


None

inside the range 3 in 300000 - 400000


None

created filename : ./data/AMZN_tweet_3.csv


None

inside the range 4 in 400000 - 500000


None

created filename : ./data/AMZN_tweet_4.csv


None

inside the range 5 in 500000 - 600000


None

created filename : ./data/AMZN_tweet_5.csv


None

inside the range 6 in 600000 - 700000


None

created filename : ./data/AMZN_tweet_6.csv


None

inside the range 7 in 700000 - 718715


None

created filename : ./data/AMZN_tweet_7.csv


None

In [15]:
total_rows

718715

In [16]:
folder_name = './data/'+ company_name
os.makedirs(folder_name, exist_ok = True)

In [None]:
t = get_total_rows()
t

718715

In [None]:
indexes = get_index_range()
indexes

[0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 718715]

In this section we do the following : 
 1) Read the smaller file
 2) Call `token_stop_pos` on `cleaned_tweet` which tokenizes, removes stop words and gives POS(Parts of Speech).
 3) Call `lemmatize` on `POS tagged` which gives lemma of the column.
 4) Call `vadersentimentanalysis` which returns vader scores for the `Lemma`
 5) Writing the final DataFrame into a csv file to its designated folder

In [19]:

for i in range(len(indexes)-1):
    file_name = './data/' + company_name + '_tweet_' + str(i) + '.csv'
    display(print(f'Reading filename : {file_name}'))
    company_df = pd.read_csv(file_name)
    company_df = company_df.drop(columns=['Unnamed: 0'])
    display(print(f'POS tagging started'))
    company_df['POS tagged'] = company_df['cleaned_tweet'].apply(token_stop_pos)
    display(print(f'POS tagging ended'))
    display(print(f'Lemma started'))
    company_df['Lemma'] = company_df['POS tagged'].apply(lemmatize)
    display(print(f'Lemma ended'))
    display(print(f'Sentiment score started'))
    t_df = company_df[['tweet_id','Lemma']]
    vs_df = vadersentimentanalysis(t_df)
    company_df = company_df.set_index('tweet_id').join(vs_df.set_index('tweet_id'))
    display(print(f'Sentiment score ended'))
    folder_name = './data/'+ company_name
    os.makedirs(folder_name, exist_ok = True)
    final_file_name = folder_name + '/final_'+company_name+'_tweet_' + str(i) + '.csv'
    company_df.to_csv(final_file_name)
    display(print(f'Final file created : {final_file_name}'))    

Reading filename : ./data/AMZN_tweet_0.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_0.csv


None

Reading filename : ./data/AMZN_tweet_1.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_1.csv


None

Reading filename : ./data/AMZN_tweet_2.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_2.csv


None

Reading filename : ./data/AMZN_tweet_3.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_3.csv


None

Reading filename : ./data/AMZN_tweet_4.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_4.csv


None

Reading filename : ./data/AMZN_tweet_5.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_5.csv


None

Reading filename : ./data/AMZN_tweet_6.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_6.csv


None

Reading filename : ./data/AMZN_tweet_7.csv


None

POS tagging started


None

POS tagging ended


None

Lemma started


None

Lemma ended


None

Sentiment score started


None

Sentiment score ended


None

Final file created : ./data/AMZN/final_AMZN_tweet_7.csv


None

In [21]:
small_dfs = []
file_range = range(len(indexes)-1)

In [22]:
file_range

range(0, 8)

Reading the smaller files created above and combining it to single dataframe

In [24]:
for i in file_range:
    final_file_name = folder_name + '/final_'+company_name+'_tweet_' + str(i) + '.csv'
    print(f'started filename : {final_file_name}')
    df = pd.read_csv(final_file_name , infer_datetime_format=True, parse_dates=True)
    small_dfs.append(df)

started filename : ./data/AMZN/final_AMZN_tweet_0.csv
started filename : ./data/AMZN/final_AMZN_tweet_1.csv
started filename : ./data/AMZN/final_AMZN_tweet_2.csv
started filename : ./data/AMZN/final_AMZN_tweet_3.csv
started filename : ./data/AMZN/final_AMZN_tweet_4.csv
started filename : ./data/AMZN/final_AMZN_tweet_5.csv
started filename : ./data/AMZN/final_AMZN_tweet_6.csv
started filename : ./data/AMZN/final_AMZN_tweet_7.csv


In [25]:
final_company_df = pd.concat(small_dfs, ignore_index=True)

In [26]:
final_company_df.dtypes

tweet_id                   int64
ticker_symbol             object
date                      object
writer                    object
post_date                  int64
body                      object
comment_num                int64
retweet_num                int64
like_num                   int64
cleaned_tweet             object
POS tagged                object
Lemma                     object
Vader Sentiment Score    float64
pos                      float64
neu                      float64
neg                      float64
dtype: object

In [27]:
final_company_df['date'] = pd.to_datetime(final_company_df['date'])
final_company_df.dtypes

tweet_id                          int64
ticker_symbol                    object
date                     datetime64[ns]
writer                           object
post_date                         int64
body                             object
comment_num                       int64
retweet_num                       int64
like_num                          int64
cleaned_tweet                    object
POS tagged                       object
Lemma                            object
Vader Sentiment Score           float64
pos                             float64
neu                             float64
neg                             float64
dtype: object

Grouping the tweets by Date and taking mean as aggregate function

In [28]:
final_company_tw_df = final_company_df[['date','Vader Sentiment Score','pos','neu','neg']]

In [29]:
final_company_tw_df = final_company_tw_df.groupby([final_company_tw_df['date'].dt.date]).mean()

In [30]:
final_company_tw_df.shape

(1827, 4)

In [32]:
col1_name = company_name + '_TWT_VaderSentimentScore'
col2_name = company_name + '_TWT_pos'
col3_name = company_name + '_TWT_neu'
col4_name = company_name + '_TWT_neg'

final_company_tw_df = final_company_tw_df.set_axis(['col1_name', 'col2_name', 'col3_name', 'col4_name'], axis=1, inplace=False)



Writing the final grpuped DataFrame to a final `Twitter_Scores.csv` file for each company

In [36]:
final_company_tw_file_name = './data/'+ company_name + '_Twitter_Scores.csv'


In [37]:
final_company_tw_df.to_csv(final_company_tw_file_name)