In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Imports

In [2]:
from twitterscraper import query_tweets
import datetime as dt 
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt 
import seaborn as sns

from textblob import TextBlob
import re
import string
import unicodedata
import contractions
import inflect

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

import warnings
warnings.filterwarnings('ignore')

from string import punctuation 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import preprocessing

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import urllib

import time

from nltk.sentiment.vader import SentimentIntensityAnalyzer

import project_lib
import multiprocessing
from multiprocessing import Pool
import os

import unicodecsv as csv

INFO: {'User-Agent': 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16'}


# Functions (also available in project_lib.py)
Define the functions that request Twitter data, clean it, and extract sentiment from it.
The same functions are also kept in project_lib.py so that multiprocessing can import them on Windows.

In [None]:
def request_twitter(company, start_date, end_date):
    """Scrape tweets mentioning ``company`` in consecutive windows of up to 7 days.

    The search query excludes retweets and replies. The range
    ``[start_date, end_date)`` is split into windows of at most seven days and
    each window is requested separately with ``query_tweets``.

    Parameters
    ----------
    company : str
        Search term; the retweet/reply filters are appended to it.
    start_date : datetime.date
        First day to request.
    end_date : datetime.date
        Upper bound of the range, passed as ``enddate`` of the last window.

    Returns
    -------
    twts_df : pandas.DataFrame
        Tweets with duplicate ``tweet_id`` rows removed, indexed by and sorted
        on ``tweet_id``. Empty when nothing was retrieved.
    twts_raw : pandas.DataFrame
        Every retrieved row, duplicates included, in retrieval order.
    """
    query = company + ' -filter:retweets -filter:replies'
    delta = dt.timedelta(days=7)

    frames = []
    window_start = start_date
    # Bound the loop by the end_date argument (not a notebook global) and clamp
    # the last window, so short ranges are still fetched and no zero-length
    # window is ever requested.
    while window_start < end_date:
        window_end = min(window_start + delta, end_date)
        tweets = query_tweets(query, limit=None, begindate=window_start,
                              enddate=window_end, lang='en')
        window_df = pd.DataFrame([vars(t) for t in tweets])
        if not window_df.empty:
            frames.append(window_df)
        window_start = window_end

    if not frames:
        # Nothing retrieved: there is no 'tweet_id' column to de-duplicate on.
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    twts_raw = pd.concat(frames)
    twts_df = (
        twts_raw
        .drop_duplicates(subset=['tweet_id'])
        .set_index('tweet_id')
        .sort_index()
    )

    return twts_df, twts_raw

''' twitterscraper retrieves historical Twitter data without using the
    Twitter API, so no premium account is needed. '''

''' begindate is inclusive: the query starts at that date. '''
''' enddate is exclusive: the query ends at (date - 1). '''


def get_text_from_image(url):
    """Download the image at ``url`` and return the text extracted from it.

    The image is saved into a private temporary directory, handed to
    ``pytesseract.image_to_string`` and removed again before returning, so
    repeated calls do not leave downloaded files behind.

    Parameters
    ----------
    url : str
        Address of the image to download.

    Returns
    -------
    str
        Whatever ``pytesseract.image_to_string`` returns for the image.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlretrieve`` when the download fails.
    """
    # Imported here so that the urllib submodules are guaranteed to be loaded;
    # a bare ``import urllib`` does not import them.
    import os
    import tempfile
    import urllib.parse
    import urllib.request

    # Preserve the URL's file extension on the local copy.
    suffix = os.path.splitext(urllib.parse.urlparse(url).path)[1]
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_filename, headers = urllib.request.urlretrieve(
            url, os.path.join(tmp_dir, 'image' + suffix))
        image_text = pytesseract.image_to_string(local_filename)
    print('Text: ' + image_text + ' retrieve from image.')
    return image_text

def retrieve_attachement_data(tweet):
    """Append text extracted from linked images to the tweet.

    Every ``www.`` / ``http(s)://`` link found in ``tweet`` is opened. When the
    response's content type starts with ``image``, the text returned by
    ``get_text_from_image`` is appended to the tweet. Links that cannot be
    opened are reported on stdout and skipped.

    Parameters
    ----------
    tweet : str
        Raw tweet text, possibly containing links.

    Returns
    -------
    str
        The tweet, followed by any text extracted from linked images.
    """
    # Imported here so that the urllib submodules are guaranteed to be loaded;
    # a bare ``import urllib`` does not import them.
    import urllib.error
    import urllib.request

    # finditer scans the original string; appending to ``tweet`` below does not
    # disturb the remaining matches.
    for match in re.finditer(r'((www\.[^\s]+)|(https?://[^\s]+))', tweet):
        url = match.group(0)
        # use try in order to catch errors due to bad links
        try:
            # Only the headers are needed; close the connection right away.
            with urllib.request.urlopen(url) as response:
                content_type = response.info().get_content_type()

            # if the format is the one desired process it else skip it
            if len(content_type) < 5:
                print('When retrieving url ' + url + ' found content type ' + content_type + " that will not be analysed.")

            elif content_type[:5] == 'image':  # match image/gif,image/jpeg,image/png
                tweet = tweet + ' ' + get_text_from_image(url)

        except urllib.error.HTTPError as e:
            print('An HTTPError has been raised, code ' + str(e.code) + '. Passing to the next step.')
        except urllib.error.URLError as e:
            print('An URLError has been raised, reason ' + str(e.reason) + '. Passing to the next step.')
        except ConnectionResetError:
            print('A ConnectionResetError has been raised. Passing to the next step.')
        except ValueError as e:
            # Raised by urlopen for links without a scheme (the "www." form).
            print('A ValueError has been raised for url ' + url + ', reason ' + str(e) + '. Passing to the next step.')

    return tweet

def download_attachement(tweet):
    """Placeholder: nothing is downloaded and the tweet is returned as received."""
    untouched = tweet
    return untouched

def remove_url(tweet):
    """Return the tweet with every ``www.`` / ``http(s)://`` link deleted."""
    link_pattern = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    return link_pattern.sub('', tweet)

def remove_user(tweet):
    """Return the tweet with every ``@mention`` deleted.

    A mention is an ``@`` followed by all characters up to the next
    whitespace, so punctuation glued to the handle is removed with it.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    tweet = re.sub(r'@[^\s]+', '', tweet)
    return tweet

def remove_hashtags(tweet):
    """Drop the leading ``#`` of each hashtag while keeping the tag's text."""
    hashtag_pattern = re.compile(r'#([^\s]+)')
    return hashtag_pattern.sub(r'\1', tweet)

def contract_words(tweet):
    """Return the tweet after passing it through ``contractions.fix``."""
    expanded = contractions.fix(tweet)
    return expanded

def remove_non_ascii(tweet):
    """Strip non-ASCII characters from the tweet.

    The text is NFKD-normalised first, so accented letters keep their base
    letter; anything that still cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', tweet)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_characters(tweet):
    """Replace every character that is not an ASCII letter or digit with a space."""
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    return non_alnum.sub(' ', tweet)

def lowercase(tweet):
    """Return the tweet converted to lower case."""
    return tweet.lower()

def remove_punctuation(tweet):
    """Delete every character that is neither a word character nor whitespace."""
    punctuation_pattern = re.compile(r'[^\w\s]')
    return punctuation_pattern.sub('', tweet)

def replace_numbers(tweet):
    """Spell out the all-digit tokens of the tweet.

    The tweet is split on whitespace; tokens made only of digits are replaced
    by the result of ``inflect``'s ``number_to_words`` and the tokens are
    re-joined with single spaces.
    """
    engine = inflect.engine()
    return ' '.join(
        engine.number_to_words(token) if token.isdigit() else token
        for token in tweet.split()
    )

def remove_stopwords(tweet):
    """Return the tweet without English stopwords.

    The tweet is split on whitespace, words whose lower-cased form is in
    NLTK's English stopword list are dropped, and the remaining words are
    re-joined with single spaces.
    """
    # Load the stopword list once per call (not once per word) and keep it in
    # a set for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    kept_words = [word for word in tweet.split()
                  if word.lower() not in english_stopwords]
    return ' '.join(kept_words)

def normalization(tweet):
    """Lemmatize each whitespace-separated word of the tweet as a verb.

    Words are passed one by one to ``WordNetLemmatizer.lemmatize`` with
    ``pos='v'`` and re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in tweet.split())

def tweet_normalized(tweet):
    """Run the tweet through the complete cleaning pipeline and return the result.

    The steps are applied in a fixed order: links, mentions and hashtag signs
    are removed, contractions expanded, non-ASCII and non-alphanumeric
    characters stripped, the text lower-cased, punctuation removed, numbers
    spelled out, stopwords dropped and the remaining words lemmatized.
    """
    pipeline = (
        remove_url,
        remove_user,
        remove_hashtags,
        contract_words,
        remove_non_ascii,
        remove_characters,
        lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
        normalization,
    )
    for step in pipeline:
        tweet = step(tweet)
    return tweet


def oldmultiprocess_execution(timestamp,img_urls,links,replies,retweets,text,tweet_id,comp):
    """Clean one tweet and score its sentiment with VADER.

    Parameters
    ----------
    timestamp, replies, retweets, tweet_id
        Copied unchanged into the returned record.
    img_urls, links
        When given as strings they are prepended to the tweet text after the
        list brackets/quotes are stripped; any other type is ignored.
    text : str
        Raw tweet text.
    comp
        Not used by this function; kept so the positional call signature
        stays the same.

    Returns
    -------
    dict
        Record with the keys ``tweet_id``, ``timestamp``, ``replies``,
        ``retweets``, ``clean_tweet``, ``Comp``, ``Negative``, ``Neutral`` and
        ``Positive``; an empty dict when processing the tweet failed.
    """
    try:
        # Initialize Sentiment Analyzer
        sid = SentimentIntensityAnalyzer()
        tweet_text = text
        if isinstance(img_urls, str):
            tweet_text = str(img_urls).replace('[\'','').replace('\']',' ') + ' ' + tweet_text
        if isinstance(links, str):
            tweet_text = str(links).replace('[\'','').replace('\']',' ') + ' ' + tweet_text
        #tweet_text = retrieve_attachement_data(tweet_text)
        tweet_text = tweet_normalized(tweet_text)
        polarity = sid.polarity_scores(tweet_text)
        returned_data = {'tweet_id':tweet_id, 'timestamp':timestamp, 'replies':replies, 'retweets':retweets,'clean_tweet':tweet_text,'Comp':polarity['compound'],'Negative':polarity['neg'],'Neutral':polarity['neu'],'Positive':polarity['pos']}
    except Exception as exc:
        # Best effort: skip the row, but report why, and let KeyboardInterrupt /
        # SystemExit propagate so a running job can still be stopped.
        print('An error ocurred, skipping the line ' + str(tweet_id) + ' on timestamp ' + str(timestamp) + ' (' + repr(exc) + ')')
        returned_data = {}
    return returned_data

# Request Twitter Data

Loop over the names in the `companies` list and request their Twitter data. For each name the loop saves two CSV files: one with the raw data, and one that is sorted, de-duplicated, and has the tweet ID in the first column.

In [4]:
start = time.time()
#%%

# REQUEST TWITTER DATA
#
# Request the tweets of every name in `companies` and save them to two CSV
# files per name: one with the raw rows, the other sorted, without duplicates
# and with tweet_id written as the first column.

###############################################################################
###############################################################################
# Data is taken as of BeginDate until (EndDate - 1).
BeginDate = dt.date(2017, 1, 1)
EndDate = dt.date(2020, 4, 1)
# Full list of names used for the project:
# companies = ['Microsoft', 'Apple', 'Mastercard', 'Intel Corp',
#              'Cisco Systems', 'Adobe', 'Nvidia',
#              'Salesforce', 'PayPal', 'Oracle', '#SP500']
companies = ['Paypal']
###############################################################################
###############################################################################

start_year = BeginDate.strftime('%Y%m%d')
end_year = EndDate.strftime('%Y%m%d')

for company in companies:
    temp_df, temp_raw = request_twitter(company, BeginDate, EndDate)

    filename_df = f'{company}_{start_year}_{end_year}.csv'
    filename_raw = f'{company}_raw_{start_year}_{end_year}.csv'

    # Replace embedded newlines with spaces before writing the CSV files.
    temp_df = temp_df.replace(r'\n', ' ', regex=True)
    temp_raw = temp_raw.replace(r'\n', ' ', regex=True)

    temp_df.to_csv(filename_df, sep="|", header=True, index=True)
    temp_raw.to_csv(filename_raw, sep="|", header=True, index=False)

elapsed = time.time() - start
print(f"Elapsed time: {elapsed}")

INFO: queries: ['Paypal -filter:retweets -filter:replies since:2017-01-01 until:2017-01-02', 'Paypal -filter:retweets -filter:replies since:2017-01-02 until:2017-01-03', 'Paypal -filter:retweets -filter:replies since:2017-01-03 until:2017-01-04', 'Paypal -filter:retweets -filter:replies since:2017-01-04 until:2017-01-05', 'Paypal -filter:retweets -filter:replies since:2017-01-05 until:2017-01-06', 'Paypal -filter:retweets -filter:replies since:2017-01-06 until:2017-01-07', 'Paypal -filter:retweets -filter:replies since:2017-01-07 until:2017-01-08']
INFO: Got 2108 tweets (2108 new).
INFO: Got 4555 tweets (2447 new).
INFO: Got 6917 tweets (2362 new).
INFO: Got 9260 tweets (2343 new).
INFO: Got 11545 tweets (2285 new).
INFO: Got 13910 tweets (2365 new).
INFO: Got 16248 tweets (2338 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2017-01-08 until:2017-01-09', 'Paypal -filter:retweets -filter:replies since:2017-01-09 until:2017-01-10', 'Paypal -filter:retweets -filter:r

INFO: Got 2628 tweets (2628 new).
INFO: Got 5231 tweets (2603 new).
INFO: Got 8317 tweets (3086 new).
INFO: Got 11315 tweets (2998 new).
INFO: Got 14418 tweets (3103 new).
INFO: Got 17471 tweets (3053 new).
INFO: Got 20474 tweets (3003 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2017-03-19 until:2017-03-20', 'Paypal -filter:retweets -filter:replies since:2017-03-20 until:2017-03-21', 'Paypal -filter:retweets -filter:replies since:2017-03-21 until:2017-03-22', 'Paypal -filter:retweets -filter:replies since:2017-03-22 until:2017-03-23', 'Paypal -filter:retweets -filter:replies since:2017-03-23 until:2017-03-24', 'Paypal -filter:retweets -filter:replies since:2017-03-24 until:2017-03-25', 'Paypal -filter:retweets -filter:replies since:2017-03-25 until:2017-03-26']
INFO: Got 2487 tweets (2487 new).
INFO: Got 4984 tweets (2497 new).
INFO: Got 7856 tweets (2872 new).
INFO: Got 10524 tweets (2668 new).
INFO: Got 13359 tweets (2835 new).
INFO: Got 15832 tweets (2473 ne

INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 60 tweets (20 new).
INFO: Got 80 tweets (20 new).
INFO: Got 100 tweets (20 new).
INFO: Got 120 tweets (20 new).
INFO: Got 140 tweets (20 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2017-06-04 until:2017-06-05', 'Paypal -filter:retweets -filter:replies since:2017-06-05 until:2017-06-06', 'Paypal -filter:retweets -filter:replies since:2017-06-06 until:2017-06-07', 'Paypal -filter:retweets -filter:replies since:2017-06-07 until:2017-06-08', 'Paypal -filter:retweets -filter:replies since:2017-06-08 until:2017-06-09', 'Paypal -filter:retweets -filter:replies since:2017-06-09 until:2017-06-10', 'Paypal -filter:retweets -filter:replies since:2017-06-10 until:2017-06-11']
INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 60 tweets (20 new).
INFO: Got 80 tweets (20 new).
INFO: Got 99 tweets (19 new).
INFO: Got 119 tweets (20 new).
INFO: Got 139 tweets (20 new).
INFO: queries: ['Pay

INFO: Got 20 tweets (20 new).
INFO: Got 37 tweets (17 new).
INFO: Got 57 tweets (20 new).
INFO: Got 77 tweets (20 new).
INFO: Got 96 tweets (19 new).
INFO: Got 116 tweets (20 new).
INFO: Got 136 tweets (20 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2017-08-20 until:2017-08-21', 'Paypal -filter:retweets -filter:replies since:2017-08-21 until:2017-08-22', 'Paypal -filter:retweets -filter:replies since:2017-08-22 until:2017-08-23', 'Paypal -filter:retweets -filter:replies since:2017-08-23 until:2017-08-24', 'Paypal -filter:retweets -filter:replies since:2017-08-24 until:2017-08-25', 'Paypal -filter:retweets -filter:replies since:2017-08-25 until:2017-08-26', 'Paypal -filter:retweets -filter:replies since:2017-08-26 until:2017-08-27']
INFO: Got 19 tweets (19 new).
INFO: Got 39 tweets (20 new).
INFO: Got 59 tweets (20 new).
INFO: Got 79 tweets (20 new).
INFO: Got 98 tweets (19 new).
INFO: Got 118 tweets (20 new).
INFO: Got 138 tweets (20 new).
INFO: queries: ['Payp

INFO: Got 2273 tweets (2273 new).
INFO: Got 4859 tweets (2586 new).
INFO: Got 7495 tweets (2636 new).
INFO: Got 10055 tweets (2560 new).
INFO: Got 12577 tweets (2522 new).
INFO: Got 15122 tweets (2545 new).
INFO: Got 17550 tweets (2428 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2017-11-05 until:2017-11-06', 'Paypal -filter:retweets -filter:replies since:2017-11-06 until:2017-11-07', 'Paypal -filter:retweets -filter:replies since:2017-11-07 until:2017-11-08', 'Paypal -filter:retweets -filter:replies since:2017-11-08 until:2017-11-09', 'Paypal -filter:retweets -filter:replies since:2017-11-09 until:2017-11-10', 'Paypal -filter:retweets -filter:replies since:2017-11-10 until:2017-11-11', 'Paypal -filter:retweets -filter:replies since:2017-11-11 until:2017-11-12']
INFO: Got 1606 tweets (1606 new).
INFO: Got 3074 tweets (1468 new).
INFO: Got 4564 tweets (1490 new).
INFO: Got 6161 tweets (1597 new).
INFO: Got 7813 tweets (1652 new).
INFO: Got 9268 tweets (1455 new).

INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 59 tweets (19 new).
INFO: Got 79 tweets (20 new).
INFO: Got 96 tweets (17 new).
INFO: Got 116 tweets (20 new).
INFO: Got 136 tweets (20 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2018-01-21 until:2018-01-22', 'Paypal -filter:retweets -filter:replies since:2018-01-22 until:2018-01-23', 'Paypal -filter:retweets -filter:replies since:2018-01-23 until:2018-01-24', 'Paypal -filter:retweets -filter:replies since:2018-01-24 until:2018-01-25', 'Paypal -filter:retweets -filter:replies since:2018-01-25 until:2018-01-26', 'Paypal -filter:retweets -filter:replies since:2018-01-26 until:2018-01-27', 'Paypal -filter:retweets -filter:replies since:2018-01-27 until:2018-01-28']
INFO: Got 18 tweets (18 new).
INFO: Got 38 tweets (20 new).
INFO: Got 54 tweets (16 new).
INFO: Got 74 tweets (20 new).
INFO: Got 94 tweets (20 new).
INFO: Got 114 tweets (20 new).
INFO: Got 134 tweets (20 new).
INFO: queries: ['Payp

INFO: Got 2106 tweets (2106 new).
INFO: Got 4307 tweets (2201 new).
INFO: Got 6100 tweets (1793 new).
INFO: Got 7913 tweets (1813 new).
INFO: Got 10136 tweets (2223 new).
INFO: Got 12134 tweets (1998 new).
INFO: Got 14213 tweets (2079 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2018-04-08 until:2018-04-09', 'Paypal -filter:retweets -filter:replies since:2018-04-09 until:2018-04-10', 'Paypal -filter:retweets -filter:replies since:2018-04-10 until:2018-04-11', 'Paypal -filter:retweets -filter:replies since:2018-04-11 until:2018-04-12', 'Paypal -filter:retweets -filter:replies since:2018-04-12 until:2018-04-13', 'Paypal -filter:retweets -filter:replies since:2018-04-13 until:2018-04-14', 'Paypal -filter:retweets -filter:replies since:2018-04-14 until:2018-04-15']
INFO: Got 20 tweets (20 new).
INFO: Got 37 tweets (17 new).
INFO: Got 56 tweets (19 new).
INFO: Got 76 tweets (20 new).
INFO: Got 95 tweets (19 new).
INFO: Got 115 tweets (20 new).
INFO: Got 133 tweets (1

INFO: Got 628 tweets (628 new).
INFO: Got 1162 tweets (534 new).
INFO: Got 1732 tweets (570 new).
INFO: Got 2318 tweets (586 new).
INFO: Got 2943 tweets (625 new).
INFO: Got 3535 tweets (592 new).
INFO: Got 4120 tweets (585 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2018-06-24 until:2018-06-25', 'Paypal -filter:retweets -filter:replies since:2018-06-25 until:2018-06-26', 'Paypal -filter:retweets -filter:replies since:2018-06-26 until:2018-06-27', 'Paypal -filter:retweets -filter:replies since:2018-06-27 until:2018-06-28', 'Paypal -filter:retweets -filter:replies since:2018-06-28 until:2018-06-29', 'Paypal -filter:retweets -filter:replies since:2018-06-29 until:2018-06-30', 'Paypal -filter:retweets -filter:replies since:2018-06-30 until:2018-07-01']
INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 58 tweets (18 new).
INFO: Got 78 tweets (20 new).
INFO: Got 97 tweets (19 new).
INFO: Got 116 tweets (19 new).
INFO: Got 136 tweets (20 new).
INF

INFO: Got 20 tweets (20 new).
INFO: Got 39 tweets (19 new).
INFO: Got 59 tweets (20 new).
INFO: Got 79 tweets (20 new).
INFO: Got 99 tweets (20 new).
INFO: Got 118 tweets (19 new).
INFO: Got 138 tweets (20 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2018-09-09 until:2018-09-10', 'Paypal -filter:retweets -filter:replies since:2018-09-10 until:2018-09-11', 'Paypal -filter:retweets -filter:replies since:2018-09-11 until:2018-09-12', 'Paypal -filter:retweets -filter:replies since:2018-09-12 until:2018-09-13', 'Paypal -filter:retweets -filter:replies since:2018-09-13 until:2018-09-14', 'Paypal -filter:retweets -filter:replies since:2018-09-14 until:2018-09-15', 'Paypal -filter:retweets -filter:replies since:2018-09-15 until:2018-09-16']
INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 59 tweets (19 new).
INFO: Got 76 tweets (17 new).
INFO: Got 96 tweets (20 new).
INFO: Got 115 tweets (19 new).
INFO: Got 135 tweets (20 new).
INFO: queries: ['Payp

INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 60 tweets (20 new).
INFO: Got 80 tweets (20 new).
INFO: Got 99 tweets (19 new).
INFO: Got 119 tweets (20 new).
INFO: Got 139 tweets (20 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2018-11-25 until:2018-11-26', 'Paypal -filter:retweets -filter:replies since:2018-11-26 until:2018-11-27', 'Paypal -filter:retweets -filter:replies since:2018-11-27 until:2018-11-28', 'Paypal -filter:retweets -filter:replies since:2018-11-28 until:2018-11-29', 'Paypal -filter:retweets -filter:replies since:2018-11-29 until:2018-11-30', 'Paypal -filter:retweets -filter:replies since:2018-11-30 until:2018-12-01', 'Paypal -filter:retweets -filter:replies since:2018-12-01 until:2018-12-02']
INFO: Got 20 tweets (20 new).
INFO: Got 40 tweets (20 new).
INFO: Got 59 tweets (19 new).
INFO: Got 69 tweets (10 new).
INFO: Got 88 tweets (19 new).
INFO: Got 108 tweets (20 new).
INFO: Got 168 tweets (60 new).
INFO: queries: ['Payp

INFO: Got 18 tweets (18 new).
INFO: Got 37 tweets (19 new).
INFO: Got 54 tweets (17 new).
INFO: Got 74 tweets (20 new).
INFO: Got 94 tweets (20 new).
INFO: Got 112 tweets (18 new).
INFO: Got 132 tweets (20 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2019-02-10 until:2019-02-11', 'Paypal -filter:retweets -filter:replies since:2019-02-11 until:2019-02-12', 'Paypal -filter:retweets -filter:replies since:2019-02-12 until:2019-02-13', 'Paypal -filter:retweets -filter:replies since:2019-02-13 until:2019-02-14', 'Paypal -filter:retweets -filter:replies since:2019-02-14 until:2019-02-15', 'Paypal -filter:retweets -filter:replies since:2019-02-15 until:2019-02-16', 'Paypal -filter:retweets -filter:replies since:2019-02-16 until:2019-02-17']
INFO: Got 11 tweets (11 new).
INFO: Got 31 tweets (20 new).
INFO: Got 47 tweets (16 new).
INFO: Got 67 tweets (20 new).
INFO: Got 87 tweets (20 new).
INFO: Got 107 tweets (20 new).
INFO: Got 126 tweets (19 new).
INFO: queries: ['Payp

INFO: Got 1778 tweets (1778 new).
INFO: Got 3327 tweets (1549 new).
INFO: Got 5120 tweets (1793 new).
INFO: Got 6731 tweets (1611 new).
INFO: Got 8275 tweets (1544 new).
INFO: Got 10104 tweets (1829 new).
INFO: Got 11657 tweets (1553 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2019-04-28 until:2019-04-29', 'Paypal -filter:retweets -filter:replies since:2019-04-29 until:2019-04-30', 'Paypal -filter:retweets -filter:replies since:2019-04-30 until:2019-05-01', 'Paypal -filter:retweets -filter:replies since:2019-05-01 until:2019-05-02', 'Paypal -filter:retweets -filter:replies since:2019-05-02 until:2019-05-03', 'Paypal -filter:retweets -filter:replies since:2019-05-03 until:2019-05-04', 'Paypal -filter:retweets -filter:replies since:2019-05-04 until:2019-05-05']
INFO: Got 19 tweets (19 new).
INFO: Got 35 tweets (16 new).
INFO: Got 49 tweets (14 new).
INFO: Got 67 tweets (18 new).
INFO: Got 99 tweets (32 new).
INFO: Got 139 tweets (40 new).
INFO: Got 167 tweets (28

INFO: Got 2084 tweets (2084 new).
INFO: Got 3925 tweets (1841 new).
INFO: Got 6282 tweets (2357 new).
INFO: Got 8634 tweets (2352 new).
INFO: Got 11107 tweets (2473 new).
INFO: Got 13197 tweets (2090 new).
INFO: Got 15676 tweets (2479 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2019-07-14 until:2019-07-15', 'Paypal -filter:retweets -filter:replies since:2019-07-15 until:2019-07-16', 'Paypal -filter:retweets -filter:replies since:2019-07-16 until:2019-07-17', 'Paypal -filter:retweets -filter:replies since:2019-07-17 until:2019-07-18', 'Paypal -filter:retweets -filter:replies since:2019-07-18 until:2019-07-19', 'Paypal -filter:retweets -filter:replies since:2019-07-19 until:2019-07-20', 'Paypal -filter:retweets -filter:replies since:2019-07-20 until:2019-07-21']
INFO: Got 51 tweets (51 new).
INFO: Got 135 tweets (84 new).
INFO: Got 239 tweets (104 new).
INFO: Got 341 tweets (102 new).
INFO: Got 447 tweets (106 new).
INFO: Got 549 tweets (102 new).
INFO: Got 658 t

INFO: Got 16 tweets (16 new).
INFO: Got 32 tweets (16 new).
INFO: Got 52 tweets (20 new).
INFO: Got 70 tweets (18 new).
INFO: Got 86 tweets (16 new).
INFO: Got 102 tweets (16 new).
INFO: Got 117 tweets (15 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2019-09-29 until:2019-09-30', 'Paypal -filter:retweets -filter:replies since:2019-09-30 until:2019-10-01', 'Paypal -filter:retweets -filter:replies since:2019-10-01 until:2019-10-02', 'Paypal -filter:retweets -filter:replies since:2019-10-02 until:2019-10-03', 'Paypal -filter:retweets -filter:replies since:2019-10-03 until:2019-10-04', 'Paypal -filter:retweets -filter:replies since:2019-10-04 until:2019-10-05', 'Paypal -filter:retweets -filter:replies since:2019-10-05 until:2019-10-06']
INFO: Got 18 tweets (18 new).
INFO: Got 37 tweets (19 new).
INFO: Got 51 tweets (14 new).
INFO: Got 71 tweets (20 new).
INFO: Got 91 tweets (20 new).
INFO: Got 110 tweets (19 new).
INFO: Got 160 tweets (50 new).
INFO: queries: ['Payp

INFO: Got 1319 tweets (1319 new).
INFO: Got 2640 tweets (1321 new).
INFO: Got 4115 tweets (1475 new).
INFO: Got 5480 tweets (1365 new).
INFO: Got 6922 tweets (1442 new).
INFO: Got 8329 tweets (1407 new).
INFO: Got 9918 tweets (1589 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2019-12-15 until:2019-12-16', 'Paypal -filter:retweets -filter:replies since:2019-12-16 until:2019-12-17', 'Paypal -filter:retweets -filter:replies since:2019-12-17 until:2019-12-18', 'Paypal -filter:retweets -filter:replies since:2019-12-18 until:2019-12-19', 'Paypal -filter:retweets -filter:replies since:2019-12-19 until:2019-12-20', 'Paypal -filter:retweets -filter:replies since:2019-12-20 until:2019-12-21', 'Paypal -filter:retweets -filter:replies since:2019-12-21 until:2019-12-22']
INFO: Got 19 tweets (19 new).
INFO: Got 38 tweets (19 new).
INFO: Got 53 tweets (15 new).
INFO: Got 72 tweets (19 new).
INFO: Got 112 tweets (40 new).
INFO: Got 148 tweets (36 new).
INFO: Got 188 tweets (40 

INFO: Got 10 tweets (10 new).
INFO: Got 29 tweets (19 new).
INFO: Got 45 tweets (16 new).
INFO: Got 63 tweets (18 new).
INFO: Got 83 tweets (20 new).
INFO: Got 100 tweets (17 new).
INFO: Got 119 tweets (19 new).
INFO: queries: ['Paypal -filter:retweets -filter:replies since:2020-03-01 until:2020-03-02', 'Paypal -filter:retweets -filter:replies since:2020-03-02 until:2020-03-03', 'Paypal -filter:retweets -filter:replies since:2020-03-03 until:2020-03-04', 'Paypal -filter:retweets -filter:replies since:2020-03-04 until:2020-03-05', 'Paypal -filter:retweets -filter:replies since:2020-03-05 until:2020-03-06', 'Paypal -filter:retweets -filter:replies since:2020-03-06 until:2020-03-07', 'Paypal -filter:retweets -filter:replies since:2020-03-07 until:2020-03-08']
INFO: Got 19 tweets (19 new).
INFO: Got 36 tweets (17 new).
INFO: Got 54 tweets (18 new).
INFO: Got 74 tweets (20 new).
INFO: Got 94 tweets (20 new).
INFO: Got 114 tweets (20 new).
INFO: Got 133 tweets (19 new).
INFO: queries: ['Payp

Elapsed time: 17091.44544315338


# Clean Twitter Data

- REMOVE SPECIAL CHARACTERS
- REMOVE URL
- REMOVE STOPWORDS
- NORMALIZE WORDS: lower case, verbs, numbers to letter

In [34]:
if __name__ == '__main__':
    start = time.time()
    print(time.strftime("%I:%M:%S"))
    multiprocessing.freeze_support()

    companies = ['Nvidia']
    timeframe = '_20170101_20200401' #_20170101_20200401
    # Columns read from the scraped CSV; their order is the positional
    # argument order used by starmap below (with 'comp' appended last).
    fields = ['timestamp','img_urls','links','replies','retweets','text','tweet_id']
    types = {'timestamp':'str','img_urls':'str','links':'str','replies':'str','retweets':'str','text':'str','tweet_id': 'str'}

    for comp in companies:
        # tweets: the scraped rows of this company, restricted to `fields`
        tweets = pd.read_csv(comp + timeframe + '.csv', sep = "|", header=0,usecols=fields,skip_blank_lines=True, dtype=types) #, index_col='tweet_id'
        # Re-select so the column order matches `fields`; copy so the new
        # column below is added to an independent frame.
        tweets = tweets[fields].copy()

        if tweets.size == 0:
            # Without this guard `results` would be undefined, or still hold
            # the previous company's rows, when the input file has no tweets.
            print('No tweets found in ' + comp + timeframe + '.csv, skipping ' + comp)
            continue

        tweets['comp'] = comp
        args = tweets.to_records(index=False)
        # multiprocessing for the normalisation and analysis
        # os.cpu_count() may return None; fall back to a single CPU then.
        pool_nb = min(5 * (os.cpu_count() or 1), len(args))
        print('Using a pool of ' + str(pool_nb) + ' Processes')
        with multiprocessing.Pool(processes=pool_nb) as pool:
            results = pool.starmap(project_lib.multiprocess_execution, args.tolist())
        print('Multiprocessing execution terminated')

        # Drop empty records so they do not become an all-NaN row.
        # NOTE(review): assumes project_lib.multiprocess_execution returns {}
        # for a tweet it could not process, like oldmultiprocess_execution in
        # this notebook does - confirm.
        results = [record for record in results if record]

        # Convert to dataframe and remove duplicates
        data_cleaned_with_sent = pd.DataFrame.from_records(results)
        data_cleaned_with_sent = data_cleaned_with_sent.drop_duplicates()
        # save clean data to csv
        print('Saving data to csv file ' + comp + '_Clean' + timeframe + '.csv')
        data_cleaned_with_sent.to_csv(comp + '_Clean' + timeframe + '.csv',sep='|', header=True, index=True)

    elapsed = time.time() - start
    print(f"Elapsed time: {elapsed}")

08:13:09
Using a pool of 60 Processes
Multiprocessing execution terminated
Saving data to csv file Nvidia_Clean_20170101_20200401.csv
Elapsed time: 1767.8058140277863


# Create files with aggregated sentiments

In [3]:
# NOTE(review): the scraping cell of this notebook builds its file names from
# 'Paypal'; on a case-sensitive filesystem 'PayPal' below will not match such
# files - confirm the intended spelling.
companies = ['Microsoft', 'Apple', 'Mastercard','Intel Corp',
             'Cisco Systems', 'Adobe', 'Nvidia',
             'Salesforce', 'PayPal', 'Oracle']
timeframe = '_20170101_20200401' #_20170101_20200401

# Columns read from each cleaned file, their dtypes, and the columns written
# to the daily file; identical for every company, so defined once.
fields = ['timestamp','replies','retweets','Comp','Positive','Neutral','Negative']
types = {'timestamp':'object','replies':'float64','retweets':'float64','Comp':'float64','Positive': 'float64','Neutral': 'float64','Negative': 'float64'}
headers_sentiments = ['Comp','Positive','Negative']#,'CompWithRepRet'

for comp in companies:

    sentiments = pd.read_csv(comp + '_Clean' + timeframe + '.csv', sep = "|", header=0,usecols=fields,skip_blank_lines=True,dtype=types)#,dtype=types

    # Calendar day of each tweet, used as the grouping key.
    sentiments['Date']=pd.to_datetime(sentiments.timestamp).dt.date

    # Daily mean of every numeric column. numeric_only=True keeps the string
    # 'timestamp' column out of the aggregation, which otherwise raises a
    # TypeError on recent pandas versions.
    sentiment_full = sentiments.groupby('Date').mean(numeric_only=True)

    sentiment_full.to_csv(comp + '_daily_sentiment_tweets.csv',sep='|', header=True, index=True, columns=headers_sentiments)
