### Import Libraries

In [7]:
# Import twitter dependencies
import tweepy
from config import *

In [8]:
# Import transformation dependencies
import pandas as pd
from datetime import datetime, date
import json
import re 

In [133]:
# For sentiment and subjectivity analysis
from textblob import TextBlob
import nltk


In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the Punkt tokenizer models (presumably needed by TextBlob's tokenization).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

In [10]:
# For stats 
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
from scipy import stats

# Tweepy Setup

In [11]:
# Tweepy Setup
# The four credential names below are not defined in this notebook; presumably
# they are supplied by `from config import *` — TODO confirm.
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
# Client object used for the search request further down
api = tweepy.API(auth)

In [169]:
# Tweepy search parameters
# One query string: vaccine-related keywords joined with OR, ending with the
# -filter:retweets operator. It is wrapped in a single-element list and passed as `q`.
search_terms = ['covid+vaccine OR covid-19+vaccine OR corona+virus+vaccine OR coronavirus+vaccine OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca -filter:retweets']
# NOTE(review): end_date is not referenced by any cell visible in this notebook
end_date = date(2021,3,11)
# Passed as `max_id` to the search cursor; presumably the newest tweet id to
# return, so each run pages further back in time — TODO confirm
max_id = '1370060291237576713'
# Maximum number of tweets pulled per run
limit = 1400 # this is my 15 minute limit :'( 

# Save files
# Output paths for this run: parsed tweets (CSV) and raw API payloads (JSON)
output_csv ='csvs/tw118.csv'
output_json ='json/raw118.json'

# Functions

Collect and save data

In [170]:
# Pulls up to `limit` tweets from the search API and fills 2 lists:
    # data: one dict per tweet, holding only the selected fields
    # raw: the untouched tweepy status objects

data = []
raw = []
# tweet_mode='extended' is what makes `full_text` available on each status below
tweets = tweepy.Cursor(api.search, max_id = max_id, lang ='en', q=search_terms, tweet_mode='extended').items(limit)
for t in tweets: 
    # Flatten the fields of interest (tweet-level and author-level) into one record
    tweet ={
        'created' :t.created_at,
        'id_str' : t.id_str,
        'text' : t.full_text,
        'user_id' :t.user.id,
        'screen_name': t.user.screen_name,
        'location': t.user.location,
        'followers_count': t.user.followers_count,
        'user_favourites_count': t.user.favourites_count,
        'time_zone' :t.user.time_zone,
        'geo_enabled' :t.user.geo_enabled,
        'verified' :t.user.verified,
        'status_count': t.user.statuses_count,
        'geo':t.geo,
        'coords':t.coordinates,
        'retweet_count': t.retweet_count,
        'tweet_favourite_count' : t.favorite_count
    }
    # add the parsed record to one list 
    data.append(tweet)
    # keep raw data in case I realize later that I've removed something important 
    raw.append(t)
# Progress marker: printed once the cursor is exhausted or `limit` is reached
print('done!')

done!


In [171]:
# Load the parsed tweet records into a DataFrame (it is written to CSV in a later cell)
tweets_df = pd.DataFrame(data)

# Save the raw API payloads as ONE JSON array so the file is valid JSON and can
# be read back with json.load(). Writing each object back-to-back with no
# separator produces a file that a standard JSON parser rejects.
with open(output_json, 'w', encoding='utf-8') as file:
    json.dump([resp._json for resp in raw], file, sort_keys=True, indent=3)
tweets_df.tail(2)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count
1398,2021-03-11 17:09:22,1370059284579414016,"Denmark, Norway, Iceland Suspend Use Of AstraZ...",326632316,jollymampilly,"India, Kerala.",1519,5137,,True,False,47539,,,0,0
1399,2021-03-11 17:09:22,1370059284038422531,"RT @solmamakwa: On March 1, 2021, I was invite...",23807006,AlexCallahan,"Toronto, ON",590,135,,False,False,7065,,,238,0


### Clean tweets

In [172]:
# Remove repeated tweets and rows missing an id or screen name.
# Both calls return a new frame, so the result is assigned back to tweets_df
# (likely why wrapping them in a function without returning the frame did not work).
tweets_df = (
    tweets_df
    .drop_duplicates(subset=['id_str'])
    .dropna(subset=['id_str', 'screen_name'])
)

In [173]:
# Makes tweets more readable
def CleanTweets(txt):
    """Strip Twitter markup from a tweet so the text reads as plain prose.

    Removes '@' and '#' symbols (keeping the word they prefix), the leading
    retweet marker "RT", and http(s) links, and turns line breaks into spaces.

    Args:
        txt (str): Raw tweet text.

    Returns:
        str: The cleaned text. Surrounding whitespace is not trimmed.
    """
    # Usernames are kept and only the '@' is dropped: handles are sometimes
    # vaccine names, which the keyword filter relies on.
    txt = txt.replace('@', '')

    # Remove the retweet marker. The \b anchor stops the pattern from eating the
    # end of ordinary words that finish in "RT" (e.g. "SMART people").
    txt = re.sub(r'\bRT\s+', '', txt)

    # Remove '#' but leave the hashtag text
    txt = txt.replace('#', '')

    # Remove hyperlinks
    txt = re.sub(r'https?://\S+', '', txt)

    # Replace line breaks with a space so the words on either side are not fused
    txt = txt.replace('\n', ' ')
    return txt

In [174]:
# Run every tweet's text through CleanTweets and store the result back in the same column
cleaned_text = tweets_df['text'].apply(CleanTweets)
tweets_df['text'] = cleaned_text

In [175]:
# Subjectivity score for a piece of text
def RateSubjectivity(txt):
    """Return the subjectivity component of TextBlob's sentiment for ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity

In [176]:
# Polarity score for a piece of text
def RatePolarity(txt):
    """Return the polarity component of TextBlob's sentiment for ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity

In [177]:
# Translate a polarity score into a sentiment label
def GetSentiment(num):
    """Label a polarity score.

    Returns 'negative' when ``num`` is below zero, 'neutral' when it equals
    zero, and 'positive' for every other value.
    """
    if num < 0:
        return 'negative'
    if num == 0:
        return 'neutral'
    return 'positive'

In [178]:
# Add the scoring columns
def RateTweets(df):
    """Add 'Subjectivity', 'Polarity' and 'Sentiment' columns to ``df``.

    The first two are computed from the 'text' column; 'Sentiment' is the
    label derived from 'Polarity'. The frame is modified in place and also
    returned.
    """
    # (new column, source column, scoring function); order matters because
    # 'Sentiment' reads the 'Polarity' column created just before it.
    column_plan = (
        ('Subjectivity', 'text', RateSubjectivity),
        ('Polarity', 'text', RatePolarity),
        ('Sentiment', 'Polarity', GetSentiment),
    )
    for target, source, scorer in column_plan:
        df[target] = df[source].apply(scorer)
    return df

In [179]:
# Score every tweet (adds the Subjectivity, Polarity and Sentiment columns),
# then save this run's parsed tweets to the CSV path set in the parameters cell.
tweets_df = RateTweets(tweets_df)
# index=False keeps the DataFrame index out of the CSV
tweets_df.to_csv(output_csv, index=False, encoding='UTF-8')
tweets_df.tail(3)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
1397,2021-03-11 17:09:22,1370059284684357633,Shawn_SoPeachy: Pfizer not innocent out here e...,34059643,AirJhourdi,IG:airjhourdi,1765,303,,True,False,215140,,,5,0,0.7,-0.25,negative
1398,2021-03-11 17:09:22,1370059284579414016,"Denmark, Norway, Iceland Suspend Use Of AstraZ...",326632316,jollymampilly,"India, Kerala.",1519,5137,,True,False,47539,,,0,0,0.0,0.0,neutral
1399,2021-03-11 17:09:22,1370059284038422531,"solmamakwa: On March 1, 2021, I was invited by...",23807006,AlexCallahan,"Toronto, ON",590,135,,False,False,7065,,,238,0,0.166667,0.125,positive


In [180]:
from os import listdir

In [181]:
# Combine every per-run CSV in csvs/ into a single frame.
# Only *.csv entries are read: listdir() also returns entries such as .DS_Store
# or .ipynb_checkpoints, which would make pd.read_csv fail. listdir() order is
# arbitrary, so the names are sorted to make the load order (and therefore which
# copy of a duplicated tweet is kept) reproducible.
filepaths = ['csvs/' + f for f in sorted(listdir("csvs")) if f.endswith('.csv')]
df = pd.concat(map(pd.read_csv, filepaths))
# Keep the first copy of each tweet id across all files
df = df.drop_duplicates(subset=['id_str'])
len(df)

139727

In [185]:
# Renumber the rows after de-duplication and discard the old index values
df = df.reset_index().drop(columns=['index'])
df.tail(10)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
139717,2021-03-15 00:03:01,1371250548201062410,B52Malmet: A year ago we were clapping every n...,3306650467,Maier12345,"Sunnyvale, CA",1189,187723,,False,False,279636,,,764,0,0.3,0.2,positive
139718,2021-03-15 00:03:01,1371250547454533632,Jessicam6946: Everyone getting their vaccines....,1347611295915388933,meredit71840781,Iowa,168,11798,,False,False,12500,,,335,0,0.275,0.175,positive
139719,2021-03-15 00:03:01,1371250545130930185,newsmax: Ric Grenell: Biden's only 'reordering...,981234081328521216,Mike86961326,Western N.Y.,328,23357,,True,False,36557,,,1707,0,0.75,-0.1,negative
139720,2021-03-15 00:03:00,1371250543616798723,"bluestein: On Monday, the big day arrives for ...",258967043,biscuitkitten,Atlanta,6699,39802,,True,False,55538,,,24,0,0.3,0.25,positive
139721,2021-03-15 00:03:00,1371250542219915267,Dr Swamy39 ji\n\nNetherlands halts use of Astr...,48274874,i_amdinesh,India,1826,475,,True,False,2507,,,0,0,0.0,0.0,neutral
139722,2021-03-15 00:03:00,1371250540449988610,f0lake: No i dont think u understand i need to...,1078018972975476736,jenflowerr,"California, USA",20,2299,,False,False,1808,,,142,0,0.0,0.0,neutral
139723,2021-03-15 00:02:59,1371250536633208834,PeterHotez: It’s why we might eventually move ...,828324944522141696,karenh7463,"Indiana, USA",641,188950,,True,False,53018,,,495,0,0.5,0.25,positive
139724,2021-03-15 00:02:58,1371250532568956935,web_rant: mikeallen axios kadiagoba Gosh Mike ...,831767249723731981,randypilsr,"Arkansas, USA",456,22502,,True,False,12150,,,2,0,0.1,0.0,neutral
139725,2021-03-15 00:02:57,1371250531516223488,Reuters: AstraZeneca finds no evidence of incr...,290224215,cadenjames1,Netherlands,141,172,,True,False,11705,,,124,0,0.0,0.0,neutral
139726,2021-03-15 00:02:57,1371250530123718660,Found my old WHO vaccine cert booklet from whe...,92157908,Juliagoolia1982,Anywhere but here,761,16382,,True,False,13107,,,0,0,0.2,0.1,positive


# SQLAlchemy
### Connecting to a certain other host 

In [165]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [166]:
# Create the SQLAlchemy engine for the database.
# `conn` is not defined in this notebook; presumably it is a connection URL
# supplied by `from config import *` — TODO confirm.
engine = create_engine(conn, echo=False)

In [167]:
# Build mapped classes from the tables that already exist in the database
Base = automap_base()
# NOTE(review): `reflect=True` is deprecated as of SQLAlchemy 1.4 in favour of
# `Base.prepare(autoload_with=engine)` — confirm the installed version before changing.
Base.prepare(engine, reflect=True)  

In [168]:
# Test connection: list the names of the classes automap mapped from the database.
# This has worked in the past but was later noted as no longer working.
# If `conn` is wrong, kill the kernel.

Base.classes.keys()

['tw2']

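### Don't run yet
Clean up your tweets first! The next cell appends `df` to the database table, so run it only once the data is deduplicated and final.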

In [186]:
# Send data frame to database 
# last in 118
# NOTE(review): if_exists='append' adds rows to an existing 'twt100' table, so
# re-running this cell looks like it would insert the same rows again — confirm
# before re-running. index=True also writes the DataFrame index as a column.
df.to_sql(name='twt100', con=engine, if_exists='append', index=True)
print('added!')

added!


In [None]:
# session things in case I ever need to open one
# Opens an ORM session on the engine and closes it straight away; no queries
# are issued in between.
session = Session(engine)
session.close()


In [344]:
import schedule
import time

In [None]:
def EveryFifteen():
    """Placeholder job for the 15-minute schedule; it only prints a marker."""
    print('time')

# Register the job. `schedule.every(15).minutes` on its own only describes an
# interval; `.do(...)` is what attaches the function to it.
# Registered jobs fire only when `schedule.run_pending()` is called, typically in
# a loop with `time.sleep()`. That loop is not started here, so the notebook
# does not block.
schedule.every(15).minutes.do(EveryFifteen)


### Stopwords 

In [163]:
def Tokenize(txt):
    """Split ``txt`` into word tokens using TextBlob."""
    blob = TextBlob(txt)
    return blob.words

In [140]:
# Keep only the tokens of `tt` that are not in the stop-word list.
# `tt` and `stop_words` are defined in cells that appear further down the
# notebook, so those cells have to be run before this one.
dt = list(filter(lambda token: token not in stop_words, tt))
dt

['mother', 'mashed', 'mnms']

In [98]:
# Sample sentence for trying out stop-word removal
teststring ='my mother mashed my mnms'
# Tokenize it into a TextBlob WordList
tt = TextBlob(teststring).words
tt

WordList(['my', 'mother', 'mashed', 'my', 'mnms'])

In [None]:
def RemoveStopWord(txt):
    """Return the words of ``txt`` with English stop words removed.

    Args:
        txt: Either a string, which is tokenized with ``Tokenize``, or an
            already-tokenized iterable of words.

    Returns:
        list: The remaining words in their original order. Matching is
        case-insensitive ("My" is dropped just like "my") because NLTK's
        English stop-word list is lowercase.
    """
    # Imported locally so the function works regardless of whether the cell
    # holding the module-level import has been run yet.
    from nltk.corpus import stopwords

    # A set gives constant-time membership checks
    stop_words = set(stopwords.words('english'))
    words = Tokenize(txt) if isinstance(txt, str) else txt
    return [word for word in words if word.lower() not in stop_words]


In [139]:
from nltk.corpus import stopwords
# English stop-word list from NLTK (uses the 'stopwords' corpus downloaded near
# the top of the notebook).
# NOTE: the stop-word filtering cell (In [140]) appears earlier in the notebook
# but reads this variable, so this cell has to be run first.
stop_words = stopwords.words('english')