### Import Libraries

In [1]:
# Import twitter dependencies
import tweepy
from config import *

In [2]:
# Import transformation dependencies
import pandas as pd
from datetime import datetime, date
import json
import re 

In [3]:
# For sentiment and subjectivity analysis
from textblob import TextBlob

In [4]:
# For stats 
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
from scipy import stats

# Tweepy Setup

In [5]:
# Tweepy Setup
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [107]:
# Tweepy search parameters
search_terms = ['covid+vaccine OR covid-19+vaccine OR corona+virus+vaccine OR coronavirus+vaccine OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca -filter:retweets']
end_date = date(2021,3,11)
max_id = '1370071693172035592'
limit = 1400 # this is my 15 minute limit :'( 

# Save files
output_csv ='csvs/tw106.csv'
output_json ='json/raw106.json'

# Functions

Collect and save data

In [108]:
# Pulls data from api and returns 2 lists:
    # one which is parsed for selected variables
    # one with all the raw data

data = []
raw = []
tweets = tweepy.Cursor(api.search, max_id = max_id, lang ='en', q=search_terms, tweet_mode='extended').items(limit)
for t in tweets: 
    tweet ={
        'created' :t.created_at,
        'id_str' : t.id_str,
        'text' : t.full_text,
        'user_id' :t.user.id,
        'screen_name': t.user.screen_name,
        'location': t.user.location,
        'followers_count': t.user.followers_count,
        'user_favourites_count': t.user.favourites_count,
        'time_zone' :t.user.time_zone,
        'geo_enabled' :t.user.geo_enabled,
        'verified' :t.user.verified,
        'status_count': t.user.statuses_count,
        'geo':t.geo,
        'coords':t.coordinates,
        'retweet_count': t.retweet_count,
        'tweet_favourite_count' : t.favorite_count
    }
    # add sorted data to one list 
    data.append(tweet)
    # keep raw data in case I realize later that I've removed something important 
    raw.append(t)
print('done!')

done!


In [109]:
# Add parsed data to dataframe, convert to csv
tweets_df = pd.DataFrame(data)

# Save raw data to json 
with open(output_json, 'w') as file:
    for resp in raw:
        jso = json.dumps(resp._json,sort_keys=True, indent=3)
        file.write(jso)
tweets_df.tail(2)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count
1398,2021-03-11 17:54:15,1370070579450150935,RT @WSJ: From @WSJopinion: The CDC claims to b...,828281213421027329,MichaelOngRisk,,3807,18455,,False,False,11089,,,19,0
1399,2021-03-11 17:54:15,1370070578959376399,"I just signed up on the ""TriStar Summit Second...",227561660,loislanekent1,,1722,83142,,False,False,25872,,,0,2


### Clean tweets

In [110]:
# not working when I put these into a function
tweets_df = tweets_df.drop_duplicates(subset=['id_str'])
tweets_df = tweets_df.dropna(subset=['id_str', 'screen_name'])

In [111]:
# Makes tweets more readable 
def CleanTweets(txt): 
    # remove @ username 
    # txt = re.sub(r'@[A-Za-z0-9)]+','', txt)
    # this removes names, which are sometimes vaccine names, so not good for my filter

    # remove @ from username
    txt = txt.replace('@','')   
    
    # remove RT    
    txt = re.sub(r'RT[\s]+','',txt)
    
    # remove # but leave txt
    txt = re.sub(r'#','', txt)
    
    # remove hyperlinks
    txt = re.sub(r'https?:\/\/\S+', '', txt)
    
    txt = re.sub('\n','', txt)
    return txt

In [112]:
# Clean the text field 
tweets_df['text'] = tweets_df['text'].apply(CleanTweets)

In [113]:
# to get subjectivity 
def RateSubjectivity(txt):
    return TextBlob(txt).sentiment.subjectivity

In [114]:
# to get polarity
def RatePolarity(txt):
    return TextBlob(txt).sentiment.polarity

In [115]:
# add worded sentiment based on polarity score
def GetSentiment(num):
    if num < 0: 
        return 'negative' 
    elif num == 0:
        return 'neutral'
    else:
        return 'positive'

In [116]:
# Create Columns
def RateTweets(df):
    df['Subjectivity'] = df['text'].apply(RateSubjectivity)
    df['Polarity'] = df['text'].apply(RatePolarity)
    df['Sentiment'] = df['Polarity'].apply(GetSentiment)
    return df 

In [117]:
tweets_df = RateTweets(tweets_df)
tweets_df.to_csv(output_csv, index=False, encoding='UTF-8')
tweets_df.tail(3)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
1397,2021-03-11 17:54:15,1370070581085933573,POTUS: One in four adults in the U.S. has rece...,31425741,seekingeldorado,Now,924,158339,,False,False,71132,,,16526,0,0.333333,0.25,positive
1398,2021-03-11 17:54:15,1370070579450150935,WSJ: From WSJopinion: The CDC claims to be “fo...,828281213421027329,MichaelOngRisk,,3807,18455,,False,False,11089,,,19,0,0.1,0.0,neutral
1399,2021-03-11 17:54:15,1370070578959376399,"I just signed up on the ""TriStar Summit Second...",227561660,loislanekent1,,1722,83142,,False,False,25872,,,0,2,0.0,0.0,neutral


In [106]:
# tweets_df[['created','id_str','text','screen_name','location','Subjectivity','Polarity','Sentiment']].tail(5)

In [18]:
from os import listdir

In [52]:
filepaths = ['csvs/' + f for f in listdir("csvs")]
df = pd.concat(map(pd.read_csv, filepaths))
df = df.drop_duplicates(subset=['id_str'])
len(df)

117343

In [55]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [118]:
#df['tokenized'] = df.apply(lambda row: word_tokenize(row['text']), axis=1)
#df['tokenized'] = df['text'].apply(lambda x: [item for item in x if item not in stop])
df.head()

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
0,2021-03-03 23:59:59,1367263516457979910,Shieldk2 Yes! My wife works for Pfizer and has...,2600327203,Sobres74,"Seattle, WA",91,2960,,False,False,3192,,,0,1,0.0,0.0,neutral
1,2021-03-03 23:59:59,1367263515656867840,"oleary_ray: ""I will not be taking the Covid va...",727678062918180864,cooksonm7,"Auckland, New Zealand",555,25880,,True,False,4078,,,92,0,0.2,0.2,positive
2,2021-03-03 23:59:59,1367263515388567563,"CDC's report on J&amp;J COVID-19 vaccine - ""AC...",15219316,bchaiken,"Boston, MA USA",511,4,,True,False,380,,,0,1,0.4,0.5,positive
3,2021-03-03 23:59:58,1367263513257930752,ByYourLogic: the Pfizer vaccine makes your dic...,1343038909769539584,OldManVEVO,,8,323,,False,False,70,,,102,0,0.6,0.2,positive
4,2021-03-03 23:59:56,1367263503782928384,RobDownenChron: The Archdiocese says the new J...,20774223,MagEGordon,"Houston, TX",4779,7215,,True,True,27559,,,1,0,0.234848,0.045455,positive


# SQLAlchemy
### Connecting to a certain other host 

In [5]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [12]:
engine = create_engine(conn, echo=False)

In [13]:
Base = automap_base()
Base.prepare(engine, reflect=True)  

In [14]:
# Test connection
# it works!!!
# not anymore
# kill kernal if conn is wrong

Base.classes.keys()

['tw2']

### Don't run yet
Clean up your tweets! 

In [9]:
_df = pd.read_csv('csvs/tw2.csv')
_df.to_sql(name='tw2', con=engine, if_exists='append', index=True)
print('added!')

added!


In [None]:
# session things
session = Session(engine)
session.close()


In [344]:
import schedule
import time

In [None]:
def EveryFifteen():
    print('time')
    
schedule.every(15).minutes
