### Import Libraries

In [12]:
# Import twitter dependencies
import tweepy
from config import *

In [1]:
# Import transformation dependencies
import pandas as pd
from datetime import datetime, date
import json
import re 

In [3]:
# For sentiment and subjectivity analysis
from textblob import TextBlob
#import nltk


# Tweepy Setup

In [9]:
# Tweepy Setup
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [10]:
# Tweepy search parameters
search_terms = ['covid+vaccine OR covid-19+vaccine OR corona+virus+vaccine OR coronavirus+vaccine OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca -filter:retweets']
end_date = date(2021,3,11)
max_id = '1372910933236985856'
limit = 900 # this is my 15 minute limit :'( 

# Save files
output_csv ='csvs/tw186.csv'
output_json ='json/raw186.json'

# Parse Data

In [11]:
# Pulls data from api and returns 2 lists:
    # one which is parsed for selected variables
    # one with all the raw data

data = []
raw = []
tweets = tweepy.Cursor(api.search, max_id = max_id, lang ='en', q=search_terms, tweet_mode='extended').items(limit)
for t in tweets: 
    tweet ={
        'created' :t.created_at,
        'id_str' : t.id_str,
        'text' : t.full_text,
        'user_id' :t.user.id,
        'screen_name': t.user.screen_name,
        'location': t.user.location,
        'followers_count': t.user.followers_count,
        'user_favourites_count': t.user.favourites_count,
        'time_zone' :t.user.time_zone,
        'geo_enabled' :t.user.geo_enabled,
        'verified' :t.user.verified,
        'status_count': t.user.statuses_count,
        'geo':t.geo,
        'coords':t.coordinates,
        'retweet_count': t.retweet_count,
        'tweet_favourite_count' : t.favorite_count
    }
    # add sorted data to one list 
    data.append(tweet)
    # keep raw data in case I realize later that I've removed something important 
    raw.append(t)
print('done!')

done!


### Save a copy of Raw Data in case I need to go back to it 

In [12]:
# Add parsed data to dataframe, convert to csv
tweets_df = pd.DataFrame(data)

# Save raw data to json 
with open(output_json, 'w') as file:
    for resp in raw:
        jso = json.dumps(resp._json,sort_keys=True, indent=3)
        file.write(jso)
tweets_df.tail(2)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count
1398,2021-03-19 13:53:45,1372909160745082882,The European Medicines Agency (EMA) said that ...,385730875,OladapoAHassan,"Lagos, Nigeria",823,21791,,True,False,9006,,,0,0
1399,2021-03-19 13:53:45,1372909160086581253,"RT @DavidHenigUK: Find possible problem, exami...",2943643384,Anniepop2027,,1849,157765,,False,False,196642,,,149,0


# Clean tweets

In [13]:
# Remove extra rows
tweets_df = tweets_df.drop_duplicates(subset=['id_str'])
tweets_df = tweets_df.dropna(subset=['id_str', 'screen_name'])

In [14]:
# Makes tweets more readable 
def CleanTweets(txt): 
    # remove @ from username
    txt = txt.replace('@','')   
    
    # remove RT    
    txt = re.sub(r'RT[\s]+','',txt)
    
    # remove # but leave txt
    txt = re.sub(r'#','', txt)
    
    # remove hyperlinks
    txt = re.sub(r'https?:\/\/\S+', '', txt)
    
    txt = re.sub('\n','', txt)
    return txt

In [15]:
# Clean the text field 
tweets_df['text'] = tweets_df['text'].apply(CleanTweets)

# Analyze Tweets

In [16]:
# to get subjectivity 
def RateSubjectivity(txt):
    return TextBlob(txt).sentiment.subjectivity

In [17]:
# to get polarity
def RatePolarity(txt):
    return TextBlob(txt).sentiment.polarity

In [18]:
# add worded sentiment based on polarity score
def GetSentiment(num):
    if num < 0: 
        return 'negative' 
    elif num == 0:
        return 'neutral'
    else:
        return 'positive'

In [19]:
# Create Columns
def RateTweets(df):
    df['Subjectivity'] = df['text'].apply(RateSubjectivity)
    df['Polarity'] = df['text'].apply(RatePolarity)
    df['Sentiment'] = df['Polarity'].apply(GetSentiment)
    return df 

In [20]:
# Save a csv backup
tweets_df = RateTweets(tweets_df)
tweets_df.to_csv(output_csv, index=False, encoding='UTF-8')
tweets_df.tail(3)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
1397,2021-03-19 13:53:46,1372909161663631376,Anyone else have covid brain from the vaccine 🥴😩,142061411,Kendall_Perry,,916,14583,,False,False,20785,,,0,0,0.0,0.0,neutral
1398,2021-03-19 13:53:45,1372909160745082882,The European Medicines Agency (EMA) said that ...,385730875,OladapoAHassan,"Lagos, Nigeria",823,21791,,True,False,9006,,,0,0,0.0,0.0,neutral
1399,2021-03-19 13:53:45,1372909160086581253,"DavidHenigUK: Find possible problem, examine i...",2943643384,Anniepop2027,,1849,157765,,False,False,196642,,,149,0,0.866667,0.233333,positive


### Comparing Polarity and Subjectivity by Manufacturer

In [4]:
df = df = pd.read_csv('csv/tweets.csv')
df.head()

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
0,2021-03-03 23:59:59,1367263516457979910,Shieldk2 Yes! My wife works for Pfizer and has...,2600327203,Sobres74,"Seattle, WA",91,2960,,False,False,3192,,,0,1,0.0,0.0,neutral
1,2021-03-03 23:59:59,1367263515656867840,"oleary_ray: ""I will not be taking the Covid va...",727678062918180864,cooksonm7,"Auckland, New Zealand",555,25880,,True,False,4078,,,92,0,0.2,0.2,positive
2,2021-03-03 23:59:59,1367263515388567563,"CDC's report on J&amp;J COVID-19 vaccine - ""AC...",15219316,bchaiken,"Boston, MA USA",511,4,,True,False,380,,,0,1,0.4,0.5,positive
3,2021-03-03 23:59:58,1367263513257930752,ByYourLogic: the Pfizer vaccine makes your dic...,1343038909769539584,OldManVEVO,,8,323,,False,False,70,,,102,0,0.6,0.2,positive
4,2021-03-03 23:59:56,1367263503782928384,RobDownenChron: The Archdiocese says the new J...,20774223,MagEGordon,"Houston, TX",4779,7215,,True,True,27559,,,1,0,0.234848,0.045455,positive


In [6]:
df = df.drop(columns=['user_id','time_zone','geo_enabled'])
df.head()

Unnamed: 0,created,id_str,text,screen_name,location,followers_count,user_favourites_count,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
0,2021-03-03 23:59:59,1367263516457979910,Shieldk2 Yes! My wife works for Pfizer and has...,Sobres74,"Seattle, WA",91,2960,False,3192,,,0,1,0.0,0.0,neutral
1,2021-03-03 23:59:59,1367263515656867840,"oleary_ray: ""I will not be taking the Covid va...",cooksonm7,"Auckland, New Zealand",555,25880,False,4078,,,92,0,0.2,0.2,positive
2,2021-03-03 23:59:59,1367263515388567563,"CDC's report on J&amp;J COVID-19 vaccine - ""AC...",bchaiken,"Boston, MA USA",511,4,False,380,,,0,1,0.4,0.5,positive
3,2021-03-03 23:59:58,1367263513257930752,ByYourLogic: the Pfizer vaccine makes your dic...,OldManVEVO,,8,323,False,70,,,102,0,0.6,0.2,positive
4,2021-03-03 23:59:56,1367263503782928384,RobDownenChron: The Archdiocese says the new J...,MagEGordon,"Houston, TX",4779,7215,True,27559,,,1,0,0.234848,0.045455,positive


In [2]:
# Load saved tweets
df = pd.read_csv('csv/tweets.csv')
df = df[['id_str','text','retweet_count','tweet_favourite_count','Subjectivity','Polarity']]
df.head(2)

Unnamed: 0,id_str,text,retweet_count,tweet_favourite_count,Subjectivity,Polarity
0,1367263516457979910,Shieldk2 Yes! My wife works for Pfizer and has...,0,1,0.0,0.0
1,1367263515656867840,"oleary_ray: ""I will not be taking the Covid va...",92,0,0.2,0.2


In [5]:
df = pd.read_csv('csv/tweets.csv')
df = df[['id_str','text','retweet_count','tweet_favourite_count','Sentiment']]
df.head(2)

Unnamed: 0,id_str,text,retweet_count,tweet_favourite_count,Sentiment
0,1367263516457979910,Shieldk2 Yes! My wife works for Pfizer and has...,0,1,neutral
1,1367263515656867840,"oleary_ray: ""I will not be taking the Covid va...",92,0,positive


In [4]:
def GetManufacturer(txt):
    txt = txt.lower()
    if 'moderna' in txt:
        return 'mo'
    elif 'pfizer' in txt: 
        return 'pf'
    elif 'astra' in txt:
        return 'az'
    else:
        return '0'
    

In [4]:
df['manufacturer'] = df['text'].apply(GetManufacturer)
df.tail()

Unnamed: 0,id_str,text,retweet_count,tweet_favourite_count,Subjectivity,Polarity,manufacturer
232053,1371250540449988610,f0lake: No i dont think u understand i need to...,142,0,0.0,0.0,pf
232054,1371250536633208834,PeterHotez: It’s why we might eventually move ...,495,0,0.5,0.25,mo
232055,1371250532568956935,web_rant: mikeallen axios kadiagoba Gosh Mike ...,2,0,0.1,0.0,0
232056,1371250531516223488,Reuters: AstraZeneca finds no evidence of incr...,124,0,0.0,0.0,az
232057,1371250530123718660,Found my old WHO vaccine cert booklet from whe...,0,0,0.2,0.1,0


In [5]:
df = df.drop(columns=['id_str','text'])
manufacturer_df = df.loc[df['manufacturer'] != '0']
manufacturer_df = manufacturer_df.rename(columns={'retweet_count':'retweets','tweet_favourite_count':'likes'})
manufacturer_df.tail(2)

Unnamed: 0,retweets,likes,Subjectivity,Polarity,manufacturer
232054,495,0,0.5,0.25,mo
232056,124,0,0.0,0.0,az


In [9]:
df = df.drop(columns=['id_str','text'])
manufacturer_df = df.loc[df['manufacturer'] != '0']
manufacturer_df = manufacturer_df.rename(columns={'retweet_count':'retweets','tweet_favourite_count':'likes'})
manufacturer_df.tail(2)

Unnamed: 0,retweets,likes,Sentiment,manufacturer
232054,495,0,positive,mo
232056,124,0,neutral,az


In [6]:
manufacturer_df = manufacturer_df.loc[(manufacturer_df['retweets'] >0) | (manufacturer_df['likes'] >0) ]
manufacturer_df

Unnamed: 0,retweets,likes,Subjectivity,Polarity,manufacturer
0,0,1,0.000000,0.000000,pf
3,102,0,0.600000,0.200000,pf
6,2867,0,0.000000,0.100000,pf
12,102,0,0.000000,0.000000,az
17,102,0,0.600000,0.200000,pf
...,...,...,...,...,...
232047,117,0,0.801515,0.295455,pf
232049,335,0,0.275000,0.175000,pf
232053,142,0,0.000000,0.000000,pf
232054,495,0,0.500000,0.250000,mo


In [11]:
manufacturer_df['Subjectivity'] = manufacturer_df['Subjectivity'].round(decimals=1)
manufacturer_df['Polarity'] = manufacturer_df['Polarity'].round(decimals=1)
manufacturer_df

Unnamed: 0,retweets,likes,Subjectivity,Polarity,manufacturer
0,0,1,0.0,0.0,pf
3,102,0,0.6,0.2,pf
6,2867,0,0.0,0.1,pf
12,102,0,0.0,0.0,az
17,102,0,0.6,0.2,pf
...,...,...,...,...,...
232047,117,0,0.8,0.3,pf
232049,335,0,0.3,0.2,pf
232053,142,0,0.0,0.0,pf
232054,495,0,0.5,0.2,mo


In [12]:
mo_df = manufacturer_df.loc[manufacturer_df['manufacturer'] =='mo']
mo_df = mo_df.drop(columns=['manufacturer'])
mo_df.to_csv('mo.csv', index=False, encoding='UTF-8')

In [13]:
az_df = manufacturer_df.loc[manufacturer_df['manufacturer'] =='az']
az_df = az_df.drop(columns=['manufacturer'])
az_df.to_csv('az.csv', index=False, encoding='UTF-8')

In [14]:
pf_df = manufacturer_df.loc[manufacturer_df['manufacturer'] =='mo']
pf_df = pf_df.drop(columns=['manufacturer'])
pf_df.to_csv('pf.csv', index=False, encoding='UTF-8')

In [3]:
df = pd.read_csv('csv/tweets.csv')
df = df[['id_str','text','screen_name','retweet_count','tweet_favourite_count','Subjectivity','Polarity','Sentiment']]
df.head(2)

Unnamed: 0,id_str,text,screen_name,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
0,1367263516457979910,Shieldk2 Yes! My wife works for Pfizer and has...,Sobres74,0,1,0.0,0.0,neutral
1,1367263515656867840,"oleary_ray: ""I will not be taking the Covid va...",cooksonm7,92,0,0.2,0.2,positive


In [5]:
df['manufacturer'] = df['text'].apply(GetManufacturer)
df.tail()

Unnamed: 0,id_str,text,screen_name,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment,manufacturer
232053,1371250540449988610,f0lake: No i dont think u understand i need to...,jenflowerr,142,0,0.0,0.0,neutral,pf
232054,1371250536633208834,PeterHotez: It’s why we might eventually move ...,karenh7463,495,0,0.5,0.25,positive,mo
232055,1371250532568956935,web_rant: mikeallen axios kadiagoba Gosh Mike ...,randypilsr,2,0,0.1,0.0,neutral,0
232056,1371250531516223488,Reuters: AstraZeneca finds no evidence of incr...,cadenjames1,124,0,0.0,0.0,neutral,az
232057,1371250530123718660,Found my old WHO vaccine cert booklet from whe...,Juliagoolia1982,0,0,0.2,0.1,positive,0


In [6]:
df2 = df.sample(n=2320)
df2

Unnamed: 0,id_str,text,screen_name,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment,manufacturer
103095,1372937618535636992,nytimes: Governments across Europe raced to li...,drondari,48,0,0.588889,-0.216667,negative,az
107887,1372931320993710081,The Governor emphasizes these are eligibility ...,JackNBCBoston,1,0,0.375000,0.091667,positive,0
178226,1370118285677711369,alastanford: PHILADELPHIA: We are excited to a...,a_hildenbrand24,123,0,0.750000,0.375000,positive,mo
66657,1370036097795903489,MichelleObama: When you can get the COVID-19 v...,FxRuthie,8557,0,0.650000,0.750000,positive,0
171864,1370124184785854465,Reuters_Health: Pfizer Inc and BioNTech SE sai...,ShanePearse,52,0,0.000000,0.000000,neutral,pf
...,...,...,...,...,...,...,...,...,...
108444,1372930650798366723,pattracy2: JoyVBehar davidmweissman Something ...,RubbishRitaAZ,53,0,0.000000,0.000000,neutral,0
53112,1370043485110337539,BarackObama: Michelle and I got vaccinated aga...,radarroyo,10714,0,0.300000,1.000000,positive,0
40553,1370051200557576200,BarackObama: Michelle and I got vaccinated aga...,Aishwarya9_,10711,0,0.300000,1.000000,positive,0
4412,1370076216959635466,POTUS: One in four adults in the U.S. has rece...,dobbstojennings,16525,0,0.333333,0.250000,positive,0


# SQLAlchemy
### Connecting to a certain other host 

In [14]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [15]:
engine = create_engine(conn2, echo=False) 

In [16]:
Base = automap_base()
Base.prepare(engine, reflect=True)  

In [17]:
# Test connection
Base.classes.keys()

['manufacturer', 'popularity']

# Creating the Tables
These are created to minimize the time needed to pull information from the database


In [None]:
# Index dataframe 
index_df = df[['id_str', 'created']]

In [22]:
# Popularity dataframe
popularity_df = df[['id_str','Subjectivity','Polarity','retweet_count','tweet_favourite_count','Sentiment']]

In [None]:
# text dataframe
text_df = df[['id_str', 'text']]

In [8]:
# sample of tweets 
sample_df = df.sample(frac=0.01, random_state=7)

### Adding to Database

In [18]:
def AddTable(df, table_name):
    df.to_sql(name=table_name, con=engine, if_exists='append', index=True)
    print('added!')

In [19]:
# AddTable(sample_df, 'sample')

added!


# Adventures in Geoparsing

In [None]:
from geograpy3 import extraction

In [49]:
try:
    e = extraction.Extractor(tweets_df['location'])
    e.find_geoEntities()
    print(e.places)
except: 
    pass


In [7]:
import geograpy3
from geograpy3 import places

In [None]:
def Geoparse(txt):
    if txt != '':
        places = geograpy3.get_place_context(text = txt)
        # return places.regions
        # return places.address_strings not this one at all 
    else:
        return '0'
    
def Geoparse2(txt): 
    if txt != '':
        loc = places.PlaceContext(txt)
        loc.set_countries()
        country = loc.countries
        
        loc.set_regions()
        region = loc.regions
       
        return region, country
    else:
        return '0'

def Geoparse3(txt):
    if txt != '':
        place = extraction.Extractor(text = txt)
        place.find_entities()
        return place.places
    else:
        return '0'
    

In [98]:
#_df['new_location'] = _df['location'].apply(Geoparse3)
_df['newer_location'] = _df['new_location'].apply(Geoparse2)
_df

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,...,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment,new_location,newer_location
0,2021-03-11 15:21:24,1370032112485666818,angelovalidiya: I am asking again are we lab r...,825712966683262978,necas55,,440,10774,,True,...,15868,,,128,0,0.0,0.0,neutral,0,"([0], [Côte d'Ivoire])"
1,2021-03-11 15:21:24,1370032112447987715,POTUS: One in four adults in the U.S. has rece...,259461117,Amanda_L_Smith,"Warren, MI",2588,71805,,True,...,13890,,,16536,0,0.333333,0.25,positive,"[Warren, MI]","([Warren, MI], [Warren, MI])"
2,2021-03-11 15:21:24,1370032112175304705,POTUS: One in four adults in the U.S. has rece...,20353953,GoKTGo,"New York, NY",2606,18461,,True,...,20052,,,16536,0,0.333333,0.25,positive,"[New York, NY]","([NY, New York], [NY, New York])"
3,2021-03-11 15:21:24,1370032112074625025,"CNNPolitics: In a new ad campaign, former Pres...",1239283251950374912,JC38054211,"Coconut Grove, FL",77,1337,,False,...,276,,,595,0,0.138636,0.034091,positive,"[Coconut, Grove, FL]","([Grove, Coconut, FL], [Grove, Coconut, FL])"
4,2021-03-11 15:21:23,1370032111319724036,Big pharma monopolies lead to countries waitin...,580808293,Rebeccaxlcr,,64,388,,False,...,8280,,,0,0,0.1,0.0,neutral,0,"([0], [Côte d'Ivoire])"
5,2021-03-11 15:21:23,1370032111239917568,Pfizer Shot 97% Effective Against Symptomatic ...,453857290,ThoughtSow,K-Pac,4324,4690,,True,...,24899,,,0,0,0.8,0.6,positive,[],"([], [])"
6,2021-03-11 15:21:23,1370032110757683204,Cold_Peace_ JamesSchwemlein jamescrabtree Rman...,2536510904,EvanFeigenbaum,"Washington, DC",14186,18971,,True,...,47365,,,0,2,1.0,0.0,neutral,"[Washington, DC]","([DC, Washington], [DC, Washington])"
7,2021-03-11 15:21:23,1370032110464077831,Kroger clinic patients given empty COVID-19 va...,251130012,joschroweArt,🌎🌏🌍,1100,60458,,False,...,20942,,,0,0,0.5,-0.1,negative,[],"([], [])"
8,2021-03-11 15:21:23,1370032109893672965,mariamainmo: Girls don't want boys. They want ...,223149595,moyenmoins,"Paris, France",284,47587,,False,...,7705,,,25410,0,1.0,0.6,positive,"[Paris, France]","([Paris, France], [Paris, France])"
9,2021-03-11 15:21:23,1370032108870197248,TheElders: “We must act with collective respon...,233041292,niawag1,washington dc,3766,67537,,False,...,12408,,,70,0,0.3625,0.25,positive,[],"([], [])"


In [48]:
len(df)

232058

In [59]:
loc_df = df
loc_df = loc_df.dropna(subset=['location'])
len(loc_df)

159149

In [53]:
known_cords = df['coords'].unique()
len(known_cords)

259

In [None]:
loc_df['new_location'] = loc_df['location'].apply(Geoparse3)
loc_df.head()