### Import Libraries

In [81]:
# Import twitter dependencies
import tweepy
from config import *

In [82]:
# Import transformation dependencies
import pandas as pd
from datetime import datetime, date
import json
import re 

In [83]:
# For sentiment and subjectivity analysis
from textblob import TextBlob
#import nltk


In [84]:
#nltk.download('stopwords')
#nltk.download('punkt')

In [85]:
# For stats 
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
from scipy import stats

# Tweepy Setup

In [9]:
# Tweepy Setup
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [10]:
# Tweepy search parameters
search_terms = ['covid+vaccine OR covid-19+vaccine OR corona+virus+vaccine OR coronavirus+vaccine OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca -filter:retweets']
end_date = date(2021,3,11)
max_id = '1372910933236985856'
limit = 1400 # this is my 15 minute limit :'( 

# Save files
output_csv ='csvs/tw186.csv'
output_json ='json/raw186.json'

# Functions

Collect and save data

In [11]:
# Pulls data from api and returns 2 lists:
    # one which is parsed for selected variables
    # one with all the raw data

data = []
raw = []
tweets = tweepy.Cursor(api.search, max_id = max_id, lang ='en', q=search_terms, tweet_mode='extended').items(limit)
for t in tweets: 
    tweet ={
        'created' :t.created_at,
        'id_str' : t.id_str,
        'text' : t.full_text,
        'user_id' :t.user.id,
        'screen_name': t.user.screen_name,
        'location': t.user.location,
        'followers_count': t.user.followers_count,
        'user_favourites_count': t.user.favourites_count,
        'time_zone' :t.user.time_zone,
        'geo_enabled' :t.user.geo_enabled,
        'verified' :t.user.verified,
        'status_count': t.user.statuses_count,
        'geo':t.geo,
        'coords':t.coordinates,
        'retweet_count': t.retweet_count,
        'tweet_favourite_count' : t.favorite_count
    }
    # add sorted data to one list 
    data.append(tweet)
    # keep raw data in case I realize later that I've removed something important 
    raw.append(t)
print('done!')

done!


In [12]:
# Add parsed data to dataframe, convert to csv
tweets_df = pd.DataFrame(data)

# Save raw data to json 
with open(output_json, 'w') as file:
    for resp in raw:
        jso = json.dumps(resp._json,sort_keys=True, indent=3)
        file.write(jso)
tweets_df.tail(2)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count
1398,2021-03-19 13:53:45,1372909160745082882,The European Medicines Agency (EMA) said that ...,385730875,OladapoAHassan,"Lagos, Nigeria",823,21791,,True,False,9006,,,0,0
1399,2021-03-19 13:53:45,1372909160086581253,"RT @DavidHenigUK: Find possible problem, exami...",2943643384,Anniepop2027,,1849,157765,,False,False,196642,,,149,0


### Clean tweets

In [13]:
# Remove extra rows
tweets_df = tweets_df.drop_duplicates(subset=['id_str'])
tweets_df = tweets_df.dropna(subset=['id_str', 'screen_name'])

In [14]:
# Makes tweets more readable 
def CleanTweets(txt): 
    # remove @ from username
    txt = txt.replace('@','')   
    
    # remove RT    
    txt = re.sub(r'RT[\s]+','',txt)
    
    # remove # but leave txt
    txt = re.sub(r'#','', txt)
    
    # remove hyperlinks
    txt = re.sub(r'https?:\/\/\S+', '', txt)
    
    txt = re.sub('\n','', txt)
    return txt

In [15]:
# Clean the text field 
tweets_df['text'] = tweets_df['text'].apply(CleanTweets)

In [16]:
# to get subjectivity 
def RateSubjectivity(txt):
    return TextBlob(txt).sentiment.subjectivity

In [17]:
# to get polarity
def RatePolarity(txt):
    return TextBlob(txt).sentiment.polarity

In [18]:
# add worded sentiment based on polarity score
def GetSentiment(num):
    if num < 0: 
        return 'negative' 
    elif num == 0:
        return 'neutral'
    else:
        return 'positive'

In [19]:
# Create Columns
def RateTweets(df):
    df['Subjectivity'] = df['text'].apply(RateSubjectivity)
    df['Polarity'] = df['text'].apply(RatePolarity)
    df['Sentiment'] = df['Polarity'].apply(GetSentiment)
    return df 

In [20]:
# Save a csv backup
tweets_df = RateTweets(tweets_df)
tweets_df.to_csv(output_csv, index=False, encoding='UTF-8')
tweets_df.tail(3)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
1397,2021-03-19 13:53:46,1372909161663631376,Anyone else have covid brain from the vaccine 🥴😩,142061411,Kendall_Perry,,916,14583,,False,False,20785,,,0,0,0.0,0.0,neutral
1398,2021-03-19 13:53:45,1372909160745082882,The European Medicines Agency (EMA) said that ...,385730875,OladapoAHassan,"Lagos, Nigeria",823,21791,,True,False,9006,,,0,0,0.0,0.0,neutral
1399,2021-03-19 13:53:45,1372909160086581253,"DavidHenigUK: Find possible problem, examine i...",2943643384,Anniepop2027,,1849,157765,,False,False,196642,,,149,0,0.866667,0.233333,positive


# Geoparsing

In [73]:
tweets_df.head(10)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
0,2021-03-11 15:21:24,1370032112485666818,angelovalidiya: I am asking again are we lab r...,825712966683262978,necas55,,440,10774,,True,False,15868,,,128,0,0.0,0.0,neutral
1,2021-03-11 15:21:24,1370032112447987715,POTUS: One in four adults in the U.S. has rece...,259461117,Amanda_L_Smith,"Warren, MI",2588,71805,,True,False,13890,,,16536,0,0.333333,0.25,positive
2,2021-03-11 15:21:24,1370032112175304705,POTUS: One in four adults in the U.S. has rece...,20353953,GoKTGo,"New York, NY",2606,18461,,True,False,20052,,,16536,0,0.333333,0.25,positive
3,2021-03-11 15:21:24,1370032112074625025,"CNNPolitics: In a new ad campaign, former Pres...",1239283251950374912,JC38054211,"Coconut Grove, FL",77,1337,,False,False,276,,,595,0,0.138636,0.034091,positive
4,2021-03-11 15:21:23,1370032111319724036,Big pharma monopolies lead to countries waitin...,580808293,Rebeccaxlcr,,64,388,,False,False,8280,,,0,0,0.1,0.0,neutral
5,2021-03-11 15:21:23,1370032111239917568,Pfizer Shot 97% Effective Against Symptomatic ...,453857290,ThoughtSow,K-Pac,4324,4690,,True,False,24899,,,0,0,0.8,0.6,positive
6,2021-03-11 15:21:23,1370032110757683204,Cold_Peace_ JamesSchwemlein jamescrabtree Rman...,2536510904,EvanFeigenbaum,"Washington, DC",14186,18971,,True,False,47365,,,0,2,1.0,0.0,neutral
7,2021-03-11 15:21:23,1370032110464077831,Kroger clinic patients given empty COVID-19 va...,251130012,joschroweArt,🌎🌏🌍,1100,60458,,False,False,20942,,,0,0,0.5,-0.1,negative
8,2021-03-11 15:21:23,1370032109893672965,mariamainmo: Girls don't want boys. They want ...,223149595,moyenmoins,"Paris, France",284,47587,,False,False,7705,,,25410,0,1.0,0.6,positive
9,2021-03-11 15:21:23,1370032108870197248,TheElders: “We must act with collective respon...,233041292,niawag1,washington dc,3766,67537,,False,False,12408,,,70,0,0.3625,0.25,positive


In [46]:
from geograpy import extraction

In [49]:
try:
    e = extraction.Extractor(tweets_df['location'])
    e.find_geoEntities()
    print(e.places)
except: 
    pass


In [74]:
#_df = tweets_df.iloc[0:10]
_df

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
0,2021-03-11 15:21:24,1370032112485666818,angelovalidiya: I am asking again are we lab r...,825712966683262978,necas55,,440,10774,,True,False,15868,,,128,0,0.0,0.0,neutral
1,2021-03-11 15:21:24,1370032112447987715,POTUS: One in four adults in the U.S. has rece...,259461117,Amanda_L_Smith,"Warren, MI",2588,71805,,True,False,13890,,,16536,0,0.333333,0.25,positive
2,2021-03-11 15:21:24,1370032112175304705,POTUS: One in four adults in the U.S. has rece...,20353953,GoKTGo,"New York, NY",2606,18461,,True,False,20052,,,16536,0,0.333333,0.25,positive
3,2021-03-11 15:21:24,1370032112074625025,"CNNPolitics: In a new ad campaign, former Pres...",1239283251950374912,JC38054211,"Coconut Grove, FL",77,1337,,False,False,276,,,595,0,0.138636,0.034091,positive
4,2021-03-11 15:21:23,1370032111319724036,Big pharma monopolies lead to countries waitin...,580808293,Rebeccaxlcr,,64,388,,False,False,8280,,,0,0,0.1,0.0,neutral
5,2021-03-11 15:21:23,1370032111239917568,Pfizer Shot 97% Effective Against Symptomatic ...,453857290,ThoughtSow,K-Pac,4324,4690,,True,False,24899,,,0,0,0.8,0.6,positive
6,2021-03-11 15:21:23,1370032110757683204,Cold_Peace_ JamesSchwemlein jamescrabtree Rman...,2536510904,EvanFeigenbaum,"Washington, DC",14186,18971,,True,False,47365,,,0,2,1.0,0.0,neutral
7,2021-03-11 15:21:23,1370032110464077831,Kroger clinic patients given empty COVID-19 va...,251130012,joschroweArt,🌎🌏🌍,1100,60458,,False,False,20942,,,0,0,0.5,-0.1,negative
8,2021-03-11 15:21:23,1370032109893672965,mariamainmo: Girls don't want boys. They want ...,223149595,moyenmoins,"Paris, France",284,47587,,False,False,7705,,,25410,0,1.0,0.6,positive
9,2021-03-11 15:21:23,1370032108870197248,TheElders: “We must act with collective respon...,233041292,niawag1,washington dc,3766,67537,,False,False,12408,,,70,0,0.3625,0.25,positive


In [88]:
! pip install geograpy3

Collecting geograpy3
  Downloading geograpy3-0.1.24-py3-none-any.whl (1.5 MB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
Collecting jellyfish
  Downloading jellyfish-0.8.2.tar.gz (134 kB)
Collecting pylodstorage
  Downloading pylodstorage-0.0.26-py3-none-any.whl (25 kB)
Collecting pycountry
  Downloading pycountry-20.7.3.tar.gz (10.1 MB)
Collecting feedparser>=5.2.1
  Downloading feedparser-6.0.2-py3-none-any.whl (80 kB)
Collecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
Collecting tldextract>=2.0.1
  Downloading tldextract-3.1.0-py2.py3-none-any.whl (87 kB)
Collecting cssselect>=0.9.2
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
Collecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-1.8.5-py3-none-any.whl (26 kB)
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.ta

In [89]:
import geograpy3

ModuleNotFoundError: No module named 'geograpy3'

In [90]:
from geograpy3 import places

ModuleNotFoundError: No module named 'geograpy3'

In [None]:
from geograpy3 import extraction

In [None]:
e = extraction.Extractor(url = 'http://www.bbc.com/news/world-europe-26919928')
e.find_entities()

# You can now access all of the places found by the Extractor
print(e.places)

In [None]:
def Geoparse(txt):
    if txt != '':
        places = geograpy3.get_place_context(text = txt)
        # return places.regions
        # return places.address_strings not this one at all 
    else:
        return '0'
    
def Geoparse2(txt): 
    if txt != '':
        loc = places.PlaceContext(txt)
        loc.set_countries()
        country = loc.countries
        
        loc.set_regions()
        region = loc.regions
       
        return region, country
    else:
        return '0'

def Geoparse3(txt):
    if txt != '':
        place = extraction.Extractor(text = txt)
        place.find_entities()
        return place.places
    else:
        return '0'
    

In [98]:
#_df['new_location'] = _df['location'].apply(Geoparse3)
_df['newer_location'] = _df['new_location'].apply(Geoparse2)
_df

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,...,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment,new_location,newer_location
0,2021-03-11 15:21:24,1370032112485666818,angelovalidiya: I am asking again are we lab r...,825712966683262978,necas55,,440,10774,,True,...,15868,,,128,0,0.0,0.0,neutral,0,"([0], [Côte d'Ivoire])"
1,2021-03-11 15:21:24,1370032112447987715,POTUS: One in four adults in the U.S. has rece...,259461117,Amanda_L_Smith,"Warren, MI",2588,71805,,True,...,13890,,,16536,0,0.333333,0.25,positive,"[Warren, MI]","([Warren, MI], [Warren, MI])"
2,2021-03-11 15:21:24,1370032112175304705,POTUS: One in four adults in the U.S. has rece...,20353953,GoKTGo,"New York, NY",2606,18461,,True,...,20052,,,16536,0,0.333333,0.25,positive,"[New York, NY]","([NY, New York], [NY, New York])"
3,2021-03-11 15:21:24,1370032112074625025,"CNNPolitics: In a new ad campaign, former Pres...",1239283251950374912,JC38054211,"Coconut Grove, FL",77,1337,,False,...,276,,,595,0,0.138636,0.034091,positive,"[Coconut, Grove, FL]","([Grove, Coconut, FL], [Grove, Coconut, FL])"
4,2021-03-11 15:21:23,1370032111319724036,Big pharma monopolies lead to countries waitin...,580808293,Rebeccaxlcr,,64,388,,False,...,8280,,,0,0,0.1,0.0,neutral,0,"([0], [Côte d'Ivoire])"
5,2021-03-11 15:21:23,1370032111239917568,Pfizer Shot 97% Effective Against Symptomatic ...,453857290,ThoughtSow,K-Pac,4324,4690,,True,...,24899,,,0,0,0.8,0.6,positive,[],"([], [])"
6,2021-03-11 15:21:23,1370032110757683204,Cold_Peace_ JamesSchwemlein jamescrabtree Rman...,2536510904,EvanFeigenbaum,"Washington, DC",14186,18971,,True,...,47365,,,0,2,1.0,0.0,neutral,"[Washington, DC]","([DC, Washington], [DC, Washington])"
7,2021-03-11 15:21:23,1370032110464077831,Kroger clinic patients given empty COVID-19 va...,251130012,joschroweArt,🌎🌏🌍,1100,60458,,False,...,20942,,,0,0,0.5,-0.1,negative,[],"([], [])"
8,2021-03-11 15:21:23,1370032109893672965,mariamainmo: Girls don't want boys. They want ...,223149595,moyenmoins,"Paris, France",284,47587,,False,...,7705,,,25410,0,1.0,0.6,positive,"[Paris, France]","([Paris, France], [Paris, France])"
9,2021-03-11 15:21:23,1370032108870197248,TheElders: “We must act with collective respon...,233041292,niawag1,washington dc,3766,67537,,False,...,12408,,,70,0,0.3625,0.25,positive,[],"([], [])"


In [13]:
#_df = pd.read_csv('csvs/tw150.csv')
#_df = _df.iloc[:15]
#_df = _df[['id_str', 'location']]
_df

Unnamed: 0,id_str,location
0,1370033716316282880,"Springfield, MO"
1,1370033714982445059,London Ontario
2,1370033714411937801,
3,1370033714395291664,
4,1370033713304731651,
5,1370033712746946560,Manhattan & Philly
6,1370033711534723075,"Houston, Texas"
7,1370033711442497542,Ohio
8,1370033711316688899,
9,1370033710389747713,THE KINGDOM OF GOD IN EARTH


In [21]:
from os import listdir

In [22]:
filepaths = ['csvs/' + f for f in listdir("csvs")]
df = pd.concat(map(pd.read_csv, filepaths))
df = df.drop_duplicates(subset=['id_str'])
len(df)

232058

In [23]:
df = df.reset_index()
df = df.drop(columns=['index'])
df.tail(1)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
232057,2021-03-15 00:02:57,1371250530123718660,Found my old WHO vaccine cert booklet from whe...,92157908,Juliagoolia1982,Anywhere but here,761,16382,,True,False,13107,,,0,0,0.2,0.1,positive


In [48]:
len(df)

232058

In [67]:
loc_df.to_csv('loc.csv', index=False, encoding='UTF-8')

In [60]:
df.to_csv('tweets.csv', index=False, encoding='UTF-8')

In [53]:
known_cords = df['coords'].unique()
len(known_cords)

259

In [59]:
loc_df = df
loc_df = loc_df.dropna(subset=['location'])
len(loc_df)

159149

In [61]:
loc_df = loc_df[['created','id_str','text','screen_name','location','followers_count','user_favourites_count','status_count','retweet_count','tweet_favourite_count','Subjectivity','Polarity','Sentiment']]
#loc_df = loc_df.reset_index()

In [None]:
loc_df['new_location'] = loc_df['location'].apply(Geoparse3)
loc_df.head()

# SQLAlchemy
### Connecting to a certain other host 

In [41]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [73]:
engine = create_engine(conn2, echo=False) 

In [74]:
Base = automap_base()
Base.prepare(engine, reflect=True)  

In [75]:
# Test connection
Base.classes.keys()

[]

# Creating the Tables
These are created to minimize the time needed to pull information from the database


In [None]:
# Index dataframe 
index_df = df[['id_str', 'created']]

In [77]:
# Popularity dataframe
popularity_df = df[['id_str','created','Sentiment','Polarity','retweet_count','tweet_favourite_count']]

### Don't run yet
Clean up your tweets! 

In [78]:
def AddTable(df, table_name):
    df.to_sql(name=table_name, con=engine, if_exists='append', index=True)
    print('added!')

In [79]:
AddTable(popularity_df, 'popularity')

added!
