### Import Libraries

In [120]:
# Import twitter dependencies
import tweepy
from config import *

In [121]:
# Import transformation dependencies
import pandas as pd
from datetime import datetime, date
import json
import re 

In [122]:
# For sentiment and subjectivity analysis
from textblob import TextBlob

# Tweepy Setup

In [123]:
# Tweepy Setup
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [201]:
# Tweepy search parameters
search_terms = ['covid+vaccine OR covid-19+vaccine OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca -filter:retweets']
end_date = date(2021,3,11)
max_id ='1370122611116625923'
limit = 1200 # this is my 15 minute limit :'( breaks if this becomes 1000

# Save files
output_csv ='csvs/tw60.csv'
output_json ='json/raw60.json'

# Functions

Collect and save data

In [202]:
# Pulls data from api and returns 2 lists:
    # one which is parsed for selected variables
    # one with all the raw data

data = []
raw = []
tweets = tweepy.Cursor(api.search, location='canada', max_id = max_id, place='canada', lang ='en', q=search_terms, tweet_mode='extended').items(limit)
for t in tweets: 
    tweet ={
        'created' :t.created_at,
        'id_str' : t.id_str,
        'text' : t.full_text,
        'user_id' :t.user.id,
        'screen_name': t.user.screen_name,
        'location': t.user.location,
        'followers_count': t.user.followers_count,
        'user_favourites_count': t.user.favourites_count,
        'time_zone' :t.user.time_zone,
        'geo_enabled' :t.user.geo_enabled,
        'verified' :t.user.verified,
        'status_count': t.user.statuses_count,
        'geo':t.geo,
        'coords':t.coordinates,
        'retweet_count': t.retweet_count,
        'tweet_favourite_count' : t.favorite_count
    }
    # add sorted data to one list 
    data.append(tweet)
    # keep raw data in case I realize later that I've removed something important 
    raw.append(t)
print('done!')

done!


In [203]:
# Add parsed data to dataframe, convert to csv
tweets_df = pd.DataFrame(data)

# Save raw data to json 
with open(output_json, 'w') as file:
    for resp in raw:
        jso = json.dumps(resp._json,sort_keys=True, indent=3)
        file.write(jso)
tweets_df.tail(5)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count
1195,2021-03-11 21:16:04,1370121367899795467,RT @ddiamond: Trump officials last year concei...,292444688,MikeGeorgeCBS,New York City,4323,3091,,True,False,29474,,,64,0
1196,2021-03-11 21:16:04,1370121367010562060,RT @bopanc: Serbia to start manufacturing Sino...,313510104,LostinEU,Brüssel,3287,34351,,True,False,45868,,,13,0
1197,2021-03-11 21:16:04,1370121366415020032,RT @NoContextTFs: How it feels getting the COV...,99514242,alsoMike,Sweden,1009,209013,,False,False,259378,,,106,0
1198,2021-03-11 21:16:03,1370121366142382082,The retailer is in talks with the CDC about de...,60072110,dailyyonder,Rural America,7342,1366,,True,False,3837,,,9,12
1199,2021-03-11 21:16:03,1370121363994841092,Nigeria Did Not Receive Batch Of AstraZeneca V...,594960763,MojiDelanoBlog,"Lagos,NIGERIA",46476,47774,,True,False,112558,,,8,9


### Clean tweets

In [204]:
# not working when I put these into a function
tweets_df = tweets_df.drop_duplicates(subset=['id_str'])
tweets_df = tweets_df.dropna(subset=['id_str', 'screen_name'])

In [205]:
# Makes tweets more readable 
def CleanTweets(txt): 
    # remove @ username 
    # txt = re.sub(r'@[A-Za-z0-9)]+','', txt)

    # remove @ from username
    txt = txt.replace('@','')    
    
    # remove RT    
    txt = re.sub(r'RT[\s]+','',txt)
    
    # remove # but leave txt
    txt = re.sub(r'#','', txt)
    
    # remove hyperlinks
    txt = re.sub(r'https?:\/\/\S+', '', txt)
    return txt

In [206]:
# Clean the text field 
tweets_df['text'] = tweets_df['text'].apply(CleanTweets)

In [207]:
# to get subjectivity 
def RateSubjectivity(txt):
    return TextBlob(txt).sentiment.subjectivity

In [208]:
# to get polarity
def RatePolarity(txt):
    return TextBlob(txt).sentiment.polarity

In [209]:
# add worded sentiment based on polarity score
def GetSentiment(num):
    if num < 0: 
        return 'negative' 
    elif num == 0:
        return 'neutral'
    else:
        return 'positive'

In [210]:
# Create Columns
def RateTweets(df):
    df['Subjectivity'] = df['text'].apply(RateSubjectivity)
    df['Polarity'] = df['text'].apply(RatePolarity)
    df['Sentiment'] = df['Polarity'].apply(GetSentiment)
    return df 

In [211]:
tweets_df = RateTweets(tweets_df)
tweets_df.to_csv(output_csv, index=False, encoding='UTF-8')
tweets_df.tail(5)

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
1195,2021-03-11 21:16:04,1370121367899795467,ddiamond: Trump officials last year conceived ...,292444688,MikeGeorgeCBS,New York City,4323,3091,,True,False,29474,,,64,0,0.083333,-0.05,negative
1196,2021-03-11 21:16:04,1370121367010562060,bopanc: Serbia to start manufacturing Sinophar...,313510104,LostinEU,Brüssel,3287,34351,,True,False,45868,,,13,0,0.0,0.0,neutral
1197,2021-03-11 21:16:04,1370121366415020032,NoContextTFs: How it feels getting the COVID v...,99514242,alsoMike,Sweden,1009,209013,,False,False,259378,,,106,0,0.0,0.0,neutral
1198,2021-03-11 21:16:03,1370121366142382082,The retailer is in talks with the CDC about de...,60072110,dailyyonder,Rural America,7342,1366,,True,False,3837,,,9,12,0.5,0.136364,positive
1199,2021-03-11 21:16:03,1370121363994841092,Nigeria Did Not Receive Batch Of AstraZeneca V...,594960763,MojiDelanoBlog,"Lagos,NIGERIA",46476,47774,,True,False,112558,,,8,9,0.0,0.0,neutral


In [213]:
tweets_df[['created','id_str','text','screen_name','location','Subjectivity','Polarity','Sentiment']].tail(5)

Unnamed: 0,created,id_str,text,screen_name,location,Subjectivity,Polarity,Sentiment
1195,2021-03-11 21:16:04,1370121367899795467,ddiamond: Trump officials last year conceived ...,MikeGeorgeCBS,New York City,0.083333,-0.05,negative
1196,2021-03-11 21:16:04,1370121367010562060,bopanc: Serbia to start manufacturing Sinophar...,LostinEU,Brüssel,0.0,0.0,neutral
1197,2021-03-11 21:16:04,1370121366415020032,NoContextTFs: How it feels getting the COVID v...,alsoMike,Sweden,0.0,0.0,neutral
1198,2021-03-11 21:16:03,1370121366142382082,The retailer is in talks with the CDC about de...,dailyyonder,Rural America,0.5,0.136364,positive
1199,2021-03-11 21:16:03,1370121363994841092,Nigeria Did Not Receive Batch Of AstraZeneca V...,MojiDelanoBlog,"Lagos,NIGERIA",0.0,0.0,neutral


# SQLAlchemy
### Connecting to a certain other host 

In [5]:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [12]:
engine = create_engine(conn, echo=False)

In [13]:
Base = automap_base()
Base.prepare(engine, reflect=True)  

In [14]:
# Test connection
# it works!!!
# not anymore
# kill kernal if conn is wrong

Base.classes.keys()

['tw2']

### Don't run yet
Clean up your tweets! 

In [9]:
_df = pd.read_csv('csvs/tw2.csv')
_df.to_sql(name='tw2', con=engine, if_exists='append', index=True)
print('added!')

added!


In [None]:
# session things
session = Session(engine)
session.close()
