In [1]:
# trouble shooting
import sys
sys.path.append('/opt/anaconda3/lib/python3.8/site-packages')

In [2]:
# Run the pip install command below if you don't already have the library
# !pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
import os
import math
import pandas as pd
import json
from pandas import json_normalize
from datetime import datetime
from random import randrange
import shutil
import itertools
import snscrape.modules.twitter as sntwitter

In [3]:
# Search terms: Content in the tweet
search_terms = [
    'coronavirus',
    'covid19',
    'pandemic',
    'quarantine',
]

# Creates a search parameter for the Twitter scraper
# since_date: inclusive
# until_date: exclusive
def search_builder(terms, area, since_date, until_date):    
    s = ''
    # Concatenate all search terms
    for i in range(len(terms) - 1):
        # NOTE: Inclusive OR statement
        s += '\"' + terms[i] + '\" OR '
    s += '\"' + terms[len(terms) - 1] + '\" '    
    
    # Within or the near the area
    s += 'near:\"' + area + '\" '
    # Since the first date
    s += 'since:' + since_date + ' '
    # Until the second date
    s += 'until:' + until_date
    
    return s

# Returns a DataFrame with at most tweet_limit tweets resulting from
# the search
def scrape(search_param, tweet_limit):
    return pd.DataFrame(itertools.islice(
        sntwitter.TwitterSearchScraper(search_param).get_items(), tweet_limit))

# Search for tweets in Los Angeles mentioning COVID terms
# from the beginning to the end of 2020.
los_angeles_search = search_builder(
    search_terms,
    "Los Angeles",
    '2020-01-01',
    '2020-12-31')

print(los_angeles_search)

"coronavirus" OR "covid19" OR "pandemic" OR "quarantine" near:"Los Angeles" since:2020-01-01 until:2020-12-31


In [4]:
# ********* STATE TWEET SCRAPING *********

def state_scraper(state, abbr, limit, since_date, until_date):

    total_tweets = 0

    print("Compiling tweets (max <=" + str(limit) + ") from " + state + " counties from " + since_date + " to " + until_date)

    print("START: " + str(datetime.now()))

    counties = pd.read_csv("data/counties/" + abbr + "_counties.csv")

    for i in range(len(counties)):

        county = counties.iloc[i]['County']
        county.strip()

        county_search = search_builder(
            search_terms,
            county + " County, " + abbr,
            since_date,
            until_date)

        county_tweets = scrape(county_search, limit)

        total_tweets = total_tweets + len(county_tweets)

        # Generates JSON with respective county tweet data
        county_json = county_tweets.to_json(county + ".json", lines=True, orient='records')

        source = os.getcwd() + "/" + county + ".json"
        destination = os.getcwd() + "/data/tweets/" + abbr + "_tweets/" + county + ".json"
        shutil.move(source, destination)

        print("(" + str(i + 1) + "/" + str(len(counties)) + ") " + county + " - " + str(len(county_tweets)) + " tweets: " + str(datetime.now()))

    print("END: " + str(datetime.now()))
    print(str(total_tweets) + " tweets processed")

    
# California Tweet Scraper
def ca_scraper(tweet_limit, since_date, until_date):
    state_scraper("California", "CA", tweet_limit, since_date, until_date)

# Florida Tweet Scraper
def fl_scraper(tweet_limit, since_date, until_date):
    state_scraper("Florida", "FL", tweet_limit, since_date, until_date)

# New York Tweet Scraper
def ny_scraper(tweet_limit, since_date, until_date):
    state_scraper("New York", "NY", tweet_limit, since_date, until_date)
    
# Texas Tweet Scraper
def tx_scraper(tweet_limit, since_date, until_date):
    state_scraper("Texas", "TX", tweet_limit, since_date, until_date)

In [7]:
ca_scraper(1000, "2021-10-01", "2021-10-09")

Compiling tweets (max <=1000) from California counties from 2021-10-01 to 2021-10-09
START: 2021-11-14 20:25:57.702275
(1/58) Alameda - 20 tweets: 2021-11-14 20:25:59.838410
(2/58) Alpine - 1000 tweets: 2021-11-14 20:26:25.077313
(3/58) Amador - 0 tweets: 2021-11-14 20:26:27.809523
(4/58) Butte - 1 tweets: 2021-11-14 20:26:29.364359
(5/58) Calaveras - 0 tweets: 2021-11-14 20:26:30.922107
(6/58) Colusa - 1000 tweets: 2021-11-14 20:26:55.546899
(7/58) Contra Costa - 19 tweets: 2021-11-14 20:26:57.655577
(8/58) Del Norte - 0 tweets: 2021-11-14 20:26:58.934344
(9/58) El Dorado - 0 tweets: 2021-11-14 20:27:00.200795
(10/58) Fresno - 7 tweets: 2021-11-14 20:27:01.720994
(11/58) Glenn - 1000 tweets: 2021-11-14 20:27:27.286553
(12/58) Humboldt - 3 tweets: 2021-11-14 20:27:28.853507
(13/58) Imperial - 0 tweets: 2021-11-14 20:27:31.428107
(14/58) Inyo - 1000 tweets: 2021-11-14 20:27:58.724922
(15/58) Kern - 0 tweets: 2021-11-14 20:28:00.353724
(16/58) Kings - 0 tweets: 2021-11-14 20:28:01.846583

In [10]:
fl_scraper(1000, "2021-10-01", "2021-10-09")

Compiling tweets (max <=1000) from Florida counties from 2021-10-01 to 2021-10-09
START: 2021-11-03 17:26:07.622783
(1/67) Alachua - 2 tweets: 2021-11-03 17:26:09.265687
(2/67) Baker - 0 tweets: 2021-11-03 17:26:10.791695
(3/67) Bay - 3 tweets: 2021-11-03 17:26:12.775998
(4/67) Bradford - 2 tweets: 2021-11-03 17:26:14.464591
(5/67) Brevard - 2 tweets: 2021-11-03 17:26:16.398375
(6/67) Broward - 27 tweets: 2021-11-03 17:26:19.583758
(7/67) Calhoun - 0 tweets: 2021-11-03 17:26:20.894507
(8/67) Charlotte - 1 tweets: 2021-11-03 17:26:23.065595
(9/67) Citrus - 1 tweets: 2021-11-03 17:26:24.693889
(10/67) Clay - 3 tweets: 2021-11-03 17:26:26.176432
(11/67) Collier - 4 tweets: 2021-11-03 17:26:28.790570
(12/67) Columbia - 0 tweets: 2021-11-03 17:26:30.330622
(13/67) De Soto - 1000 tweets: 2021-11-03 17:27:00.636094
(14/67) Dixie - 0 tweets: 2021-11-03 17:27:02.107284
(15/67) Duval - 1 tweets: 2021-11-03 17:27:03.424490
(16/67) Escambia - 0 tweets: 2021-11-03 17:27:05.739998
(17/67) Flagler - 

In [11]:
ny_scraper(1000, "2021-10-01", "2021-10-09")

Compiling tweets (max <=1000) from New York counties from 2021-10-01 to 2021-10-09
START: 2021-11-03 17:34:27.758192
(1/62) Albany - 2 tweets: 2021-11-03 17:34:29.318768
(2/62) Allegany - 0 tweets: 2021-11-03 17:34:30.926130
(3/62) Bronx - 418 tweets: 2021-11-03 17:34:52.586070
(4/62) Broome - 0 tweets: 2021-11-03 17:34:54.832494
(5/62) Cattaraugus - 1000 tweets: 2021-11-03 17:35:20.739375
(6/62) Cayuga - 1000 tweets: 2021-11-03 17:35:45.573533
(7/62) Chautauqua - 1 tweets: 2021-11-03 17:35:47.165005
(8/62) Chemung - 1000 tweets: 2021-11-03 17:36:13.055069
(9/62) Chenango - 1000 tweets: 2021-11-03 17:36:41.205264
(10/62) Clinton - 0 tweets: 2021-11-03 17:36:42.543747
(11/62) Columbia - 0 tweets: 2021-11-03 17:36:44.435576
(12/62) Cortland - 0 tweets: 2021-11-03 17:36:46.811955
(13/62) Delaware - 0 tweets: 2021-11-03 17:36:48.351194
(14/62) Dutchess - 6 tweets: 2021-11-03 17:36:49.935045
(15/62) Erie - 11 tweets: 2021-11-03 17:36:52.410984
(16/62) Essex - 1000 tweets: 2021-11-03 17:37:2

In [12]:
tx_scraper(1000, "2021-10-01", "2021-10-09")

Compiling tweets (max <=1000) from Texas counties from 2021-10-01 to 2021-10-09
START: 2021-11-03 17:43:45.123860
(1/254) Anderson - 1000 tweets: 2021-11-03 17:44:15.795531
(2/254) Andrews - 1000 tweets: 2021-11-03 17:44:46.879523
(3/254) Angelina - 1000 tweets: 2021-11-03 17:45:19.389715
(4/254) Aransas - 1000 tweets: 2021-11-03 17:45:48.753415
(5/254) Archer - 1000 tweets: 2021-11-03 17:46:20.702523
(6/254) Armstrong - 1000 tweets: 2021-11-03 17:46:51.709236
(7/254) Atascosa - 1000 tweets: 2021-11-03 17:47:20.125059
(8/254) Austin - 1000 tweets: 2021-11-03 17:47:47.993429
(9/254) Bailey - 1000 tweets: 2021-11-03 17:48:14.478952
(10/254) Bandera - 1000 tweets: 2021-11-03 17:48:45.317235
(11/254) Bastrop - 0 tweets: 2021-11-03 17:48:49.161300
(12/254) Baylor - 1000 tweets: 2021-11-03 17:49:18.432394
(13/254) Bee - 1000 tweets: 2021-11-03 17:49:48.306897
(14/254) Bell - 3 tweets: 2021-11-03 17:49:50.241391
(15/254) Bexar - 33 tweets: 2021-11-03 17:49:52.918358
(16/254) Blanco - 1000 twe

(140/254) Lamb - 1000 tweets: 2021-11-03 18:32:58.876716
(141/254) Lampasas - 1000 tweets: 2021-11-03 18:33:28.631924
(142/254) La Salle - 1000 tweets: 2021-11-03 18:33:58.244625
(143/254) Lavaca - 1000 tweets: 2021-11-03 18:34:29.236692
(144/254) Lee - 1000 tweets: 2021-11-03 18:35:00.453108
(145/254) Leon - 1000 tweets: 2021-11-03 18:35:29.762297
(146/254) Liberty - 0 tweets: 2021-11-03 18:35:34.621731
(147/254) Limestone - 0 tweets: 2021-11-03 18:35:35.925796
(148/254) Lipscomb - 1000 tweets: 2021-11-03 18:36:07.544953
(149/254) Live Oak - 1000 tweets: 2021-11-03 18:36:36.761767
(150/254) Llano - 1000 tweets: 2021-11-03 18:37:07.972222
(151/254) Loving - 0 tweets: 2021-11-03 18:37:09.207938
(152/254) Lubbock - 1000 tweets: 2021-11-03 18:37:36.735332
(153/254) Lynn - 1000 tweets: 2021-11-03 18:38:03.500519
(154/254) Madison - 1000 tweets: 2021-11-03 18:38:29.014927
(155/254) Marion - 1000 tweets: 2021-11-03 18:38:53.948952
(156/254) Martin - 0 tweets: 2021-11-03 18:38:55.475601
(157/

# References

California Counties: https://www.mapsofworld.com/usa/states/california/california-county-map.html

Florida Counties: https://www.mapsofworld.com/usa/states/florida/florida-county-map.html

New York Counties: https://www.mapsofworld.com/usa/states/new-york/new-york-county-map.html

Texas Counties: https://www.mapsofworld.com/usa/states/texas/texas-county-map.html

## Sample DataFrame Format and Machine Learning

In [5]:
import pandas as pd

In [6]:
marin_temp = pd.read_json("data/tweets/ca_tweets/Marin.json", lines = True)
marin = marin_temp.drop(columns={'url', 'date', 'content', 'id', 'user', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang',
       'source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks',
       'media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId',
       'inReplyToUser', 'mentionedUsers', 'coordinates', 'place',
       'cashtags'})
# apply filter method to find out tone of tweet, for brevity consider the temporary column 'tone'
marin['tone'] = [3, 4, 5, 2, 2, 4, 2, 1]

In [7]:
solano_temp = pd.read_json("data/tweets/ca_tweets/Solano.json", lines = True)
solano = solano_temp.drop(columns={'url', 'date', 'content', 'id', 'user', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang',
       'source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks',
       'media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId',
       'inReplyToUser', 'mentionedUsers', 'coordinates', 'place',
       'cashtags'})
# apply filter method to find out tone of tweet, for brevity consider the temporary column 'tone'
solano['tone'] = [3, 3]

In [8]:
ventura_temp = pd.read_json("data/tweets/ca_tweets/Ventura.json", lines = True)
ventura = ventura_temp.drop(columns={'url', 'date', 'content', 'id', 'user', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang',
       'source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks',
       'media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId',
       'inReplyToUser', 'mentionedUsers', 'coordinates', 'place',
       'cashtags'})
# apply filter method to find out tone of tweet, for brevity consider the temporary column 'tone'
ventura['tone'] = [4, 2, 1]

In [9]:
humboldt_temp = pd.read_json("data/tweets/ca_tweets/Humboldt.json", lines = True)
humboldt = humboldt_temp.drop(columns={'url', 'date', 'content', 'id', 'user', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang',
       'source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks',
       'media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId',
       'inReplyToUser', 'mentionedUsers', 'coordinates', 'place',
       'cashtags'})
# apply filter method to find out tone of tweet, for brevity consider the temporary column 'tone'
humboldt['tone'] = [2, 4, 1]

In [10]:
ca_df = pd.DataFrame([marin['tone'].mean(), solano['tone'].mean(), ventura['tone'].mean()], columns={"tone"})
ca_df['outbreak'] = [3, 2, 3]

In [11]:
from sklearn.linear_model import LogisticRegression
import numpy as np
X_train = np.array(ca_df['tone']).reshape(-1, 1)
y_train = ca_df['outbreak']
X_test = np.array(humboldt['tone'].mean()).reshape(1, -1)
model = LogisticRegression(penalty='none').fit(X_train, y_train)
pred = model.predict(X_test)
print(pred) # produces 3: an outbreak is predicted to occur in Humboldt

[3]


### More dataframe work OPTION 1

In [12]:
import os
import glob

In [13]:
drop_cols = {'url','date','content','id','user','replyCount','retweetCount','likeCount','quoteCount','conversationId', 'lang','source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks','media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId','inReplyToUser', 'mentionedUsers', 'coordinates', 'place','cashtags'}

In [14]:
def json_agg(path):
    json_files = glob.glob(os.path.join(path, "*.json"))
    return json_files

In [15]:
def process_data(paths):
    total = pd.DataFrame()
    for path in paths:
        temp = pd.read_json(path, lines=True)
        #new_temp = temp.drop(columns=drop_cols, axis=0)
        #print(type(temp))
        new_temp = temp.drop(columns={'url', 'date', 'content', 'id', 'user', 'replyCount','retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang','source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks','media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId','inReplyToUser', 'mentionedUsers', 'coordinates', 'place','cashtags'})
        #print(temp)
        total = total.append(temp)
    return total

In [16]:
path_source = os.getcwd()
ca_tweets = path_source + '/data/tweets/ca_tweets'
fl_tweets = path_source + '/data/tweets/fl_tweets'
ny_tweets = path_source + '/data/tweets/ny_tweets'
tx_tweets = path_source + '/data/tweets/tx_tweets'

In [17]:
# PROCESS CALIFORNIA
ca_paths = json_agg(ca_tweets)
ca_ = process_data(ca_paths)

KeyError: "['coordinates' 'id' 'content' 'sourceLabel' 'inReplyToTweetId'\n 'replyCount' 'sourceUrl' 'quoteCount' 'date' 'tcooutlinks' 'url' 'media'\n 'mentionedUsers' 'inReplyToUser' 'likeCount' 'conversationId'\n 'retweetCount' 'user' 'lang' 'place' 'outlinks' 'source' 'cashtags'\n 'retweetedTweet' 'quotedTweet'] not found in axis"

In [18]:
ca_.shape

NameError: name 'ca_' is not defined

In [19]:
for file in json_files:
    temppd.read_json(file, lines=True)

NameError: name 'json_files' is not defined

In [20]:
type(json_files)

NameError: name 'json_files' is not defined

In [21]:
path_source = os.getcwd()
ca_path = path_source + '/data/tweets/ca_tweets'

In [22]:
print(ca_tweets)

/Users/Justin/github/The-Processors/ics-438-final-project/data/tweets/ca_tweets


### Work based on asignment 1

In [23]:
# sample_record = json.load(open("data/record.json"))
# NOTE: CODE MISSING FINISH THIS
from collections import Counter

var_types = []
tax_ids = []
record = None
i = 0
for line in open("data_report.jsonl"):
    record = json.loads(line)
    i+=1
        ...sentiment analysis?
#Counter(sentiment_types)

IndentationError: unexpected indent (<ipython-input-23-893b9d2debbf>, line 12)

### More models and evaluation

In [26]:
from sklearn import svm

In [None]:
# fix model and evaluation metrics

In [25]:
from sklearn.cluster import KMeans

In [27]:
# fix model and evaluation metrics