In [1]:
from twitterscraper import query_tweets
import codecs, json
import pandas as pd
import numpy as np
import os
import subprocess
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
# Load the tweets from the CSV file in the working directory, then preview
# the first five rows to confirm the columns were read as expected.
tweets = pd.read_csv('./tweets.csv')
tweets.head()

Unnamed: 0.1,Unnamed: 0,fullname,html,id,likes,replies,retweets,text,timestamp,url,user,latitude,longitude
0,0,Arijan Ramku 🌹,"<p class=""TweetTextSize js-tweet-text tweet-te...",263430003797151746,0.0,0.0,1.0,Staten Island #Sandy pic.twitter.com/h0gFWKvp,2012-10-30T23:59:42,/ariramku/status/263430003797151746,ariramku,40.601962,-73.973578
1,1,мια,"<p class=""TweetTextSize js-tweet-text tweet-te...",263429531082293248,0.0,0.0,0.0,24 hours w/out power. #Sandy,2012-10-30T23:57:49,/miasolx/status/263429531082293248,miasolx,40.601962,-73.973578
2,2,Deanna,"<p class=""TweetTextSize js-tweet-text tweet-te...",263426428157173760,0.0,0.0,0.0,heading home from Sheepshead bay Cleaning stor...,2012-10-30T23:45:29,/Dbitetti/status/263426428157173760,Dbitetti,40.601962,-73.973578
3,3,William Bright,"<p class=""TweetTextSize js-tweet-text tweet-te...",263426306811756545,0.0,0.0,0.0,Nathan's Famous Take Home Food. But no power. ...,2012-10-30T23:45:00,/brightfactor/status/263426306811756545,brightfactor,40.601962,-73.973578
4,4,William Bright,"<p class=""TweetTextSize js-tweet-text tweet-te...",263425970713817088,0.0,0.0,0.0,The Cyclone. Coney Island is still without pow...,2012-10-30T23:43:40,/brightfactor/status/263425970713817088,brightfactor,40.601962,-73.973578


In [3]:
# (number of rows, number of columns) of the loaded tweets
tweets.shape

(10537, 13)

In [4]:
# Normalise the tweet text: coerce every entry to a string (so non-string
# values such as missing entries are stringified too), then lowercase it.
tweets['text'] = tweets['text'].map(lambda doc: str(doc).lower())

In [5]:
# Preview the first five entries of the text column to confirm they are now lowercase.
tweets['text'].head()

0        staten island #sandy pic.twitter.com/h0gfwkvp
1                         24 hours w/out power. #sandy
2    heading home from sheepshead bay cleaning stor...
3    nathan's famous take home food. but no power. ...
4    the cyclone. coney island is still without pow...
Name: text, dtype: object

In [6]:
# Instantiate a count vectorizer that extracts both single words and
# two-word phrases (ngram_range=(1, 2)); every other setting keeps its default.
cvec = CountVectorizer(ngram_range=(1,2))

In [7]:
# Learn the vocabulary from the tweet text and, in the same call, build the
# count matrix X from it (fit and transform in one step).
X = cvec.fit_transform(tweets['text'])

In [8]:
# converting the result to a dataframe with one column per n-gram

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides
# so the cell runs on both old and new versions
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# NOTE: X.toarray() materialises the full dense count matrix (one cell per
# tweet/n-gram pair), which can need several gigabytes of memory
cvec_df = pd.DataFrame(X.toarray(), columns=feature_names)

In [9]:
# Preview the first five rows of the n-gram count table to check the
# conversion to a dataframe worked.
cvec_df.head()

Unnamed: 0,00,00 am,00 and,00 curfew,00 et,00 guess,00 the,00 tomorrow,000,000 000,...,제대로된,제대로된 태풍,진입안하고,진입안하고 왼쪽으로,태풍,태풍 오지도않았는데,허드슨리버쪽,허드슨리버쪽 동네는,허리케인,허리케인 ㅠㅠ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# (number of tweets, number of distinct n-gram features)
cvec_df.shape

(10537, 96396)

In [11]:
# Instantiate a DBSCAN clustering model. No arguments are passed, so every
# hyperparameter keeps the library's default value.
db = DBSCAN()

In [12]:
# creating a list of words and two-word phrases that will indicate danger from a hurricane
# each entry must appear only once: the list is used to select feature columns,
# and a repeated entry would select (and so double-weight) the same column twice
# NOTE(review): with CountVectorizer's default tokenisation the apostrophe is
# presumably treated as a separator, so "can't move" likely never matches a
# feature -- confirm whether a different spelling is needed
buzz_words = [
    'help', 'please help', 'severe', 'damage', 'flood', 'unsafe',
    'flooding', 'wind', 'power outage', 'danger', 'serious',
    'destroyed', 'killed', 'need help', 'dead', 'emergency',
    'major', 'major damage', 'help us', 'help me', 'lost power',
    'no power', 'no food', 'strong wind', 'stuck', 'trapped',
    'not safe', 'hurt', 'severe damage', 'destruction', 'in trouble',
    'freezing', "can't move", 'bad', 'really bad', 'awful', 'very bad',
    'dangerous', 'very dangerous', 'storm surge', 'heavy rain', 'damaging',
    'drown', 'crushed', 'without power', 'devastating', 'underwater', 'under water',
    'overwhelming', 'tree down', 'downed tree', 'leveled', 'knocked out',
    'need shelter', 'fire', 'on fire', 'fucked', 'debris', 'catastrophe',
]

In [13]:
# keeping only the buzz words that appear at least once in the tweets, i.e. that
# exist as a column of the count dataframe, so that selecting them for modeling
# cannot raise a KeyError
# (a new list is built instead of calling .remove() inside a loop over the same
# list: removing while iterating skips the element that follows each removal,
# which would leave some missing words in the list)
known_features = set(cvec_df.columns)
buzz_words = [word for word in buzz_words if word in known_features]

In [14]:
# Fit the DBSCAN model using only the buzz-word count columns as features,
# so tweets are clustered on their counts of those terms rather than on the
# full vocabulary.
db.fit(cvec_df[buzz_words])

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=None, p=None)

In [15]:
# Store the cluster label DBSCAN assigned to each tweet in a new 'danger' column.
# NOTE(review): DBSCAN labels are cluster identifiers (scikit-learn uses -1 for
# noise points), not an ordered severity scale -- confirm that treating them
# as a danger intensity downstream is intended.
tweets['danger'] = db.labels_

In [16]:
# Compute the silhouette score of the DBSCAN labelling on the same buzz-word
# features the model was fitted on.
silhouette_score(cvec_df[buzz_words], db.labels_)

0.9844667057945236

Now that we have a way of measuring the amount of danger indicated by each tweet, we can average the danger values of all of the tweets in a given area to get a general intensity of danger for that area.

In [17]:
# Group the tweets by the (latitude, longitude) coordinate point they were pulled from
# and average the 'danger' cluster labels within each group. The result is a Series
# indexed by (latitude, longitude) holding one mean intensity per location.
map_results = tweets.groupby(['latitude', 'longitude'])['danger'].mean()

In [18]:
# Preview the mean danger value for the first five locations.
map_results.head()

latitude   longitude 
38.518698  -75.495134    3.833333
38.518824  -75.667357    0.000000
38.519077  -76.011804    2.166667
38.519204  -76.184028    0.000000
38.653446  -75.495134    1.285714
Name: danger, dtype: float64

In [19]:
# Turn the grouped Series into a flat dataframe: resetting the index moves
# latitude and longitude out of the index into ordinary columns next to 'danger'.
map_results_df = map_results.reset_index()
map_results_df.head()

Unnamed: 0,latitude,longitude,danger
0,38.518698,-75.495134,3.833333
1,38.518824,-75.667357,0.0
2,38.519077,-76.011804,2.166667
3,38.519204,-76.184028,0.0
4,38.653446,-75.495134,1.285714


Because we need a fixed number of categories for ranking danger in order to create a map from these results, we will convert the continuous danger intensities into a discrete set of danger rankings. For our purposes, we determined that 5 categories (0 through 4), from lower to higher damage, would be ideal.

In [20]:
# a function to convert the danger intensity value to a number from 0 to 4
def discrete_results(val, max_rank=4):
    """Convert a continuous danger intensity into a discrete danger ranking.

    Parameters
    ----------
    val : float
        Danger intensity to convert.
    max_rank : int, optional
        Highest ranking to return. Defaults to 4, giving the five
        categories 0, 1, 2, 3 and 4.

    Returns
    -------
    int
        ``val`` rounded to the nearest integer and limited to the range
        ``0..max_rank``.

    Raises
    ------
    ValueError
        If ``val`` is NaN (raised by the built-in ``round``).
    OverflowError
        If ``val`` is infinite (raised by the built-in ``round``).
    """
    # round the value to the nearest integer; note that the built-in round()
    # sends exact halves to the nearest even integer (e.g. 2.5 -> 2)
    rank = round(val)

    # group anything above the top category into it, and anything below zero
    # into the bottom category, so the result always lies in 0..max_rank
    # (a negative intensity would otherwise fall outside the documented range)
    return max(0, min(rank, max_rank))

In [21]:
# Replace each location's continuous danger intensity with its discrete
# ranking by passing every value through discrete_results.
map_results_df['danger'] = map_results_df['danger'].map(discrete_results)

In [22]:
# Preview the first ten locations to confirm 'danger' now holds discrete rankings.
map_results_df.head(10)

Unnamed: 0,latitude,longitude,danger
0,38.518698,-75.495134,4
1,38.518824,-75.667357,0
2,38.519077,-76.011804,2
3,38.519204,-76.184028,0
4,38.653446,-75.495134,1
5,38.653572,-75.667357,0
6,38.653699,-75.83958,2
7,38.653825,-76.011804,0
8,38.788194,-75.495134,3
9,38.78832,-75.667357,1


In [23]:
# Count how many locations fall into each danger ranking.
map_results_df['danger'].value_counts()

0    53
1    30
2    27
4    12
3     7
Name: danger, dtype: int64

In [24]:
# saving the results to a csv for mapping
# index=False leaves out the dataframe's positional row index, which would
# otherwise be written as an extra unnamed first column next to
# latitude, longitude and danger (and reappear as 'Unnamed: 0' when re-read)
map_results_df.to_csv('./danger-intensity-by-location.csv', index=False)