# Cleaning Tweets

### Imports

In [46]:
import pandas as pd
import random
import numpy as np
import regex as re
import nltk
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings("ignore")

In [47]:
# Location of the raw tweets CSV, relative to the notebook's working directory.
file_path_tweets = '../datasets/raw_data/raw_tweets.csv'

### DataFrame Set Up

In [48]:
# Load the raw tweets (username, text, date) and display the frame.
df = pd.read_csv(file_path_tweets)
df

Unnamed: 0,username,text,date
0,Julie Wilcox WX,An emergency manager once explained it this wa...,2019-08-31 20:22:16
1,Hays Wood,@Xfinity get your shit together! First college...,2019-08-31 17:54:07
2,Ed Vallee | Empire Weather LLC,Have had many ask me what to do for prep. Here...,2019-08-31 15:50:58
3,Joselin Sundin,"@RogersHelps - GM, is any outage in the Duffer...",2019-08-31 14:28:14
4,Shelley Woodroof,Solar lights that people use outside to light ...,2019-08-31 10:45:32
...,...,...,...
11460,Niru,"With the power outage, and no WiFi I feel like...",2014-01-03 04:58:08
11461,Sean,.@SawneeEMC No one would answer the damn phone...,2014-01-03 03:55:06
11462,Heather Rollo,This is such a huge power outage.... and no st...,2014-01-02 21:16:41
11463,Junior,Thank you power outage for no work till who kn...,2014-01-01 19:00:06


In [49]:
# Reduce each tweet's posting timestamp to its calendar day.
# `.dt.date` yields plain `datetime.date` objects, so the column dtype is `object`.
df['date'] = pd.to_datetime(df['date']).dt.date

In [50]:
df.head()

Unnamed: 0,username,text,date
0,Julie Wilcox WX,An emergency manager once explained it this wa...,2019-08-31
1,Hays Wood,@Xfinity get your shit together! First college...,2019-08-31
2,Ed Vallee | Empire Weather LLC,Have had many ask me what to do for prep. Here...,2019-08-31
3,Joselin Sundin,"@RogersHelps - GM, is any outage in the Duffer...",2019-08-31
4,Shelley Woodroof,Solar lights that people use outside to light ...,2019-08-31


In [51]:
df.shape

(11465, 3)

In [52]:
df.dtypes

username    object
text        object
date        object
dtype: object

In [53]:
df.isnull().sum()

username     3
text         7
date        11
dtype: int64

In [54]:
# Discard every row that has a missing value in any column
# (tweet_cleaning below runs a regex over `text`, which needs real strings).
df = df.dropna(how='any')

In [55]:
df.shape

(11451, 3)

In [56]:
df.isnull().sum()

username    0
text        0
date        0
dtype: int64

### Removing Punctuation and Stop Words

In [57]:
# English stop-word list from NLTK. The 'stopwords' corpus must already be
# available locally (e.g. via nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')

In [58]:
def tweet_cleaning(raw_tweet, stop_words=None):
    """Normalise one tweet into lower-case, space-separated tokens.

    Every run of characters other than ASCII letters and digits is treated as
    a separator, the text is lower-cased, and tokens found in the stop-word
    collection are discarded.

    Parameters
    ----------
    raw_tweet : str
        Original tweet text.
    stop_words : iterable of str, optional
        Lower-case tokens to drop. When omitted, the notebook-level
        ``stopwords`` list is used, which keeps existing calls unchanged.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).

    Notes
    -----
    * Because punctuation becomes a separator, contractions split into
      pieces ("didn't" -> "didn", "t") and URLs break into their
      alphanumeric parts.
    * HTML entities and markup are not decoded; their alphanumeric parts
      survive as ordinary tokens (e.g. "&amp;" leaves "amp").
    """
    if stop_words is None:
        # Fall back to the module-level list defined in the previous cell.
        stop_words = stopwords

    # Set membership is O(1); skip the copy when the caller already passed a set.
    if isinstance(stop_words, (set, frozenset)):
        stops = stop_words
    else:
        stops = set(stop_words)

    # 1. Replace anything that is not a letter or digit with a space.
    letters_numbers_only = re.sub('[^A-Za-z0-9]+', " ", raw_tweet)

    # 2. Lower-case and tokenise on whitespace.
    words = letters_numbers_only.lower().split()

    # 3. Drop stop words and re-join.
    return " ".join(w for w in words if w not in stops)

In [59]:
# How many tweets are about to be processed
total_tweets = df.shape[0]
print(f'There are {total_tweets} tweets.')

# Cleaned text, one entry per row of df['text'], kept in row order
clean_tweets = []

print("Cleaning and parsing tweets...")

for j, text in enumerate(df['text']):
    cleaned = tweet_cleaning(text)
    clean_tweets.append(cleaned)

    # Progress message after every 1,000th tweet
    if not (j + 1) % 1_000:
        print(f'Tweet {j + 1} of {total_tweets}.')

print("COMPLETE")

There are 11451 tweets.
Cleaning and parsing tweets...
Tweet 1000 of 11451.
Tweet 2000 of 11451.
Tweet 3000 of 11451.
Tweet 4000 of 11451.
Tweet 5000 of 11451.
Tweet 6000 of 11451.
Tweet 7000 of 11451.
Tweet 8000 of 11451.
Tweet 9000 of 11451.
Tweet 10000 of 11451.
Tweet 11000 of 11451.
COMPLETE


In [60]:
# clean_tweets was filled by iterating df['text'] in order, so the list lines up
# with df's rows positionally.
df['clean_text'] = clean_tweets

In [61]:
df

Unnamed: 0,username,text,date,clean_text
0,Julie Wilcox WX,An emergency manager once explained it this wa...,2019-08-31,emergency manager explained way interview imag...
1,Hays Wood,@Xfinity get your shit together! First college...,2019-08-31,xfinity get shit together first college footba...
2,Ed Vallee | Empire Weather LLC,Have had many ask me what to do for prep. Here...,2019-08-31,many ask prep done orlando fl reference water ...
3,Joselin Sundin,"@RogersHelps - GM, is any outage in the Duffer...",2019-08-31,rogershelps gm outage dufferin queen area sinc...
4,Shelley Woodroof,Solar lights that people use outside to light ...,2019-08-31,solar lights people use outside light pathways...
...,...,...,...,...
11460,Niru,"With the power outage, and no WiFi I feel like...",2014-01-03,power outage wifi feel like didnt work school
11461,Sean,.@SawneeEMC No one would answer the damn phone...,2014-01-03,sawneeemc one would answer damn phone submit o...
11462,Heather Rollo,This is such a huge power outage.... and no st...,2014-01-02,huge power outage stoplights work
11463,Junior,Thank you power outage for no work till who kn...,2014-01-01,thank power outage work till knows pic twitter...


### Label Tweets by Outage Date

In [62]:
# Outage events table; its start/restoration dates are used below to label tweets.
targets = pd.read_csv('../datasets/outages_since_2014.csv')

In [63]:
targets.head()

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Number of Customers Affected,Alert Criteria
0,10/22/2014,10:46 PM,10/22/2014,10:47 PM,"New Hampshire, Maine, Massachusetts, Rhode Isl...",NPCC,Severe Weather,66650,
1,6/23/2015,6:30 PM,6/24/2015,5:00 AM,"Connecticut, Maine, Massachusetts, New Hampshi...",NPCC,Severe Weather,62442,"Loss of electric service to more than 50,000 c..."
2,8/4/2015,7:17 AM,8/5/2015,12:52 PM,Massachusetts: Rhode Island:,NPCC,Severe Weather,132000,"Loss of electric service to more than 50,000 c..."
3,7/22/2016,11:50 PM,7/23/2016,9:10 AM,Massachusetts: Connecticut: Rhode Island: New ...,NPCC,Severe Weather,57058,"Loss of electric service to more than 50,000 c..."
4,7/23/2016,7:30 PM,7/24/2016,7:30 AM,Connecticut: Massachusetts: New Hampshire: Ver...,NPCC,Severe Weather,101073,"Loss of electric service to more than 50,000 c..."


In [64]:
targets.dtypes

Date Event Began                object
Time Event Began                object
Date of Restoration             object
Time of Restoration             object
Area Affected                   object
NERC Region                     object
Event Type                      object
Number of Customers Affected    object
Alert Criteria                  object
dtype: object

In [65]:
# Parse the outage start date strings and keep them as plain `datetime.date` objects.
targets['Date Event Began'] = pd.to_datetime(targets['Date Event Began']).dt.date

In [66]:
# Parse the restoration date strings and keep them as plain `datetime.date` objects.
targets['Date of Restoration'] = pd.to_datetime(targets['Date of Restoration']).dt.date

In [67]:
targets

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Number of Customers Affected,Alert Criteria
0,2014-10-22,10:46 PM,2014-10-22,10:47 PM,"New Hampshire, Maine, Massachusetts, Rhode Isl...",NPCC,Severe Weather,66650,
1,2015-06-23,6:30 PM,2015-06-24,5:00 AM,"Connecticut, Maine, Massachusetts, New Hampshi...",NPCC,Severe Weather,62442,"Loss of electric service to more than 50,000 c..."
2,2015-08-04,7:17 AM,2015-08-05,12:52 PM,Massachusetts: Rhode Island:,NPCC,Severe Weather,132000,"Loss of electric service to more than 50,000 c..."
3,2016-07-22,11:50 PM,2016-07-23,9:10 AM,Massachusetts: Connecticut: Rhode Island: New ...,NPCC,Severe Weather,57058,"Loss of electric service to more than 50,000 c..."
4,2016-07-23,7:30 PM,2016-07-24,7:30 AM,Connecticut: Massachusetts: New Hampshire: Ver...,NPCC,Severe Weather,101073,"Loss of electric service to more than 50,000 c..."
5,2016-09-11,12:05 PM,2016-09-11,3:10 PM,Connecticut: Massachusetts: New Hampshire: Rho...,NPCC,Severe Weather,57960,"Loss of electric service to more than 50,000 c..."
6,2017-02-09,4:05 PM,2017-02-10,5:15 AM,Connecticut: Massachusetts: Rhode Island:,NPCC,Severe Weather,11525,"Loss of electric service to more than 50,000 c..."
7,2017-03-02,12:20 PM,2017-03-02,11:45 PM,Connecticut: Maine: Massachusetts: New Hampshi...,NPCC,Severe Weather,54316,"Loss of electric service to more than 50,000 c..."
8,2017-10-29,11:40 PM,2017-11-01,6:08 PM,Connecticut: Massachusetts: New Hampshire: Mai...,NPCC,Severe Weather,310453,"Loss of electric service to more than 50,000 c..."
9,2018-03-02,1:51 PM,2018-03-05,1:18 PM,Connecticut: Massachusetts: Rhode Island:,NPCC,Severe Weather,325000,"Loss of electric service to more than 50,000 c..."


In [68]:
# Expand every outage into the calendar days it covers (start and restoration
# days inclusive) and flatten them into a single list.
#
# The days are materialised as `datetime.date` objects because `df['date']`
# holds `datetime.date` values (it was built with `.dt.date`). Testing those
# against the `pd.Timestamp` elements that `pd.date_range` yields depends on
# cross-type equality, which pandas deprecated and no longer honours in 2.x,
# so matches can be silently missed. Comparing date with date avoids that.
blackouts = [
    day
    for began, restored in zip(targets['Date Event Began'], targets['Date of Restoration'])
    for day in pd.date_range(start=began, end=restored).date
]

# 1 if the tweet was posted on a day covered by an outage, otherwise 0.
df['target'] = df['date'].isin(blackouts).astype('int64')

In [69]:
# blackouts

In [70]:
df.target.sum()

30

In [71]:
df[df.target == 1]

Unnamed: 0,username,text,date,clean_text,target
4051,Brandon Kopp,@PepcoConnect If you want some low-hanging fru...,2018-03-03,pepcoconnect want low hanging fruit work overn...,1
4052,Richard Balducci,@MyBGE third major outage in our neighborhood ...,2018-03-03,mybge third major outage neighborhood lose foo...,1
4053,The Colonel 100,I think it’s safe to say if I got caught in a ...,2018-03-03,think safe say got caught power outage heat wa...,1
4054,Stephani Shelton,@JCP_L @ABC7NY @News12NJ this link does not w...,2018-03-03,jcp l abc7ny news12nj link work added lack inf...,1
4055,Kimmy Schweizer,Are you still updating outage maps? I see no c...,2018-03-03,still updating outage maps see changes please ...,1
4056,Julia,@verizon @VerizonSupport any updates on outag...,2018-03-03,verizon verizonsupport updates outages south s...,1
4057,Court Bauer,.@ConEdison has done a deplorable job. Check t...,2018-03-03,conedison done deplorable job check replies on...,1
4058,C. Peterson,Thanks for the hard work! It's much appreciate...,2018-03-03,thanks hard work much appreciated understand p...,1
4059,Colleen Krieger,@VerizonSupport is there a Fios outage in Wash...,2018-03-03,verizonsupport fios outage washington dc inter...,1
4060,erin,yesterday was senior skip day & I didn’t go to...,2018-03-03,yesterday senior skip day go school got text s...,1


### Class Imbalance Fix - Sampling

In [72]:
# Undersample the majority class: keep every row with target == 1 and pair it
# with a fixed-seed random sample of 1,000 rows with target == 0.
# (`fakenews` is simply the target == 0 subset.)
positive_rows = df.loc[df['target'] == 1]
fakenews = df.loc[df['target'] == 0]
fakenews_subset = fakenews.sample(n=1000, random_state=42)

final_df = pd.concat([positive_rows, fakenews_subset])
final_df.shape

(1030, 5)

In [73]:
final_df.shape[0]

1030

### Simulating Random Boston Coordinates

In [74]:
# Seed NumPy's global RNG so the np.random.choice draws in the following cells
# are repeatable when the notebook is run top to bottom.
np.random.seed(42)

In [75]:
# latitude
# Candidates are integers in millionths of a degree (42.229077 .. 42.397651);
# draw one per row of final_df, with replacement, then scale to degrees.
coor_lat = range(42229077, 42397652)
latitudes = np.random.choice(coor_lat, size=len(final_df)) / 1000000

In [76]:
# longitude
# Candidates are integers in millionths of a degree (-71.203220 .. -70.987134);
# draw one per row of final_df, with replacement, then scale to degrees.
coor_long = range(-71203220, -70987133)
longitude = np.random.choice(coor_long, size=len(final_df)) / 1000000

In [77]:
# Attach the simulated coordinates; the NumPy arrays are assigned positionally,
# one value per row of final_df.
final_df['lat']=latitudes
final_df['long']=longitude

In [78]:
final_df.head()

Unnamed: 0,username,text,date,clean_text,target,lat,long
4051,Brandon Kopp,@PepcoConnect If you want some low-hanging fru...,2018-03-03,pepcoconnect want low hanging fruit work overn...,1,42.351035,-71.102025
4052,Richard Balducci,@MyBGE third major outage in our neighborhood ...,2018-03-03,mybge third major outage neighborhood lose foo...,1,42.375944,-71.068392
4053,The Colonel 100,I think it’s safe to say if I got caught in a ...,2018-03-03,think safe say got caught power outage heat wa...,1,42.361009,-71.182611
4054,Stephani Shelton,@JCP_L @ABC7NY @News12NJ this link does not w...,2018-03-03,jcp l abc7ny news12nj link work added lack inf...,1,42.332771,-71.05567
4055,Kimmy Schweizer,Are you still updating outage maps? I see no c...,2018-03-03,still updating outage maps see changes please ...,1,42.348956,-71.032482


### Save DF

In [79]:
# Persist the labelled, undersampled tweets. to_csv writes the DataFrame index
# by default, so the original row labels become the first (unnamed) CSV column.
final_df.to_csv('../datasets/clean_tweets.csv')