# Cleaning Tweets

### Imports

In [1]:
import pandas as pd
import random
import numpy as np
import regex as re
import nltk
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings("ignore")

### DataFrame Set Up

In [2]:
# Outage-related tweets: discard the 'Unnamed: 0' column (presumably a saved
# row index -- confirm against the CSV) and tag every row with its class.
df1 = (
    pd.read_csv('../datasets/raw_data/raw_tweets.csv')
    .drop(columns='Unnamed: 0')
    .assign(label='outage')
)

# Unrelated tweets, loaded and tagged the same way.
df2 = (
    pd.read_csv('../datasets/raw_data/irrelevent_tweets.csv')
    .drop(columns='Unnamed: 0')
    .assign(label='irrelevant')
)

In [3]:
# Stack both labelled frames row-wise. Each frame keeps its own row labels,
# so index values repeat across the two halves of `df`.
df = pd.concat([df1, df2])

In [4]:
# Parse the raw `date` strings and keep only the calendar day
# (the time-of-day component is discarded by `.dt.date`).
df['dates'] = pd.to_datetime(df['date']).dt.date

In [5]:
# Convert the day-only values back to datetime64 so the column can later be
# compared against pandas timestamps.
df['dates'] = pd.to_datetime(df['dates'])

In [6]:
# Inspect column dtypes to confirm `dates` is now datetime64
df.dtypes

username            object
text                object
date                object
label               object
dates       datetime64[ns]
dtype: object

In [7]:
# The raw string column is no longer needed once `dates` holds the parsed day.
df = df.drop(columns='date')

In [8]:
# Preview the first rows of the combined, labelled frame
df.head()

Unnamed: 0,username,text,label,dates
0,JulieWilcoxWX,An emergency manager once explained it this wa...,outage,2019-08-31
1,hays_wood,@Xfinity get your shit together! First college...,outage,2019-08-31
2,EdValleeWx,Have had many ask me what to do for prep. Here...,outage,2019-08-31
3,teamworkf1,"@RogersHelps - GM, is any outage in the Duffer...",outage,2019-08-31
4,shellz_woo,Solar lights that people use outside to light ...,outage,2019-08-31


In [9]:
# Discard every row that has a missing value in any column, so the cleaning
# function below only ever receives real text.
df = df.dropna(how='any')

### Removing Stop Words and HTML

In [10]:
# English stop-word list from NLTK, read as a module-level global by
# `tweet_cleaning`. NOTE(review): presumably requires the NLTK 'stopwords'
# corpus to have been downloaded beforehand -- confirm for fresh environments.
stopwords = nltk.corpus.stopwords.words('english')

In [11]:
def tweet_cleaning(raw_tweet):
    """Reduce a raw tweet to lower-case ASCII words with stop words removed.

    The tweet is stripped of HTML markup, every character that is not an
    ASCII letter (digits, punctuation, emoji, accented and non-Latin
    characters) is replaced by a space, the text is lower-cased and split on
    whitespace, and words found in the module-level ``stopwords`` list are
    dropped.

    Parameters
    ----------
    raw_tweet : str
        Raw tweet text, possibly containing HTML markup or entities.

    Returns
    -------
    str
        The surviving words joined by single spaces. This is an empty string
        when nothing survives (for example a tweet with no ASCII letters).
    """
    # 1. Remove HTML. The parser is named explicitly so the output does not
    #    depend on which optional parsers happen to be installed.
    tweet_text = BeautifulSoup(raw_tweet, "html.parser").get_text()

    # 2. Keep ASCII letters only; everything else (including digits) becomes
    #    a space so that adjacent words stay separated.
    letters_only = re.sub("[^a-zA-Z]", " ", tweet_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. A set gives constant-time membership checks for the filter below.
    stops = set(stopwords)

    # 5. Drop stop words.
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the remaining words back into one space-separated string.
    return " ".join(meaningful_words)

In [12]:
# Number of tweets that remain to be cleaned
total_tweets = len(df)
print(f'There are {total_tweets} tweets.')

# Accumulators for the cleaned text
clean_train_tweets, clean_test_tweets = [], []

There are 61263 tweets.


In [13]:
# Run the cleaning function over every tweet
print("Cleaning and parsing tweets...")

# Start from a fresh list so that re-running this cell cannot accumulate
# duplicates and break the length-matched column assignment that follows.
clean_train_tweets = []
for position, train_tweet in enumerate(df['text'], start=1):
    clean_train_tweets.append(tweet_cleaning(train_tweet))

    # Progress message every 1000 tweets
    if position % 1000 == 0:
        print(f'Tweet {position} of {total_tweets}.')

# No separate testing set is processed in this notebook, so report completion
# rather than announcing a second pass.
print("Finished cleaning and parsing tweets.")

Cleaning and parsing tweets...
Tweet 1000 of 61263.
Tweet 2000 of 61263.
Tweet 3000 of 61263.
Tweet 4000 of 61263.
Tweet 5000 of 61263.
Tweet 6000 of 61263.
Tweet 7000 of 61263.
Tweet 8000 of 61263.
Tweet 9000 of 61263.
Tweet 10000 of 61263.
Tweet 11000 of 61263.
Tweet 12000 of 61263.
Tweet 13000 of 61263.
Tweet 14000 of 61263.
Tweet 15000 of 61263.
Tweet 16000 of 61263.
Tweet 17000 of 61263.
Tweet 18000 of 61263.
Tweet 19000 of 61263.
Tweet 20000 of 61263.
Tweet 21000 of 61263.
Tweet 22000 of 61263.
Tweet 23000 of 61263.
Tweet 24000 of 61263.
Tweet 25000 of 61263.
Tweet 26000 of 61263.
Tweet 27000 of 61263.
Tweet 28000 of 61263.
Tweet 29000 of 61263.
Tweet 30000 of 61263.
Tweet 31000 of 61263.
Tweet 32000 of 61263.
Tweet 33000 of 61263.
Tweet 34000 of 61263.
Tweet 35000 of 61263.
Tweet 36000 of 61263.
Tweet 37000 of 61263.
Tweet 38000 of 61263.
Tweet 39000 of 61263.
Tweet 40000 of 61263.
Tweet 41000 of 61263.
Tweet 42000 of 61263.
Tweet 43000 of 61263.
Tweet 44000 of 61263.
Tweet 4500

In [14]:
# Attach the cleaned strings as a new column; the list is positionally aligned
# with the rows of `df` because it was built by iterating over df['text'].
df = df.assign(clean_text=clean_train_tweets)

In [15]:
# Inspect the last rows to see how the cleaning step handled them
df.tail()

Unnamed: 0,username,text,label,dates,clean_text
50013,kaedenoha_89,待ってどれwwww,irrelevant,2019-08-31,wwww
50014,hhacffcvhim,للآن يتصالحون واالشعوب مركونين بالفراغ المميت ...,irrelevant,2019-08-31,
50015,hito_yr,"Jimin in Paris, official video \n#JIMINpic.twi...",irrelevant,2019-08-31,jimin paris official video jiminpic twitter co...
50016,joelbirch,Your name is MŸKKECÅRLT now. You will require ...,irrelevant,2019-08-31,name kkec rlt require assembly
50017,takapzdra91,まひちゃん熱中症ならなくてよかった(๑•ᴗ•๑),irrelevant,2019-08-31,


In [16]:
# Discard rows with a missing value in any column.
# NOTE(review): a tweet whose cleaned text came out as an empty string is not
# a missing value, so such rows are kept by this call -- confirm whether they
# were meant to be removed here.
df = df.dropna(how='any')

### Filter by Date

In [17]:
# Outage event records; the 'Date Event Began' and 'Date of Restoration'
# columns are used below to decide which days count as blackout days.
targets=pd.read_csv('../datasets/DoE/Outages_since_2014.csv')

In [18]:
# Preview the first outage event records
targets.head()

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Number of Customers Affected,Alert Criteria
0,10/22/2014,10:46 PM,10/22/2014,10:47 PM,"New Hampshire, Maine, Massachusetts, Rhode Isl...",NPCC,Severe Weather,66650,
1,6/23/2015,6:30 PM,6/24/2015,5:00 AM,"Connecticut, Maine, Massachusetts, New Hampshi...",NPCC,Severe Weather,62442,"Loss of electric service to more than 50,000 c..."
2,8/4/2015,7:17 AM,8/5/2015,12:52 PM,Massachusetts: Rhode Island:,NPCC,Severe Weather,132000,"Loss of electric service to more than 50,000 c..."
3,7/22/2016,11:50 PM,7/23/2016,9:10 AM,Massachusetts: Connecticut: Rhode Island: New ...,NPCC,Severe Weather,57058,"Loss of electric service to more than 50,000 c..."
4,7/23/2016,7:30 PM,7/24/2016,7:30 AM,Connecticut: Massachusetts: New Hampshire: Ver...,NPCC,Severe Weather,101073,"Loss of electric service to more than 50,000 c..."


In [19]:
# One date range per reported outage, covering every calendar day from the
# day the event began through the day service was restored.
blackouts = [
    pd.date_range(start=targets.loc[i, 'Date Event Began'],
                  end=targets.loc[i, 'Date of Restoration'])
    for i in targets.index
]

# Flatten the nested ranges into a single 1-D list of timestamps
# https://www.geeksforgeeks.org/python-ways-to-flatten-a-2d-list/
blackouts = [day for outage_days in blackouts for day in outage_days]

# Flag each tweet with 1 if it was posted on a day covered by any outage and
# 0 otherwise. The membership test is evaluated once and cast explicitly to
# an integer dtype.
df['target'] = df['dates'].isin(blackouts).astype('int64')

In [20]:
# Preview the frame with the new `target` column
df.head()

Unnamed: 0,username,text,label,dates,clean_text,target
0,JulieWilcoxWX,An emergency manager once explained it this wa...,outage,2019-08-31,emergency manager explained way interview imag...,0
1,hays_wood,@Xfinity get your shit together! First college...,outage,2019-08-31,xfinity get shit together first college footba...,0
2,EdValleeWx,Have had many ask me what to do for prep. Here...,outage,2019-08-31,many ask prep done orlando fl reference water ...,0
3,teamworkf1,"@RogersHelps - GM, is any outage in the Duffer...",outage,2019-08-31,rogershelps gm outage dufferin queen area sinc...,0
4,shellz_woo,Solar lights that people use outside to light ...,outage,2019-08-31,solar lights people use outside light pathways...,0


In [21]:
# `target` holds 0/1 flags, so the sum is the number of tweets posted on a blackout day
df.target.sum()

491

### Class Imbalance Fix - Sampling

In [22]:
# Undersample the majority class (target == 0) to a fixed 1000 rows, with a
# fixed seed so the same rows are drawn on every run.
fakenews = df.loc[df['target'] == 0]
fakenews_subset = fakenews.sample(n=1000, random_state=42)

# Keep every positive row and append the sampled negatives beneath them.
final_df = pd.concat([df.loc[df['target'] == 1], fakenews_subset], axis=0)
final_df.shape

(1491, 6)

In [23]:
# Row count of the undersampled frame; the simulated location columns below are sized to match it
final_df.shape[0]

1491

### Simulating Random Boston Coordinates

In [24]:
# Seed NumPy's global RNG so the simulated coordinates and zip codes drawn
# with np.random.choice below are reproducible
np.random.seed(42)

In [25]:
# Latitude: draw one integer per row from the range (micro-degrees), then
# scale by 1e6 to obtain decimal degrees.
coor_lat = range(42229077, 42397652)
latitudes = np.random.choice(coor_lat, size=len(final_df)) / 1000000

In [26]:
# Longitude: draw one integer per row from the range (micro-degrees), then
# scale by 1e6 to obtain decimal degrees.
coor_long = range(-71203220, -70987133)
longitude = np.random.choice(coor_long, size=len(final_df)) / 1000000

In [27]:
# Zip code: one uniformly random pick per row from a fixed list. The codes
# are kept as strings so that the leading zero is preserved.
coor_zip = [
    '02108', '02109', '02110', '02111', '02113', '02114', '02115', '02116',
    '02118', '02119', '02120', '02121', '02122', '02124', '02125', '02126',
    '02127', '02128', '02129', '02130', '02131', '02132', '02134', '02135',
    '02136', '02151', '02152', '02163', '02199', '02203', '02210', '02215',
    '02467',
]
zipcodes = np.random.choice(coor_zip, size=len(final_df))

In [28]:
# Attach the simulated location columns; the arrays were generated with one
# entry per row of `final_df`, so they line up positionally.
final_df = final_df.assign(lat=latitudes, long=longitude, zip=zipcodes)

In [29]:
# Preview the final frame including the simulated lat / long / zip columns
final_df.head()

Unnamed: 0,username,text,label,dates,clean_text,target,lat,long,zip
326,Amithridya1001,No update or action on supply issue at near 2 ...,outage,2019-07-23,update action supply issue near still even cus...,1,42.351035,-71.157572,2132
327,ellensweeps,I live in what is considered the largest conce...,outage,2019-07-23,live considered largest concentrated outage ma...,1,42.375944,-71.077229,2121
328,CoxHelp,"No, there is not an outage and your modem is r...",outage,2019-07-23,outage modem receiving signal us reset modem f...,1,42.361009,-71.062632,2135
329,Steven_McKie,No big deal @Xfinity I didn’t have to work tod...,outage,2019-07-23,big deal xfinity work today anything outage ni...,1,42.332771,-71.088215,2119
330,Larapic,So we filled the Adpt with post-its today and ...,outage,2019-07-23,filled adpt post today second outage ended dis...,1,42.348956,-71.200824,2215


### Save DF

In [30]:
# Persist the final frame. The row index is written as the first column
# because `index` is left at its default.
# NOTE(review): `zip` holds strings with leading zeros; a reader that lets
# pandas infer dtypes will presumably get integers back -- confirm consumers
# read this column as text.
final_df.to_csv('../datasets/clean-tweets/clean_tweets.csv')