# Cleaning Tweets

### Imports

In [1]:
import pandas as pd
import random
import numpy as np
import regex as re
import nltk
from bs4 import BeautifulSoup

import warnings


In [2]:
# Source CSV for the tweets that get labelled 'outage'
file_name_outages = '../datasets/raw_tweets.csv'
# Source CSV for the tweets that get labelled 'irrelevant'
file_name_random = '../datasets/irrelevent_tweets.csv'

### DataFrame Set Up

In [3]:
# Load each source file, discard the saved index column and tag every row
# with the class it came from.
df1 = (
    pd.read_csv(file_name_outages)
    .drop(columns='Unnamed: 0')
    .assign(label='outage')
)
df2 = (
    pd.read_csv(file_name_random)
    .drop(columns='Unnamed: 0')
    .assign(label='irrelevant')
)

In [4]:
# Stack the outage and irrelevant tweets row-wise into one frame
# (each frame's original row labels are kept).
df = pd.concat([df1, df2], axis=0)

In [5]:
# Parse the raw `date` strings and keep only the calendar-date part
df['dates'] = pd.to_datetime(df['date']).dt.date

In [6]:
# Convert the date objects back to a datetime64 column
df['dates'] = pd.to_datetime(df['dates'])

In [7]:
# Confirm the column types; `dates` should now be datetime64[ns]
df.dtypes

username            object
text                object
date                object
label               object
dates       datetime64[ns]
dtype: object

In [8]:
# The raw string column is redundant now that `dates` holds the parsed value
df = df.drop(columns='date')

In [9]:
# Preview the first rows of the combined, labelled frame
df.head()

Unnamed: 0,username,text,label,dates
0,JulieWilcoxWX,An emergency manager once explained it this wa...,outage,2019-08-31
1,hays_wood,@Xfinity get your shit together! First college...,outage,2019-08-31
2,EdValleeWx,Have had many ask me what to do for prep. Here...,outage,2019-08-31
3,teamworkf1,"@RogersHelps - GM, is any outage in the Duffer...",outage,2019-08-31
4,shellz_woo,Solar lights that people use outside to light ...,outage,2019-08-31


In [10]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

### Removing Stop Words and HTML

In [11]:
# English stop-word list from NLTK, used by tweet_cleaning below.
# NOTE(review): presumably needs the 'stopwords' corpus to be installed
# (e.g. via nltk.download('stopwords')) -- confirm for this environment.
stopwords = nltk.corpus.stopwords.words('english')

In [12]:
def tweet_cleaning(raw_tweet):
    """Reduce a raw tweet to a string of lowercase, non-stop-word tokens.

    The tweet is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space (so digits, punctuation and
    non-ASCII letters are all removed), the text is lowercased and split on
    whitespace, and English stop words are dropped.

    Parameters
    ----------
    raw_tweet : str
        Tweet text, possibly containing HTML markup.

    Returns
    -------
    str
        The surviving words joined by single spaces. This is an empty string
        when nothing survives, e.g. for a tweet with no ASCII letters.
    """
    # 1. Remove HTML. Naming the parser makes the result independent of which
    #    optional parsers happen to be installed and avoids BeautifulSoup's
    #    "no parser was explicitly specified" warning.
    tweet_text = BeautifulSoup(raw_tweet, "html.parser").get_text()

    # 2. Keep ASCII letters only; everything else (digits included) becomes a
    #    space. A former second substitution used the pattern
    #    "r'[\p{Latin}]'": the r'...' wrapper was inside the string, so the
    #    pattern required literal apostrophes, which this step has already
    #    removed. It could never match and has been dropped.
    letters_only = re.sub("[^a-zA-Z]", " ", tweet_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. A set gives constant-time membership tests for the filter below.
    stops = set(stopwords)

    # 5. Drop stop words.
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the remaining words back into one string.
    return " ".join(meaningful_words)

In [13]:
# How many tweets are left to clean
total_tweets = len(df)
print(f'There are {total_tweets} tweets.')

# Accumulators for the cleaned text
clean_train_tweets, clean_test_tweets = [], []

There are 61263 tweets.


In [14]:
# Clean every tweet in df['text'], reporting progress every 1,000 tweets.
print("Cleaning and parsing the tweets...")

# Start from an empty list inside this cell: re-running the cell must not
# append a second copy of every tweet, or the list would no longer match the
# number of rows in df.
clean_train_tweets = []

# Silence warnings (presumably BeautifulSoup's) only while the loop runs.
# catch_warnings restores the previous filter state on exit instead of
# overwriting it with a blanket 'default' filter.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for j, train_tweet in enumerate(df['text'], start=1):
        clean_train_tweets.append(tweet_cleaning(train_tweet))

        # Message to keep track
        if j % 1000 == 0:
            print(f'Tweet {j} of {total_tweets}.')

# The old closing message announced a "testing set" pass that never happens.
print(f'Finished cleaning {len(clean_train_tweets)} tweets.')

Cleaning and parsing the training set movie reviews...
Tweet 1000 of 61263.
Tweet 2000 of 61263.
Tweet 3000 of 61263.
Tweet 4000 of 61263.
Tweet 5000 of 61263.
Tweet 6000 of 61263.
Tweet 7000 of 61263.
Tweet 8000 of 61263.
Tweet 9000 of 61263.
Tweet 10000 of 61263.
Tweet 11000 of 61263.
Tweet 12000 of 61263.
Tweet 13000 of 61263.
Tweet 14000 of 61263.
Tweet 15000 of 61263.
Tweet 16000 of 61263.
Tweet 17000 of 61263.
Tweet 18000 of 61263.
Tweet 19000 of 61263.
Tweet 20000 of 61263.
Tweet 21000 of 61263.
Tweet 22000 of 61263.
Tweet 23000 of 61263.
Tweet 24000 of 61263.
Tweet 25000 of 61263.
Tweet 26000 of 61263.
Tweet 27000 of 61263.
Tweet 28000 of 61263.
Tweet 29000 of 61263.
Tweet 30000 of 61263.
Tweet 31000 of 61263.
Tweet 32000 of 61263.
Tweet 33000 of 61263.
Tweet 34000 of 61263.
Tweet 35000 of 61263.
Tweet 36000 of 61263.
Tweet 37000 of 61263.
Tweet 38000 of 61263.
Tweet 39000 of 61263.
Tweet 40000 of 61263.
Tweet 41000 of 61263.
Tweet 42000 of 61263.
Tweet 43000 of 61263.
Tweet 44

In [16]:
# Attach the cleaned text as a new column (one entry per row, in row order)
df = df.assign(clean_text=clean_train_tweets)

In [17]:
# Inspect the last rows; tweets without any ASCII letters end up with an
# empty clean_text
df.tail()

Unnamed: 0,username,text,label,dates,clean_text
50013,kaedenoha_89,待ってどれwwww,irrelevant,2019-08-31,wwww
50014,hhacffcvhim,للآن يتصالحون واالشعوب مركونين بالفراغ المميت ...,irrelevant,2019-08-31,
50015,hito_yr,"Jimin in Paris, official video \n#JIMINpic.twi...",irrelevant,2019-08-31,jimin paris official video jiminpic twitter co...
50016,joelbirch,Your name is MŸKKECÅRLT now. You will require ...,irrelevant,2019-08-31,name kkec rlt require assembly
50017,takapzdra91,まひちゃん熱中症ならなくてよかった(๑•ᴗ•๑),irrelevant,2019-08-31,


In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): an empty clean_text is '' rather than NaN, so those rows are
# kept here -- confirm whether they were meant to be removed.
df = df.dropna(axis=0, how='any')

### Filter by Date

In [None]:
### PETER

In [None]:
# Outage records; the 'Date Event Began' and 'Date of Restoration' columns
# are used below to label tweets
targets = pd.read_csv('./Outages_since_2014.csv')

In [None]:
# Preview the first outage records
targets.head()

In [None]:
# Build a flat list with one Timestamp per calendar day covered by any outage:
# every day from 'Date Event Began' through 'Date of Restoration', inclusive.
# normalize=True pins each day to midnight so the values compare equal to the
# midnight-valued df['dates'] even if an outage date carries a time of day.
blackouts = [
    day
    for began, restored in zip(targets['Date Event Began'],
                               targets['Date of Restoration'])
    for day in pd.date_range(start=began, end=restored, normalize=True)
]

# 1 when the tweet's date falls on a day with a recorded outage, otherwise 0.
# The mask is computed once; the number of matching tweets is
# df['target'].sum().
df['target'] = df['dates'].isin(blackouts).astype(int)

In [None]:
# Preview the frame with the new `target` column
df.head()

In [None]:
# Number of tweets dated on a day with a recorded outage (target == 1)
df.target.sum()

### Class Imbalance Fix - Down-Sampling Without Replacement

In [None]:
# Tweets whose date matches no recorded outage (target == 0)
fakenews = df[df['target'] == 0]

# Down-sample them to at most 1,000 rows, without replacement and with a
# fixed seed. Capping at the number of available rows avoids the ValueError
# that sample() raises when asked for more rows than exist.
fakenews_subset = fakenews.sample(n=min(1000, len(fakenews)), random_state=42)

# Keep every target == 1 tweet plus the down-sampled target == 0 tweets
final_df = pd.concat([df[df['target'] == 1], fakenews_subset], axis=0)
final_df.shape

In [None]:
# Row count of the final frame; this is the number of coordinates to simulate
final_df.shape[0]

In [None]:
# FIXME: `latitudes` is first assigned in the "Simulating Random Boston
# Coordinates" section below, so on a fresh Restart & Run All this cell raises
# NameError; it only works once that later cell has already been executed.
latitudes.shape

### Simulating Random Boston Coordinates

In [None]:
# Seed NumPy's global RNG so the np.random.choice draws below are reproducible
np.random.seed(42)

In [None]:
# Candidate latitudes as integers in millionths of a degree
coor_lat = range(42229077, 42397652)
# One random draw per row of final_df, scaled back to degrees
latitudes = np.random.choice(coor_lat, size=len(final_df)) / 1000000

In [None]:
# Candidate longitudes as integers in millionths of a degree
coor_long = range(-71893220, -70987133)
# One random draw per row of final_df, scaled back to degrees
longitude = np.random.choice(coor_long, size=len(final_df)) / 1000000

In [None]:
# Attach the simulated coordinates as two new columns
final_df = final_df.assign(lat=latitudes, long=longitude)

In [None]:
# Preview the final frame including the simulated lat/long columns
final_df.head()

### Save DF

In [None]:
# Write the final frame to disk. The row index is written as well (to_csv
# default), so it shows up as an 'Unnamed: 0' column when the file is re-read.
final_df.to_csv('./clean_tweets.csv')