# Clean that twitter data!

Use of the "Table of Contents" nbextension is highly recommended.

### Basic imports, settings

In [114]:
import numpy as np
import pandas as pd
import re
import nltk
import os
import json
import sys

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth',100)

### Check variables in memory

In [2]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('np', 80), ('pd', 80)]

## Read data

In [3]:
dir_path = '../data/tweets_maga'
data = pd.DataFrame()

print('Loading tweets from', dir_path)
file_names = [file for file in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, file))]
for file_name in file_names:
    file_path = dir_path + '/' + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        tweets = pd.read_json(file)
        data = pd.concat([data, tweets], ignore_index=True)
        
print('Successfully loaded {} tweets from {} files.'.format(len(data), len(file_names)))

Loading tweets from ../data/tweets_maga
Successfully loaded 16000 tweets from 16 files.


In [16]:
original_text = data.text
original_text

0         i’m confused. is this a new “greatest” or the same as the old? all of your scandals are hard to...
1                                                                       i'm not yet tired of winning, sir!  
2          you are the true patriots!  if antifa physically assaults you in any way you can legally defen...
3        it is amazing to watch  throw his reputation down the toilet. my guess  has something on him, or...
4                                                                   whooooohooooo! i’m so excited for you!  
5         lied about wikileaks in \n\nthe grand-jury redactions in special counsel  report show that pres...
6                                                           never polled and don't know anyone that has. \n 
7        \n\n needs to get off their ass and put their foot down!\n\ndems/antifa (same thing) crossed the...
8                                                                                             beautiful!\n\n
9                 d

## Clean data

### Lower case, strip

In [5]:
# everything in lower case, strip
data['text'] = data.text.str.lower()
data['text'] = data.text.str.strip()

### Remove tags, usernames, hyperlinks

In [6]:
# remove every #tag 
data['text'] = data.text.str.replace(r'#\S*', '')

In [7]:
# remove every @username
data['text'] = data.text.str.replace(r'@\S*', '')

In [8]:
# remove links
data['text'] = data.text.str.replace(r'http(\S)*', '')

### Remove punctuation

In [99]:
# side note: f*ck those wierdly encoded characters, I'm gonna deal with them in another way... :@
data['text'] = data.text.str.replace(r'[\.,;:?!\(\)\n\t]','')

### Tokenize

In [100]:
from nltk.tokenize import TweetTokenizer

# strip_handles because I'm not sure what it does (maybe removes usernames?),
# reduce_len because I want 'waaaaaay' to be equal to 'waaay'
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [105]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tandemelephant/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [135]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

def token_cleaner(token_list):
    
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    regex = regex = re.compile(r'\w\w+')
    
    tokens = [x for x in token_list if x not in stop_words]
    return list(filter(regex.search, [stemmer.stem(x) for x in tokens]))

In [136]:
data.text.apply(tokenizer.tokenize).apply(token_cleaner)

0                                            [confus, new, greatest, old, scandal, hard, follow, pussygrabb]
1                                                                                      [yet, tire, win, sir]
2                                               [true, patriot, antifa, physic, assault, way, legal, defend]
3                     [amaz, watch, throw, reput, toilet, guess, someth, mayb, bumbl, old, fool, hard, tell]
4                                                                                         [whooohooo, excit]
5        [lie, wikileak, grand-juri, redact, special, counsel, report, show, presid, trump, lie, knowledg...
6                                                                                 [never, poll, know, anyon]
7                                           [need, get, ass, put, foot, downdem, antifa, thing, cross, line]
8                                                                                                   [beauti]
9                  

In [134]:
regex = re.compile(r'[^(rt)]')
list(filter(re.search, ))

'damn straight  my president   rt    yes we want 4 more years of president trump'