# Clean that twitter data!

Use of the "Table of Contents" nbextension is highly recommended.

### Basic imports, settings

In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import json
import sys

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth',100)

### Check variables in memory

In [2]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('np', 80), ('pd', 80)]

## Read data

In [3]:
dir_path = '../data/tweets_maga'
data = pd.DataFrame()

print('Loading tweets from', dir_path)
file_names = [file for file in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, file))]
for file_name in file_names:
    file_path = dir_path + '/' + file_name
    with open(file_path, 'r', encoding='utf-8') as file:
        tweets = pd.read_json(file)
        data = pd.concat([data, tweets], ignore_index=True)
        
print('Successfully loaded {} tweets from {} files.'.format(len(data), len(file_names)))

Loading tweets from ../data/tweets_maga
Successfully loaded 16000 tweets from 16 files.


In [16]:
original_text = data.text
original_text

0         i’m confused. is this a new “greatest” or the same as the old? all of your scandals are hard to...
1                                                                       i'm not yet tired of winning, sir!  
2          you are the true patriots!  if antifa physically assaults you in any way you can legally defen...
3        it is amazing to watch  throw his reputation down the toilet. my guess  has something on him, or...
4                                                                   whooooohooooo! i’m so excited for you!  
5         lied about wikileaks in \n\nthe grand-jury redactions in special counsel  report show that pres...
6                                                           never polled and don't know anyone that has. \n 
7        \n\n needs to get off their ass and put their foot down!\n\ndems/antifa (same thing) crossed the...
8                                                                                             beautiful!\n\n
9                 d

## Clean data

### Lower case, strip

In [5]:
# everything in lower case, strip
data['text'] = data.text.str.lower()
data['text'] = data.text.str.strip()

### Remove tags, usernames, hyperlinks

In [6]:
# remove every #tag 
data['text'] = data.text.str.replace(r'#\S*', '')

In [7]:
# remove every @username
data['text'] = data.text.str.replace(r'@\S*', '')

In [8]:
# remove links
data['text'] = data.text.str.replace(r'http(\S)*', '')

### Remove punctuation

In [71]:
data.text.str.encode('utf-8').str.replace(r'\\.* ', '')

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      NaN
13      NaN
14      NaN
15      NaN
16      NaN
17      NaN
18      NaN
19      NaN
20      NaN
21      NaN
22      NaN
23      NaN
24      NaN
25      NaN
26      NaN
27      NaN
28      NaN
29      NaN
         ..
15970   NaN
15971   NaN
15972   NaN
15973   NaN
15974   NaN
15975   NaN
15976   NaN
15977   NaN
15978   NaN
15979   NaN
15980   NaN
15981   NaN
15982   NaN
15983   NaN
15984   NaN
15985   NaN
15986   NaN
15987   NaN
15988   NaN
15989   NaN
15990   NaN
15991   NaN
15992   NaN
15993   NaN
15994   NaN
15995   NaN
15996   NaN
15997   NaN
15998   NaN
15999   NaN
Name: text, Length: 16000, dtype: float64

### Tokenize

In [65]:
data.text[15992]

' .\n\nin   is king of the \nand he gets his  &amp;  from people in   \n ---&gt;  (\n\n\n\n \n\n \n\n\n '

In [63]:
from nltk.tokenize import TweetTokenizer

# strip_handles because I'm not sure what it does (maybe removes usernames?),
# reduce_len because I want 'waaaaaay' to be equal to 'waaay'
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [18]:
data.text.str..apply(tokenizer.tokenize)

0        [i, ’, m, confused, ., is, this, a, new, “, greatest, ”, or, the, same, as, the, old, ?, all, of...
1                                                             [i'm, not, yet, tired, of, winning, ,, sir, !]
2        [you, are, the, true, patriots, !, if, antifa, physically, assaults, you, in, any, way, you, can...
3        [it, is, amazing, to, watch, throw, his, reputation, down, the, toilet, ., my, guess, has, somet...
4                                                          [whooohooo, !, i, ’, m, so, excited, for, you, !]
5        [lied, about, wikileaks, in, the, grand-jury, redactions, in, special, counsel, report, show, th...
6                                                    [never, polled, and, don't, know, anyone, that, has, .]
7        [needs, to, get, off, their, ass, and, put, their, foot, down, !, dems, /, antifa, (, same, thin...
8                                                                                             [beautiful, !]
9        [did, mean

In [19]:
tokenizer.tokenize('I\'m I`m "asd" ˝asd˝')

["I'm", 'I', '`', 'm', '"', 'asd', '"', '˝', 'asd', '˝']