# Clean tweets: Text pre-processing

## Frequency counts of tokens

In [1]:
from collections import Counter

# Toy corpus: colour names (plus one stray 'eyes') used to demonstrate counting.
words = [
    'red', 'green', 'black', 'pink', 'black', 'white', 'black', 'eyes',
    'white', 'black', 'orange', 'pink', 'pink', 'red', 'red', 'white', 'orange',
    'white', "black", 'pink', 'green', 'green', 'pink', 'green', 'pink',
    'white', 'orange', "orange", 'red',
]

# Tally the tokens, then keep the four most frequent as (token, count) pairs.
# Tokens with equal counts come out in the order they were first seen.
word_counts = Counter()
word_counts.update(words)
top_four = word_counts.most_common(4)
print(top_four)

[('pink', 6), ('black', 5), ('white', 5), ('red', 4)]


## Regex

In [2]:
import re

### Replace everything except ASCII letters, digits, underscores and periods with a space (whitespace characters are themselves normalised to a plain space).


In [3]:
# Whitelist filter: every character that is not an ASCII letter, a digit,
# an underscore or a period is swapped one-for-one for a plain space.
text = 'Colorless green ideas sleep furiously.　言語学。#chomsky '
disallowed_chars = re.compile('[^a-zA-Z0-9_.]')
text = disallowed_chars.sub(' ', text)
print(text)

Colorless green ideas sleep furiously.      chomsky 


### The run of consecutive spaces left behind looks odd and might confuse our tokenizer. Let's collapse consecutive whitespace into a single space.

In [4]:
# Step 1: swap every character that is not an ASCII letter, digit, underscore
# or period for a space.
text = 'Colorless green ideas sleep furiously. 言語学。#chomsky'
text = re.sub('[^a-zA-Z0-9_.]', ' ', text)
# Step 2: squeeze each run of whitespace down to one space.
# The pattern is a raw string because '\s' in a plain literal is an invalid
# escape sequence (a SyntaxWarning on recent Python versions); the capture
# group was unused, so it is dropped.
text = re.sub(r'\s+', ' ', text)
print(text)

Colorless green ideas sleep furiously. chomsky


### Replace usernames with generic: USERNAME

In [5]:
# Anonymise @-mentions: an '@' followed by one or more letters, digits or
# underscores is replaced by the literal placeholder USERNAME.
text = 'Colorless green ideas sleep furiously. @chomsky'
mention = re.compile('@([A-Za-z0-9_]+)')
text = mention.sub('USERNAME', text)
print(text)

Colorless green ideas sleep furiously. USERNAME


### Replace every URL with a space

In [6]:
# A URL is the scheme (http or https), '://', and then everything up to the
# next whitespace. The previous character whitelist stopped at '-', '?', '=',
# '&' and '%', leaving URL fragments behind, and it also matched ordinary
# words that merely start with "http" (e.g. "httpd").
URL_PATTERN = re.compile(r'https?://\S+')

text = 'Colorless green ideas sleep furiously. https://www.chomsky.com'
text = URL_PATTERN.sub(' ', text)
print(text)

Colorless green ideas sleep furiously.  


### Find all URLs, number them, and print them

In [7]:
text = '<p>Contents :</p><a href="https://w3resource.com">Python Examples</a><a href="http://github.com">Even More Examples</a>'

# Raw string: '\(' and '\)' are invalid escape sequences in a plain literal.
# The regex itself is unchanged.
# NOTE(review): inside '[$-_@.&+]' the '$-_' part is a character *range*
# (0x24-0x5F), so it also admits '/', ':', '<', '>' and more. The match ends
# here only because each URL is followed by a double quote.
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)

# Number the links starting from 1 for display.
for num, name in enumerate(urls, start=1):
    print("Link {}: {}".format(num, name))

Link 1: https://w3resource.com
Link 2: http://github.com


### Keep some emojis 🤣

In [8]:
# Keep ASCII letters, digits, commas, periods and two whitelisted emojis;
# every other character (including the third emoji) becomes a space.
text = 'Colorless green ideas sleep furiously. 🤣😃😄'
text = re.sub('[^A-Za-z0-9,.🤣😃]', ' ', text)
# Collapse whitespace runs. The pattern is a raw string because '\s' in a
# plain literal is an invalid escape sequence; the unused capture group is
# dropped.
text = re.sub(r'\s+', ' ', text)
print(text)

Colorless green ideas sleep furiously. 🤣😃 


### Remove punctuation (`: , ! ? ;`) while keeping periods

In [9]:
# Delete colons, commas, exclamation marks, question marks and semicolons.
# Periods are not in the character class, so they stay.
text = 'Colorless! green; ideas? sleep, furiously.'
punctuation = re.compile("[:,!?;]")
text = punctuation.sub("", text)
print(text)

Colorless green ideas sleep furiously.


### Replace a token with another token

In [10]:
# Substitute one token for another. The pattern has no word boundaries, so
# any occurrence of the substring is replaced.
text = 'Transparent green ideas sleep furiously.'
text = re.sub(pattern="Transparent", repl="Colorless", string=text)
print(text)

Colorless green ideas sleep furiously.


### Pandas

In [11]:
import pandas as pd

In [12]:
# Read the tweet dataset from chomsky.csv (decoded as UTF-8) into a DataFrame.
# NOTE(review): the "Unnamed: 0" column in the preview below looks like a
# saved index -- consider index_col=0; confirm against how the CSV was written.
text = pd.read_csv('chomsky.csv', encoding='utf-8')

In [13]:
# Preview the first rows of the loaded DataFrame (head() shows 5 by default).
text.head() 

Unnamed: 0,Tweet,Label
0,Colorless green ideas sleep furiously. 言語学。,0
1,Colorless green ideas sleep furiously. 言語学。,0
2,Colorless green ideas sleep furiously. 言語学。,0
3,Colorless green ideas sleep furiously. 言語学。,0
4,Colorless green ideas sleep furiously. 言語学。,0


In [14]:
# Number of rows in the DataFrame.
len(text)

5

In [15]:
# Pull the Tweet column out of the DataFrame as an array of strings.
# Note: this rebinds `text` from a DataFrame to an array, so the cell cannot
# be run a second time without reloading the CSV first.
text = text["Tweet"].values

In [16]:
# Will hold the cleaned, stop-word-filtered tweets (one string per tweet).
new_text = []

In [17]:
from nltk.corpus import stopwords

# Build the stop-word set once; rebuilding it for every single word is wasteful.
# NOTE(review): requires the NLTK stop-word corpus to be installed
# (nltk.download('stopwords')) -- confirm for a fresh environment.
english_stopwords = set(stopwords.words('english'))


def _clean_tweet(tweet):
    """Return `tweet` lowercased, stripped to ASCII letters, without stop words.

    Every character that is neither an ASCII letter nor whitespace is deleted,
    the remainder is lowercased and split on whitespace, English stop words
    are dropped, and the surviving words are re-joined with single spaces.
    """
    # Raw string avoids the invalid '\s' escape. The stray '+' that sat inside
    # the character class was a literal plus sign, not a quantifier, so plus
    # signs used to survive the clean-up; they are now removed as well.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', tweet)
    kept = [word for word in letters_only.lower().split()
            if word not in english_stopwords]
    return ' '.join(kept)


# Rebuild the list from scratch so re-running this cell never appends duplicates.
# (The sample tweets contain no English stop words, so nothing is filtered here.)
new_text = [_clean_tweet(tweet) for tweet in text]

print(new_text)

['colorless green ideas sleep furiously', 'colorless green ideas sleep furiously', 'colorless green ideas sleep furiously', 'colorless green ideas sleep furiously', 'colorless green ideas sleep furiously']


## Standardizing text
Map typos, slang, emoticons, spelling variants and abbreviations to a single standard form.

In [None]:
lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love", "..."}
def _lookup_words(input_text):
    words = input_text.split() 
    new_words = [] 
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word) new_text = " ".join(new_words) 
        return new_text

_lookup_words("RT this is a retweeted tweet by Shivam Bansal")




# Second replacement table. Commas between the entries were missing and a
# dangling '' made the literal unparsable; both are fixed here.
# NOTE(review): _lookup_words lowercases each token before looking it up in
# lookup_dict, so an uppercase key such as 'RT' would never match if this
# table were used the same way -- confirm the intended usage.
lookup = {
    'coooool': 'cool',
    'RT': 'retweet',
}