In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.3-py3-none-any.whl (1.5 MB)
Collecting regex
  Downloading regex-2021.10.8-cp38-cp38-win_amd64.whl (273 kB)
Collecting tqdm
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Collecting joblib
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Installing collected packages: regex, tqdm, joblib, nltk
Successfully installed joblib-1.1.0 nltk-3.6.3 regex-2021.10.8 tqdm-4.62.3


In [2]:
import numpy as np
import pandas as pd
import nltk
import string
import re

full_df = pd.read_csv("twitter.csv", nrows=5000)
df = full_df[["text"]].copy()
df["text"] = df["text"].astype(str)
full_df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


## Lower Casing

In [3]:
df["text_lower"] = df["text"].str.lower()
df.head()

Unnamed: 0,text,text_lower
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...


## Punctuations Removal

In [5]:
df.drop(["text_lower"], axis=1, inplace=True)

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_wo_punct"] = df["text"].apply(lambda text: remove_punctuation(text))
df.head()

KeyError: "['text_lower'] not found in axis"

## Stopwords Removal

In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["text_wo_stop"] = df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdka\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Unnamed: 0,text,text_wo_punct,text_wo_stop
0,@AppleSupport causing the reply to be disregar...,AppleSupport causing the reply to be disregard...,AppleSupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,105835 Your business means a lot to us Please ...,105835 Your business means lot us Please DM na...
2,@76328 I really hope you all change but I'm su...,76328 I really hope you all change but Im sure...,76328 I really hope change Im sure wont Becaus...
3,@105836 LiveChat is online at the moment - htt...,105836 LiveChat is online at the moment https...,105836 LiveChat online moment httpstcoSY94VtU8...
4,@VirginTrains see attached error message. I've...,VirginTrains see attached error message Ive tr...,VirginTrains see attached error message Ive tr...


## Frequent Words Removal

In [7]:
from collections import Counter
cnt = Counter()
for text in df["text_wo_stop"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('I', 34),
 ('us', 25),
 ('DM', 19),
 ('help', 17),
 ('httpstcoGDrqU22YpT', 12),
 ('AppleSupport', 11),
 ('Thanks', 11),
 ('phone', 9),
 ('Hi', 8),
 ('get', 8)]

In [8]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["text_wo_stopfreq"] = df["text_wo_stop"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,text,text_wo_punct,text_wo_stop,text_wo_stopfreq
0,@AppleSupport causing the reply to be disregar...,AppleSupport causing the reply to be disregard...,AppleSupport causing reply disregarded tapped ...,causing reply disregarded tapped notification ...
1,@105835 Your business means a lot to us. Pleas...,105835 Your business means a lot to us Please ...,105835 Your business means lot us Please DM na...,105835 Your business means lot Please name zip...
2,@76328 I really hope you all change but I'm su...,76328 I really hope you all change but Im sure...,76328 I really hope change Im sure wont Becaus...,76328 really hope change Im sure wont Because ...
3,@105836 LiveChat is online at the moment - htt...,105836 LiveChat is online at the moment https...,105836 LiveChat online moment httpstcoSY94VtU8...,105836 LiveChat online moment httpstcoSY94VtU8...
4,@VirginTrains see attached error message. I've...,VirginTrains see attached error message Ive tr...,VirginTrains see attached error message Ive tr...,VirginTrains see attached error message Ive tr...


## Rare Words Removal

In [9]:
df.drop(["text_wo_punct", "text_wo_stop"], axis=1, inplace=True)

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["text_wo_stopfreqrare"] = df["text_wo_stopfreq"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,text,text_wo_stopfreq,text_wo_stopfreqrare
0,@AppleSupport causing the reply to be disregar...,causing reply disregarded tapped notification ...,causing reply disregarded tapped notification ...
1,@105835 Your business means a lot to us. Pleas...,105835 Your business means lot Please name zip...,105835 Your business means lot Please name zip...
2,@76328 I really hope you all change but I'm su...,76328 really hope change Im sure wont Because ...,76328 really hope change Im sure wont Because ...
3,@105836 LiveChat is online at the moment - htt...,105836 LiveChat online moment httpstcoSY94VtU8...,105836 LiveChat online moment httpstcoSY94VtU8...
4,@VirginTrains see attached error message. I've...,VirginTrains see attached error message Ive tr...,VirginTrains see attached error message Ive tr...


## Emojis Removal

In [10]:
def remove_emoji(string):
    'src: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b'
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

remove_emoji("game is on 🔥🔥")

'game is on '

## Emoticons Removal

In [11]:
# src : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley"
}

In [12]:
def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)

remove_emoticons("Hello :-)")

'Hello '

## Conversion of Emoticons/Emojis to Words

In [13]:
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

text = "Hello :-) :-)"
convert_emoticons(text)

'Hello Happy_face_smiley Happy_face_smiley'

## URLs Removal

In [14]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [15]:
text = "Check the documentation at https://docs.python.org/3/"
remove_urls(text)

'Check the documentation at '