In [20]:
import pandas as pd
import re
from string import punctuation as pun
from chat_word_dict import abbreviations
from textblob import TextBlob
from gensim.parsing.preprocessing import STOPWORDS

In [21]:
# Load the raw CSV. header=None means the first row is read as data, so the
# six column names are supplied explicitly through `names`.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    'data.csv',
    header=None,
    names=column_names,
    encoding='utf-8',
)

In [22]:
# Discard the ids/date/flag/user columns so only `target` and `text` remain.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
data = data.drop(columns=['ids', 'date', 'flag', 'user'], errors='ignore')

Preprocessing:

Lower-Casing:

In [23]:
# Normalise case: every entry of the `text` column is converted to lower case.
lowercased_text = data['text'].str.lower()
data['text'] = lowercased_text

Removing URLs:

In [24]:
# Matches http:// or https:// links and bare "www." links, each running up to
# the next whitespace character.
url_pattern = re.compile(r'https?://\S+|www\.\S+')


def _strip_urls(text):
    """Return `text` with every `url_pattern` match deleted."""
    return url_pattern.sub('', text)


data['text'] = data['text'].map(_strip_urls)

Removing Mentions / Hashtags:

In [25]:
# An '@' followed by any run of non-whitespace characters is treated as a
# mention and removed in full.
mentions_pattern = re.compile(r'@\S+')
without_mentions = data['text'].map(lambda entry: mentions_pattern.sub('', entry))
data['text'] = without_mentions

In [26]:
# The whole hashtag token is removed: the '#' and the non-whitespace
# characters that follow it.
hashs_pattern = re.compile(r'#\S+')
without_hashtags = data['text'].map(lambda entry: hashs_pattern.sub('', entry))
data['text'] = without_hashtags

Removing Extra White Spaces:

In [27]:
# Collapse each run of whitespace into a single space, then trim both ends.
whitespace_run = re.compile(r'\s+')
data['text'] = data['text'].map(lambda entry: whitespace_run.sub(' ', entry).strip())

Removing Punctuation:

In [28]:
# Punctuation characters to delete from the text, with the apostrophe left
# out so it is not stripped (contractions such as "can't" stay intact).
pun = ''.join(symbol for symbol in pun if symbol != "'")
pun

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
# Deletion table: every character listed in `pun` is mapped to None, so
# str.translate removes it outright (no replacement character is inserted).
translator = str.maketrans('', '', pun)                                     # Probably better to drop this step


def _strip_punctuation(text):
    """Return `text` with every character in `translator` removed."""
    return text.translate(translator)


data['text'] = data['text'].map(_strip_punctuation)

Converting Abbreviations:

In [30]:
# Expand chat abbreviations: every whole-word occurrence of a key of
# `abbreviations` is replaced by the corresponding value.
#
# The lookup table is keyed by the lower-cased abbreviation and the regex is
# compiled case-insensitively, so the pattern and the callback always agree.
# Previously keys were matched verbatim but looked up via .lower(), so a key
# containing capitals either never matched the lower-cased text or raised
# KeyError on a match.
abbreviation_lookup = {key.lower(): value for key, value in abbreviations.items()}

# `pattern` stays a plain string (as before); the compiled form is built once
# here instead of being resolved on every row.
pattern = r'\b(' + '|'.join(re.escape(key) for key in abbreviation_lookup) + r')\b'
abbreviation_regex = re.compile(pattern, flags=re.IGNORECASE)


def _expand_abbreviation(match):
    """Return the expansion for a matched abbreviation, or the match itself if unknown."""
    found = match.group(0)
    # .get with a fallback keeps the original text when case-insensitive
    # matching hits a spelling that does not lower-case to a known key.
    return abbreviation_lookup.get(found.lower(), found)


# With no abbreviations the alternation is empty and would match the empty
# string at every word boundary, so the substitution is skipped entirely.
if abbreviation_lookup:
    data['text'] = data['text'].apply(lambda x: abbreviation_regex.sub(_expand_abbreviation, x))

Spelling Correction:

In [31]:
# Spelling correction with TextBlob is disabled: the statement below is
# commented out, so this cell currently does nothing.
# NOTE(review): presumably skipped because correcting every row is slow —
# confirm, and either re-enable it or remove this cell.
#data['text'] = data['text'].apply(lambda x: str(TextBlob(x).correct()))

Removing Stopwords:

In [32]:
def _without_stopwords(text):
    """Drop every whitespace-separated word whose lower-cased form is in STOPWORDS.

    The surviving words are re-joined with single spaces, so any repeated
    whitespace in `text` is collapsed as a side effect.
    """
    kept_words = [word for word in text.split() if word.lower() not in STOPWORDS]
    return ' '.join(kept_words)


data['text'] = data['text'].map(_without_stopwords)

In [33]:
# Show the processed frame as this cell's output.
data

Unnamed: 0,target,text
0,0,awww that's bummer shoulda got david carr day d
1,0,upset can't update facebook texting result sch...
2,0,dived times ball managed save 50 rest bounds
3,0,body feels itchy like
4,0,it's behaving i'm mad can't
...,...,...
1599995,4,woke having school best feeling
1599996,4,thewdbcom cool hear old walt interviews ♫
1599997,4,ready mojo makeover ask details
1599998,4,happy 38th birthday boo alll tears eyes tupac ...


In [34]:
# Persist the processed frame. With to_csv's default arguments the row index
# is written as an extra, unnamed leading column.
# NOTE(review): pass index=False if downstream readers do not expect that
# column — confirm against whatever consumes this file.
output_path = 'processed_data.csv'
data.to_csv(output_path)