In [1]:
import pandas as pd
import re
from string import punctuation as pun
from chat_word_dict import abbreviations
from textblob import TextBlob
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
# Column layout of the header-less CSV, in file order.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    'data.csv',
    names=column_names,
    header=None,
    encoding='utf-8',
)

In [3]:
# Keep only the label and the tweet text; the cells that follow touch nothing but `text`.
data = data.drop(columns=['ids', 'date', 'flag', 'user'])

In [4]:
# Display the frame to confirm that only the `target` and `text` columns remain.
data

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


PreProcessing:

Lower-Casing:

In [5]:
# Lower-case every tweet through the vectorised string accessor.
lowercased_text = data['text'].str.lower()
data['text'] = lowercased_text

Removing Urls:

In [6]:
# Matches http:// or https:// links, and bare www. links, up to the next whitespace.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
# Vectorised replace: missing values are passed through untouched, whereas a per-row
# `pattern.sub` lambda raises TypeError on the first non-string entry.
data['text'] = data['text'].str.replace(url_pattern, '', regex=True)

Removing Mentions / Hashtags:

In [7]:
# Matches an '@' together with every non-whitespace character that follows it.
mentions_pattern = re.compile(r'@\S+')
# Vectorised replace: missing values are passed through untouched, whereas a per-row
# `pattern.sub` lambda raises TypeError on the first non-string entry.
data['text'] = data['text'].str.replace(mentions_pattern, '', regex=True)

In [8]:
# Matches a '#' together with every non-whitespace character that follows it.
hashs_pattern = re.compile(r'#\S+')
# Vectorised replace: missing values are passed through untouched, whereas a per-row
# `pattern.sub` lambda raises TypeError on the first non-string entry.
data['text'] = data['text'].str.replace(hashs_pattern, '', regex=True)

Removing Extra White Spaces:

In [9]:
# Collapse every run of whitespace into a single space, then trim both ends.
# The `.str` accessor passes missing values through instead of raising like `re.sub` would.
data['text'] = (
    data['text']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

Removing Punctuation:

In [10]:
# Take the apostrophe and the full stop out of the set of characters that will be stripped.
pun = ''.join(symbol for symbol in pun if symbol not in ("'", "."))
pun

'!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~'

In [11]:
# Deletion table: every character still listed in `pun` maps to None and is therefore removed.
# Author's note: it is probably better to drop this step.
translator = str.maketrans(dict.fromkeys(pun))
data['text'] = data['text'].map(lambda tweet: tweet.translate(translator))

Converting Abbreviations:

In [12]:
# Look-up keyed by the lower-cased abbreviation so that it agrees with the `.lower()` applied to
# each match. Indexing `abbreviations` directly with the lower-cased match raises KeyError for
# any key that is stored with upper-case letters.
abbreviation_lookup = {key.lower(): expansion for key, expansion in abbreviations.items()}

# Longest keys first, so a shorter key can never pre-empt a longer one that starts with it.
pattern = r'\b(' + '|'.join(
    re.escape(key) for key in sorted(abbreviation_lookup, key=len, reverse=True)
) + r')\b'

# An empty dictionary would produce r'\b()\b', which matches the empty string at every word
# boundary; skip the substitution entirely in that case.
if abbreviation_lookup:
    # Compiled once; IGNORECASE keeps the match independent of the case used in the dictionary.
    abbreviation_regex = re.compile(pattern, flags=re.IGNORECASE)
    data['text'] = data['text'].str.replace(
        abbreviation_regex,
        # Fall back to the matched text if case folding yields a key that is not in the look-up.
        lambda match: abbreviation_lookup.get(match.group(0).lower(), match.group(0)),
        regex=True,
    )

Spelling Correction:

In [31]:
# Spelling correction is currently disabled: uncommenting the line below runs TextBlob's
# `correct()` on every row of the text column.
#data['text'] = data['text'].apply(lambda x: str(TextBlob(x).correct()))

Removing Stopwords:

In [13]:
def _without_stopwords(text):
    """Return `text` rebuilt from its whitespace-separated tokens, minus those in STOPWORDS.

    Membership is tested on the lower-cased token; kept tokens are re-joined with single spaces.
    """
    kept = [token for token in text.split() if token.lower() not in STOPWORDS]
    return ' '.join(kept)

data['text'] = data['text'].apply(_without_stopwords)

Removing Emojis:

In [16]:
# Compiled once at definition time rather than on every call.
# The ranges are limited to emoji-bearing blocks. A single span from U+24C2 to U+1F251 would
# also cover CJK ideographs, kana, Hangul, fullwidth forms and more, deleting ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles/cards, enclosed alphanumerics (incl. flag letters), enclosed ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` with every run of characters from the emoji blocks deleted. Characters outside
        those blocks, including non-Latin scripts, are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip emoji from every tweet; the function is passed directly, no wrapper lambda needed.
data['text'] = data['text'].apply(remove_emoji)


In [19]:
# Write the cleaned frame to disk without the row index.
output_path = 'processed_data.csv'
data.to_csv(output_path, index=False)