In [15]:
import numpy as np
import pandas as pd
import string
import re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

In [16]:
def remove_punch(text):
    """Return ``str(text)`` with all ASCII punctuation characters deleted.

    The argument is coerced with ``str()`` first, so non-string values
    (numbers, ``None``, ...) are stringified rather than rejected.  Only the
    characters listed in ``string.punctuation`` are removed; non-ASCII
    punctuation such as curly quotes is left untouched.
    """
    # A translation table that maps every punctuation character to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    return str(text).translate(strip_table)

def deEmojify(text):
    """Delete runs of emoji code points from ``text`` and return the result.

    Only the four Unicode ranges listed below are covered; characters outside
    them (for example U+2764 or the U+1F900 block) are left in place.
    ``text`` must be a ``str``.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class matching one or more consecutive emoji.
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

def lower_tokens(tokens):
    """Return a new list holding the lowercase form of every token.

    ``tokens`` is any iterable whose items provide ``.lower()``; the input
    itself is not modified.
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def remove_stopwords(tokens):
    """Drop English stopwords from ``tokens`` and join the rest with spaces.

    Returns a single string (not a list).  Membership is an exact,
    case-sensitive comparison against NLTK's English stopword list, which is
    loaded anew on every call.
    # NOTE(review): presumably callers pass already-lowercased tokens - confirm.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = (token for token in tokens if token not in english_stopwords)
    return " ".join(kept)

def remove_names(text):
    """Remove @-mentions (whitespace-delimited tokens starting with ``@``).

    Surrounding whitespace is left exactly as it was, so removing a mention
    may leave a leading or doubled space behind.

    The previous implementation called ``text.replace(word, "")`` for each
    mention, which deleted the mention as a *substring* everywhere: ``@al``
    would turn ``@alice`` into ``ice``, and a lone ``@`` stripped the ``@``
    out of e-mail addresses.  Matching is now anchored to the start of a
    token, so only the mentions themselves are removed.

    Args:
        text: the string to clean.
    Returns:
        ``text`` with every mention token deleted.
    """
    # (?<!\S) = start of string or preceded by whitespace, i.e. token start.
    return re.sub(r"(?<!\S)@\S*", "", text)

def remove_url(text):
    """Remove every token fragment that starts with ``http`` from ``text``.

    The match runs from ``http`` up to the next whitespace, so both
    ``http://`` and ``https://`` links are deleted.  Whitespace around a
    removed link is left in place.

    The previous version ran a second substitution on the *original* ``text``
    instead of on the first result, which threw away the ``http`` pass and
    left plain ``http://`` URLs in the output.

    Args:
        text: the string to clean.
    Returns:
        ``text`` with the URLs deleted.
    """
    # "http\S+" already covers "https...", so a single pass is sufficient.
    return re.sub(r"http\S+", "", text)


def decode_unicode(text):
    """Return ``text`` as ``str``, decoding it from UTF-8 when needed.

    Args:
        text: an object with a ``.decode`` method (``bytes``/``bytearray``)
            holding UTF-8 data, or an already-decoded ``str``.
    Returns:
        The decoded string; ``str`` input is returned unchanged.
    Raises:
        UnicodeDecodeError: if the bytes are not valid UTF-8.
    """
    # Python 3 ``str`` has no ``.decode``; calling it raised AttributeError.
    if isinstance(text, str):
        return text
    return text.decode('utf-8')


def text_to_wordlist(text,stem_words=True):
    """Normalise ``text`` with an ordered series of regex substitutions.

    Despite the name, the return value is a single string, not a list.

    Steps, in order:
      1. Replace characters outside an allowed set with a space, expand
         common English contractions and pad or strip selected symbols.
         Order matters (e.g. ``can't`` must run before the generic ``n't``).
      2. If ``stem_words`` is true, split on whitespace, stem every word with
         the English Snowball stemmer and re-join with single spaces.
      3. Collapse a run of one repeated character at the very end of the
         text into a single character.

    Behaviour worth knowing:
      * All patterns are case-sensitive; only lowercase contractions such as
        ``what's`` / ``i'm`` hit their dedicated rules.
      * ``+-=`` inside the first character class is a range, so ``:``, ``;``
        and ``<`` also survive the initial filter.
      * Step 3 is anchored to the end of the whole text, not to each word.

    Args:
        text: the string to clean.
        stem_words: whether to apply Snowball stemming (default ``True``).
    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied strictly top to bottom.
    substitutions = (
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"doesn't", "does not "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Optional stemming pass; this also trims leading/trailing whitespace.
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(token) for token in text.split())

    # Squeeze a trailing run of one repeated character down to one.
    return re.sub(r'(.)\1+$', r'\1', text)

In [17]:
# Download the NLTK 'punkt' data package (a no-op when it is already up-to-date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\8888\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Input selection: swap the commented line in to process the training file
# instead of the task-1 test file. Paths are relative to the working directory.
# df = pd.read_csv('en_Hasoc2021_train.csv')
df = pd.read_csv('en_Hasoc2021_test_task1.csv')

In [20]:
# First cleaning pass on the raw 'text' column: strip @-mentions, strip URLs,
# then normalise/stem. The result is stored in a new 'text_clean' column.
df['text_clean'] = (
    df['text']
    .apply(remove_names)
    .apply(remove_url)
    .apply(text_to_wordlist)
)

In [21]:
# Second cleaning pass: remove emoji, then ASCII punctuation.
# NOTE: text_to_wordlist has already replaced every non-ASCII character with a
# space, so deEmojify has nothing left to match at this point in the pipeline.
df['text_clean'] = df['text_clean'].apply(deEmojify).apply(remove_punch)

In [22]:
# NOTE: nltk is already imported in the first cell; this repeat is redundant but harmless.
import nltk
# Download the NLTK 'punkt_tab' data package (a no-op when it is already up-to-date).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\8888\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
# Tokenise every cleaned tweet, then lowercase each token list.
# Both results are lists of lists, in the same order as the rows of df.
tokens = list(map(word_tokenize, df['text_clean']))
lower = list(map(lower_tokens, tokens))

In [24]:
# Download the NLTK 'stopwords' corpus read by remove_stopwords (a no-op when already up-to-date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\8888\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Drop English stopwords from each tweet's lowercased tokens. Every entry of
# `removed` is a space-joined string, in the same order as `lower`.
# A distinct loop name is used so the notebook-level `tokens` list built in the
# tokenisation cell is not overwritten by the last tweet's tokens.
removed = [remove_stopwords(tweet_tokens) for tweet_tokens in lower]

In [26]:
# Assign positionally: `removed` is in row order. Wrapping it in pd.Series would
# align on index labels instead, silently producing NaN for any row whose index
# label is not in 0..len-1 (e.g. after filtering df). A plain list also raises
# if its length does not match the frame.
df['text_clean'] = removed

In [27]:
# Preview the first rows to compare the raw 'text' with the final 'text_clean'.
df.head()

Unnamed: 0,_id,text,text_clean
0,60c5d6bf5659ea5e55deffcb,Fewer people coming in for vaccinations. So sa...,fewer peopl come vaccin sad nurs covidvaccin vumc
1,60c5d6bf5659ea5e55df028c,@MattHancock This may all be true. But... What...,may true piss big dom
2,60c5d6bf5659ea5e55def377,@Layla_EFC I’ve unfollowed him the wanker,unfollow wanker
3,60c5d6bf5659ea5e55def4c7,You guys are losing it all over the world. The...,guy lose world jung label islamophobia
4,60c5d6bf5659ea5e55df01a6,"And thus death laughs... It is sad merriment, ...",thus death laugh sad merriment still covid ind...


In [28]:
# Output selection: mirrors the input choice above (commented line = the other output file name).
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column - confirm downstream readers expect it.
# df.to_csv('preprocess_data_analysis.csv')
df.to_csv('preprocess_test_data.csv')