In [1]:
import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from bs4 import BeautifulSoup
import re, string, unicodedata
import pandas as pd
import emoji
from langdetect import detect
import numpy as np

[nltk_data] Downloading package punkt to /home/adel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/adel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/adel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Make Sure Text is English 

In [3]:
def is_english(text):
    try:
        lang = detect(text)
        return lang == 'en'
    except:
        return False


# Removing Noise   

The first step comes with removing the noises in the data; here in the text domain, noise is referred to as something which not related to textual human language, and those come with various nature like special characters, use of parentheses, use of square brackets, white spaces, URL’s and punctuations.    

In [4]:
# to remove HTML tag
def html_remover(data):
  beauti = BeautifulSoup(data,'html.parser')
  # print('html_remover out type :', type(beauti.get_text()))
  return beauti.get_text()

# to remove URL
def url_remover(data):
  # print('html_remover out type :', type(re.sub(r'https\S','',data)))
  return re.sub(r'https\S','',data)

def web_associated(data):
  text = html_remover(data)
  text = url_remover(text)
  return text

# new_data = web_associated(data)


After removing the HTML tag and URL, there is still some noise in the form of punctuations and white spaces, and text data under the parenthesis; this need to be also treated;

In [5]:
def remove_round_brackets(data):
  # print('remove_round_brackets :', type(re.sub('\(.*?\)','',data)))
  return re.sub('\(.*?\)','',data)

def remove_punc(data):
  trans = str.maketrans('','', string.punctuation)
  # print('remove_punc:', type(data.translate(trans)))
  return data.translate(trans)

def white_space(data):
  # print('white_space:', type(' '.join(data.split())))
  return ' '.join(data.split())

def complete_noise(data):
  new_data = remove_round_brackets(data)
  new_data = remove_punc(new_data)
  new_data = white_space(new_data)
  return new_data

# new_data = complete_noise(new_data)

# Removing Emojis

In [6]:
def remove_emojis(text):
    return emoji.demojize(text, delimiters=(' ', ' '))


# Normalizing text  

Usually, text normalisation starts with tokenizing the text, which our longer corpus is now to be split into chunks of words, which the tokenizer class from NLTK can do. Post that, we need to lower case each word of our corpus, converting numbers to the words and last with contraction replacement. 

In [7]:
def text_lower(data):
  # print('text_lower:', type(data.lower()))
  return data.lower()

def contraction_replace(data):
  # print('contraction_replace:', type(contractions.fix(data)))
  return contractions.fix(data)

def number_to_text(data):
  temp_str = data.split()
  string = []
  for i in temp_str:
    # if the word is digit, converted to 
    # word else the sequence continues
    if i.isdigit():
      temp = inflect.engine().number_to_words(i)
      string.append(temp)
    else:
      string.append(i)
  # print('number_to_text:', type(temp_str))
  return temp_str

def normalization(data):
  text = text_lower(data)
  text = number_to_text(text)
  text = contraction_replace(" ".join(text))
  tokens = nltk.word_tokenize(text)
  # print('normalization:', type(tokens))
  return tokens

# tokens = normalization(new_data)
# print(tokens)

# Stop Words, Stemming or lemmatization

stopwords have no meaning at all; it is just used for decorative purposes. Therefore, further to reduce dimensionality, it is necessary to remove stopwords from the corpus.  

In the end, we have two choices to represent our corpus in the form of stemming or lemmatized words. Stemming usually tries to convert the word into its root format, and mostly it is being carried out by simply cutting words. Where lemmatization also does the task as stemming but in the proper way means it converts the word into roots format like ‘scenes’ will be converted to ‘scene’. One can choose between stemming and lemmatized words. 


In [8]:
def stopword(data):
  clean = []
  for i in data:
    if i not in stopwords.words('english'):
      clean.append(i)
  # print('stopword:', type(clean))
  return clean

def stemming(data):
  stemmer = LancasterStemmer()
  stemmed = []
  for i in data:
    stem = stemmer.stem(i)
    stemmed.append(stem)
  # print('stemming:', type(stemmed))
  return stemmed

def lemmatization(data):
  lemma = WordNetLemmatizer()
  lemmas = []
  for i in data:
    lem = lemma.lemmatize(i, pos='v')
    lemmas.append(lem)
  # print('lemmatization:', type(lemmas))
  return lemmas  

def final_process(data):
  stopwords_remove = stopword(data)
  stemmed = stemming(stopwords_remove)
  lemm = lemmatization(stopwords_remove)
  return stemmed, lemm
# stem,lemmas = final_process(tokens)

In [9]:
def pre_process_pipeline(df):
    # print(df)
    # print(type(df))
    if is_english(df):
        new_data = web_associated(df)
        new_data = complete_noise(new_data)
        new_data = remove_emojis(new_data)
        tokens = normalization(new_data)
        stem,lemmas = final_process(tokens)

        return lemmas
    else:
        return np.nan

In [10]:
# read the data frame
data = pd.read_csv("Africa_task1.1_final.csv")

In [None]:
data['tweet_'] = data['tweet'].apply(pre_process_pipeline)
data['most_popular_reply_'] = data['most_popular_reply'].apply(pre_process_pipeline)
data['second_most_popular_reply_'] = data['second_most_popular_reply'].apply(pre_process_pipeline)

In [15]:
data.columns

Index(['username', 'screen_name', 'tweet', 'reply_count', 'like_count',
       'quote_count', 'retweet_count', 'created_at_date', 'most_popular_reply',
       'most_popular_reply_likes', 'second_most_popular_reply',
       'second_most_popular_reply_likes', 'gender', 'country', 'tweet_',
       'most_popular_reply_', 'second_most_popular_reply_'],
      dtype='object')

In [18]:
data.drop(['tweet','most_popular_reply', 'second_most_popular_reply'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  h_data.drop(['tweet','most_popular_reply', 'second_most_popular_reply'], axis=1, inplace=True)


In [19]:
data.rename(columns={'tweet_':'tweet', 'most_popular_reply_':'most_popular_reply', 'second_most_popular_reply_':'second_most_popular_reply'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  h_data.rename(columns={'tweet_':'tweet', 'most_popular_reply_':'most_popular_reply', 'second_most_popular_reply_':'second_most_popular_reply'}, inplace=True)


In [20]:
data = data[['username', 'screen_name', 'tweet', 'reply_count', 'like_count',
       'quote_count', 'retweet_count', 'created_at_date', 'most_popular_reply',
       'most_popular_reply_likes', 'second_most_popular_reply',
       'second_most_popular_reply_likes', 'gender', 'country']]