In [18]:
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import string
from nltk import word_tokenize, pos_tag, ne_chunk, pos_tag_sents
from nltk.corpus import stopwords

from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import re
from nltk.util import ngrams

from nltk import RegexpParser
from nltk import Tree

nltk.download('stopwords')
stopwordList = stopwords.words('english')

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
df = pd.read_csv('pnlp_data_en.csv', delimiter=';')
df.dropna()
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', 50)

# rename columns for easier usability
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
del df['report_grouping']
del df['question_text']
# preview
df.head(5)

Unnamed: 0,comments
0,"we do what our customers need, we communicate ..."
1,Customs business development continues to grow...
2,"I think the team work hard, are committed to c..."
3,Overall working towards a customer centric env...
4,Customer centricity is a growing culture in th...


# Basic feature extraction from Dataset

In [20]:
# Total number of words per row
feature_df = df.copy()
feature_df['total_words'] = feature_df['comments'].apply(lambda x: len(str(x).split(" ")))
feature_df[['comments','total_words']].head(5)

Unnamed: 0,comments,total_words
0,"we do what our customers need, we communicate ...",9
1,Customs business development continues to grow...,28
2,"I think the team work hard, are committed to c...",19
3,Overall working towards a customer centric env...,17
4,Customer centricity is a growing culture in th...,15


In [21]:
# Total number of characters (including space) per row
feature_df['total_char'] = feature_df['comments'].str.len()
feature_df[['comments','total_char']].head(5)

Unnamed: 0,comments,total_char
0,"we do what our customers need, we communicate ...",60
1,Customs business development continues to grow...,161
2,"I think the team work hard, are committed to c...",107
3,Overall working towards a customer centric env...,117
4,Customer centricity is a growing culture in th...,100


In [22]:
# Total number of stopwords per row
feature_df['stopwords'] = feature_df['comments'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))
feature_df[['comments','stopwords']].head(5)

Unnamed: 0,comments,stopwords
0,"we do what our customers need, we communicate ...",5
1,Customs business development continues to grow...,13
2,"I think the team work hard, are committed to c...",8
3,Overall working towards a customer centric env...,5
4,Customer centricity is a growing culture in th...,6


In [23]:
# Total number of punctuation or special characters per row
feature_df['total_punc'] = feature_df['comments'].apply(lambda x: len([x for x in x.split() if x in string.punctuation]))
feature_df[['comments','total_punc']].head(5)

Unnamed: 0,comments,total_punc
0,"we do what our customers need, we communicate ...",0
1,Customs business development continues to grow...,1
2,"I think the team work hard, are committed to c...",0
3,Overall working towards a customer centric env...,0
4,Customer centricity is a growing culture in th...,0


In [24]:
# Total number of numerics per row
feature_df['total_num'] = feature_df['comments'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
feature_df[['comments','total_num']].head(5)

Unnamed: 0,comments,total_num
0,"we do what our customers need, we communicate ...",0
1,Customs business development continues to grow...,0
2,"I think the team work hard, are committed to c...",0
3,Overall working towards a customer centric env...,0
4,Customer centricity is a growing culture in th...,0


In [25]:
# Total number of Uppercase word per row
feature_df['total_uppercase'] = feature_df['comments'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
feature_df[['comments','total_uppercase']].head(5)

Unnamed: 0,comments,total_uppercase
0,"we do what our customers need, we communicate ...",0
1,Customs business development continues to grow...,0
2,"I think the team work hard, are committed to c...",1
3,Overall working towards a customer centric env...,0
4,Customer centricity is a growing culture in th...,0


In [0]:
# Apostrophe Dictionary and if you have more words in mind, please add it in the bottom
apostrophe = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll":" will",
    "didn't": "did not",
    "'s": "is",
    "'re": "are"
}

#Short words dictionary and if have more words in mind, please add it in the bottom
short_words = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

def apos_short_dict(text, dictionary):
    for word in text.split():
        if word.lower() in dictionary:
            if word.lower() in text.split():
                text = text.replace(word, dictionary[word.lower()])
    return text

# **Basic Pre-processing**

In [27]:
def remove_html(txt):
    '''Remove HTML'''
    txt = BeautifulSoup(txt, 'lxml')
    return txt.get_text()

def remove_punctuation(surveyText):
    '''Remove any punctuation'''
    return "".join([i for i in surveyText if i not in string.punctuation])

def remove_stopwords(surveyText):
    '''Remove stop words'''
    return [w for w in surveyText if w not in stopwordList]

def word_lemmatizer(surveyText):
    '''Lemmatize words'''
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(surveyText)]

def word_stemmer(surveyText):
    '''Stem word'''
    stemmer = PorterStemmer()
    return [stemmer.stem(i) for i in surveyText]

# Function to generate n-grams from sentences.
def extract_ngrams(txt, num=3):
    n_grams = ngrams(nltk.word_tokenize(txt), num)
    return [ ' '.join(grams) for grams in n_grams]
    
'''
def pos_tags(txt):
  
  tokenized = sent_tokenize(txt) 
  for i in tokenized: 
      
    # Word tokenizers is used to find the words  
    # and punctuation in a string 
    wordsList = nltk.word_tokenize(i) 
  
    # removing stop words from wordList 
    wordsList = [w for w in wordsList if not w in stop_words]  
  
    #  Using a Tagger. Which is part-of-speech  
    # tagger or POS-tagger.  
    tagged = nltk.pos_tag(wordsList) 
    return [ ' '.join(tags) for tags in tagged]
    #print(tagged)

'''

'''
    Control the parameter by putting value of TRUE or FALSE according to requirements. 
    Args : txt - Provided text for preprocessing
            punctuation - Remove all punctuation, Initially value = False
            tokenize - Splitting long text into smaller lines
            stopwords - Remove such words which does not have much meaning to a line of text
            correct_apos - Remove apostrophe
            shortwords - Convert any short word to full meaningfull word
            specialCharacter - Replace all specialCharacter
            numbers - Remove numbers
            singleChar - Removing words whom length is one
            lematization - Lematize text
            stemming - Stemming any text
'''
def preprocessing(txt, punctuation= False, tokenize= False, stopwords= False, correct_apos= False, 
                  shortWords= False, specialCharacter= False, numbers= False, singleChar= False,
                 lematization= False, stemming= False, ngrams=False):
    
    cleanedTxt = txt.apply(lambda x: remove_html(x))
    
    if punctuation:
        cleanedTxt = cleanedTxt.apply(lambda x:remove_punctuation(x))
    
    #if spellCheck:
        #cleanedTxt = cleanedTxt.apply(lambda x: spell_correction(x))
        
    if tokenize:
        cleanedTxt = cleanedTxt.apply(lambda x:word_tokenize(x.lower()))
        
    if stopwords:
        cleanedTxt = cleanedTxt.apply(lambda x: remove_stopwords(x))
        
    if correct_apos:
        cleanedTxt = cleanedTxt.apply(lambda x: apos_short_dict(str(x),apostrophe))
        
    if shortWords:
        cleanedTxt = cleanedTxt.apply(lambda x: apos_short_dict(str(x),short_words))
    
    if specialCharacter:
        '''Replacing Special Characters with space'''
        cleanedTxt = cleanedTxt.apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',str(x)))
    
    if numbers:
        '''Replacing Numbers with space'''
        cleanedTxt = cleanedTxt.apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x))
        
    if singleChar:
        '''Removing words whom length is one'''
        cleanedTxt = cleanedTxt.apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
    
    if lematization:
        cleanedTxt = cleanedTxt.apply(lambda x: word_lemmatizer(x))
        
    if stemming:
        cleanedTxt = cleanedTxt.apply(lambda x: word_stemmer(x))

    #if ngrams:
        #cleanedTxt = cleanedTxt.apply(lambda x: extract_ngrams(x))
    
    return cleanedTxt
      
df['comments'] = preprocessing(df['comments'], punctuation= True, tokenize= True, stopwords= True, 
                                correct_apos= True, shortWords= True, specialCharacter= True, 
                               numbers= True, singleChar= True, lematization= False, stemming= False, ngrams= True)
df.head(5)

Unnamed: 0,comments
0,customers need communicate aperiodically
1,customs business development continues grow ex...
2,think team work hard committed continuous impr...
3,overall working towards customer centric envir...
4,customer centricity growing culture company cr...


In [28]:
# Sentence tokenization
df['tokenized_sents'] = df['comments'].apply(lambda x: nltk.sent_tokenize(x))
df.head(5)

Unnamed: 0,comments,tokenized_sents
0,customers need communicate aperiodically,[customers need communicate aperiodically]
1,customs business development continues grow ex...,[customs business development continues grow e...
2,think team work hard committed continuous impr...,[think team work hard committed continuous imp...
3,overall working towards customer centric envir...,[overall working towards customer centric envi...
4,customer centricity growing culture company cr...,[customer centricity growing culture company c...


In [29]:
# Word Tokenization
df['tokenized_words'] = df['comments'].apply(lambda x: nltk.word_tokenize(x))
df.head(5)

Unnamed: 0,comments,tokenized_sents,tokenized_words
0,customers need communicate aperiodically,[customers need communicate aperiodically],"[customers, need, communicate, aperiodically]"
1,customs business development continues grow ex...,[customs business development continues grow e...,"[customs, business, development, continues, gr..."
2,think team work hard committed continuous impr...,[think team work hard committed continuous imp...,"[think, team, work, hard, committed, continuou..."
3,overall working towards customer centric envir...,[overall working towards customer centric envi...,"[overall, working, towards, customer, centric,..."
4,customer centricity growing culture company cr...,[customer centricity growing culture company c...,"[customer, centricity, growing, culture, compa..."


In [30]:
# POS tagging
df['POS_Tags'] = pos_tag_sents(df['comments'].apply(word_tokenize).tolist())
df.head(5)

Unnamed: 0,comments,tokenized_sents,tokenized_words,POS_Tags
0,customers need communicate aperiodically,[customers need communicate aperiodically],"[customers, need, communicate, aperiodically]","[(customers, NNS), (need, VBP), (communicate, ..."
1,customs business development continues grow ex...,[customs business development continues grow e...,"[customs, business, development, continues, gr...","[(customs, NNS), (business, NN), (development,..."
2,think team work hard committed continuous impr...,[think team work hard committed continuous imp...,"[think, team, work, hard, committed, continuou...","[(think, NN), (team, NN), (work, NN), (hard, R..."
3,overall working towards customer centric envir...,[overall working towards customer centric envi...,"[overall, working, towards, customer, centric,...","[(overall, JJ), (working, NN), (towards, NNS),..."
4,customer centricity growing culture company cr...,[customer centricity growing culture company c...,"[customer, centricity, growing, culture, compa...","[(customer, NN), (centricity, NN), (growing, V..."


In [32]:
# Term frequency
# TF = (Number of times term T appears in the particular row) / (number of terms in that row)
df1 = (df['comments']).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
df1.columns = ['words','tf']
df1.head(5)

Unnamed: 0,words,tf
0,customers,918.0
1,communicate,116.0
2,aperiodically,1.0
3,need,2373.0
4,business,650.0


In [34]:
# Inverse Document Frequency
# IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present
for i,word in enumerate(df1['words']):
  df1.loc[i, 'idf'] = np.log(df1.shape[0]/(len(df1[df1['words'].str.contains(word)])))

df1.head(5)

Unnamed: 0,words,tf,idf
0,customers,918.0,6.506905
1,communicate,116.0,7.710877
2,aperiodically,1.0,9.502637
3,need,2373.0,6.170432
4,business,650.0,7.200052


In [35]:
# Term Frequency – Inverse Document Frequency (TF-IDF)
# TF-IDF = TF * IDF

df1['tf_idf'] = df1['tf'] * df1['idf']
df1.head(5)

Unnamed: 0,words,tf,idf,tf_idf
0,customers,918.0,6.506905,5973.338339
1,communicate,116.0,7.710877,894.461768
2,aperiodically,1.0,9.502637,9.502637
3,need,2373.0,6.170432,14642.435781
4,business,650.0,7.200052,4680.033598


In [36]:
# Chunking

def get_continuous_chunks(text, chunk_func=ne_chunk):
    #print(text)
    #exit()
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

df['comments'].apply(lambda sent: get_continuous_chunks((sent)))
df.head(5)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Unnamed: 0,comments,tokenized_sents,tokenized_words,POS_Tags
0,customers need communicate aperiodically,[customers need communicate aperiodically],"[customers, need, communicate, aperiodically]","[(customers, NNS), (need, VBP), (communicate, ..."
1,customs business development continues grow ex...,[customs business development continues grow e...,"[customs, business, development, continues, gr...","[(customs, NNS), (business, NN), (development,..."
2,think team work hard committed continuous impr...,[think team work hard committed continuous imp...,"[think, team, work, hard, committed, continuou...","[(think, NN), (team, NN), (work, NN), (hard, R..."
3,overall working towards customer centric envir...,[overall working towards customer centric envi...,"[overall, working, towards, customer, centric,...","[(overall, JJ), (working, NN), (towards, NNS),..."
4,customer centricity growing culture company cr...,[customer centricity growing culture company c...,"[customer, centricity, growing, culture, compa...","[(customer, NN), (centricity, NN), (growing, V..."


In [37]:
# de-tokenization
detokenizer = []
for i in range(len(df)):
    t = ' '.join(df['comments'][i]) # tokenized text
    detokenizer.append(t)

df['comments'] = detokenizer
df.head(5)

Unnamed: 0,comments,tokenized_sents,tokenized_words,POS_Tags
0,c u s t o m e r s n e e d c o m m u n i c ...,[customers need communicate aperiodically],"[customers, need, communicate, aperiodically]","[(customers, NNS), (need, VBP), (communicate, ..."
1,c u s t o m s b u s i n e s s d e v e l o ...,[customs business development continues grow e...,"[customs, business, development, continues, gr...","[(customs, NNS), (business, NN), (development,..."
2,t h i n k t e a m w o r k h a r d c o ...,[think team work hard committed continuous imp...,"[think, team, work, hard, committed, continuou...","[(think, NN), (team, NN), (work, NN), (hard, R..."
3,o v e r a l l w o r k i n g t o w a r d s ...,[overall working towards customer centric envi...,"[overall, working, towards, customer, centric,...","[(overall, JJ), (working, NN), (towards, NNS),..."
4,c u s t o m e r c e n t r i c i t y g r o ...,[customer centricity growing culture company c...,"[customer, centricity, growing, culture, compa...","[(customer, NN), (centricity, NN), (growing, V..."


In [38]:
# Coreference Resolution
# Need to download and extract
  # !wget http://nlp.stanford.edu/software/stanford-corenlp-latest.zip
  # !unzip stanford-corenlp-latest.zip
# After extraction goto the command promt and navigate to the extracted folder and run the below given command.
# Then execute the program
  # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer

# [Note] Port number is by default 9000 but it can be change.

!pip install pycorenlp
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')


def resolve(corenlp_output):
    """ Transfer the word form of the antecedent to its associated pronominal anaphor(s) """
    for coref in corenlp_output['corefs']:
        mentions = corenlp_output['corefs'][coref]
        antecedent = mentions[0]  # the antecedent is the first mention in the coreference chain
        for j in range(1, len(mentions)):
            mention = mentions[j]
            if mention['type'] == 'PRONOMINAL':
                # get the attributes of the target mention in the corresponding sentence
                target_sentence = mention['sentNum']
                target_token = mention['startIndex'] - 1
                # transfer the antecedent's word form to the appropriate token in the sentence
                corenlp_output['sentences'][target_sentence - 1]['tokens'][target_token]['word'] = antecedent['text']


def print_resolved(corenlp_output):
    """ Print the "resolved" output """
    possessives = ['hers', 'his', 'their', 'theirs']
    for sentence in corenlp_output['sentences']:
        for token in sentence['tokens']:
            output_word = token['word']
            # check lemmas as well as tags for possessive pronouns in case of tagging errors
            if token['lemma'] in possessives or token['pos'] == 'PRP$':
                output_word += "'s"  # add the possessive morpheme
            output_word += token['after']
            print(output_word, end='')

for i in range(len(df)):
    output = nlp.annotate(df['comments'].iloc[i], properties= {'annotators':'dcoref','outputFormat':'json','ner.useSUTime':'false'})
    resolve(output)
    
    print('Original:', df['comment'].iloc[i] )
    print('Resolved: ', end='')
    print_resolved(output)
    print('-'*50)


Collecting pycorenlp
  Downloading https://files.pythonhosted.org/packages/cd/40/e74eb4fc7906d630b73a84c9ae9d824f694bd4c5a1d727b8e18beadff613/pycorenlp-0.3.0.tar.gz
Building wheels for collected packages: pycorenlp
  Building wheel for pycorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for pycorenlp: filename=pycorenlp-0.3.0-cp36-none-any.whl size=2143 sha256=d4cb043bf307fbe3c46364b5a6553f5dbac64650730c3e5a8382dfe585d55fc6
  Stored in directory: /root/.cache/pip/wheels/fb/e9/2f/767a7b5f2e82d587a36143c04a21839b4b14bebfb89410d2d5
Successfully built pycorenlp
Installing collected packages: pycorenlp
Successfully installed pycorenlp-0.3.0


Exception: ignored

In [0]:
## TODO ##
# Ngrams
# Extract Collocation
# Named Entity Recognition
# Relationship extraction