# [Workshop] Textual Knowledge Processing

<img src="ws_img_001.png">

# 0. Package Installation (one time job)

In [7]:
#pip install -U spacy

In [8]:
#!python -m spacy download en_core_web_sm

In [9]:
 #!pip install -U nltk

In [10]:
#import nltk
#nltk.download() # Select and Download the "popular" from "Collections"

# 1. Import Library

In [11]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import spacy

# 2. Text Preprocessing

## 2.1 Lower casing

In [12]:
def lower_casing(sentence):
    """Return a copy of ``sentence`` with all cased characters lower-cased.

    Args:
        sentence: The raw text to normalise.

    Returns:
        The lower-cased text.
    """
    # Quiz: How to implement this function without using str.lower()?
    return sentence.lower()

## 2.2 Abbreviation expansion

In [13]:
# Contraction patterns, compiled once when the cell runs instead of on every call.
# Order matters: the irregular forms ("won't", "can't", "ain't") must be tried
# before the generic "n't" rule. Both the patterns and the replacements are raw
# strings so that "\g<1>" is not parsed as an (invalid) string escape sequence.
_ABBREVIATION_PATTERNS = tuple(
    (re.compile(regex), repl) for (regex, repl) in (
        (r"won't", r'will not'),
        (r"can't", r'cannot'),
        (r"i'm", r'i am'),
        (r"ain't", r'is not'),
        (r"(\w+)'ll", r'\g<1> will'),
        (r"(\w+)n't", r'\g<1> not'),
        (r"(\w+)'ve", r'\g<1> have'),
        (r"(\w+)'s", r'\g<1> is'),
        (r"(\w+)'re", r'\g<1> are'),
        (r"(\w+)'d", r'\g<1> would'),
    )
)


def expand_abbriviation(sentence):
    """Expand common English contractions in ``sentence``.

    The patterns are lower-case only, so the text is expected to be
    lower-cased first. Every "'s" is rewritten to " is" and every "'d" to
    " would"; possessives and "had" are not distinguished.

    Args:
        sentence: Text that may contain contractions such as "we'd" or "don't".

    Returns:
        The text with every matched contraction replaced by its expansion.
    """
    new_sentence = sentence
    for (pattern, repl) in _ABBREVIATION_PATTERNS:
        new_sentence = pattern.sub(repl, new_sentence)
    return new_sentence

## 2.3 Punctuation removal

In [14]:
def punctuation_removal(sentence):
    """Delete a fixed set of punctuation characters from ``sentence``.

    The removed characters are: , ! ? " < > ( ) [ ] { } @ # + = - _ ~ & * ^ % | $ / ` . '
    The apostrophe IS removed, so contractions must be expanded before this
    step. Characters outside the set (for example ':' ';' and '\\') are kept.

    Args:
        sentence: The text to clean.

    Returns:
        The text with every listed character removed.
    """
    # A raw-string character class avoids the invalid escape sequences ("\?",
    # "\(", ...) that a plain string literal would contain.
    punctuation_class = r'[,!?"<>()\[\]{}@#+=\-_~&*^%|$/`.\']'
    new_sentence = re.sub(punctuation_class, '', sentence)
    return new_sentence

## 2.4 Sentence tokenization

In [15]:
def tokenization(sentence):
    """Split ``sentence`` into tokens with ``nltk.word_tokenize``.

    Args:
        sentence: The text to tokenise.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for the text (a list of tokens).
    """
    return nltk.word_tokenize(sentence)

## 2.5 Stopwords removal

In [16]:
# Stop-word sets that have already been loaded, keyed by file path, so the file
# is read once instead of once per sentence. Re-running this cell clears the cache.
_STOPLIST_CACHE = {}


def stopword_removal(sentence, stopwords_path='./stopwords.txt'):
    """Drop every token of ``sentence`` that appears in the stop-word file.

    Args:
        sentence: Iterable of word tokens (strings). Matching is exact, so the
            tokens are expected to be lower-cased already.
        stopwords_path: Path of a text file with one stop word per line.
            Entries are lower-cased and stripped of surrounding whitespace;
            blank lines are ignored.

    Returns:
        A list with the tokens that are not stop words, in their original order.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    #stoplist = stopwords.words('english')

    stoplist = _STOPLIST_CACHE.get(stopwords_path)
    if stoplist is None:
        with open(stopwords_path, encoding='utf-8') as file:
            # strip() also removes '\r' and stray spaces that would otherwise
            # prevent an entry from ever matching a token.
            cleaned = (line.strip().lower() for line in file)
            stoplist = frozenset(word for word in cleaned if word)
        _STOPLIST_CACHE[stopwords_path] = stoplist

    new_sentence = [word for word in sentence if word not in stoplist]
    return new_sentence

## 2.6 Lemmatization

In [17]:
def get_wordnet_pos(word):
    """Translate the tag that ``nltk.pos_tag`` gives ``word`` into a WordNet POS.

    The word is tagged on its own (as a one-element list), and only the first
    letter of the resulting tag decides the mapping.

    Args:
        word: A single token.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        for tags starting with 'J', 'V', 'N' or 'R' respectively; ``None`` for
        any other tag.
    """
    treebank_tag = nltk.pos_tag([word])[0][1]
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None


def lemmatization(sentence):
    """Lemmatize every token of ``sentence`` with NLTK's WordNet lemmatizer.

    Each token's part of speech comes from ``get_wordnet_pos``; when that
    returns nothing, the token is lemmatized as a noun.

    Args:
        sentence: Iterable of word tokens.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    wn_lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmas = []
    for token in sentence:
        pos = get_wordnet_pos(token) or wordnet.NOUN
        lemmas.append(wn_lemmatizer.lemmatize(token, pos))
    return lemmas

## 2.7 Spelling correction (Optional)

In [18]:
# Package may be used in this section
import nltk
from nltk.corpus import words
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

# Spell correction is also important in text preprocessing.
# Please refer to the Day2 slides and see how jaccard_distance works
def spell_correction(sentence):
    """Placeholder for the optional spelling-correction exercise.

    Args:
        sentence: Presumably a list of word tokens, as in the example call
            below -- TODO confirm once the exercise is implemented.

    Returns:
        Currently always ``None``: the body between the markers is still empty.
    """
    new_sentence = None
    ##############################
    # Your code here
    ##############################
    return new_sentence

# example: a token list containing two misspelt words ('frienss', 'yesterdsy')
s = ['I', 'met', 'my', 'boy', 'frienss', 'yesterdsy']
print(spell_correction(s))

None


## 2.8. Integrate all the functions

In [19]:
def text_preprocessing(raw_sentence):
    """Run the full hand-written preprocessing pipeline on ``raw_sentence``.

    The steps are applied in this order: lower casing, abbreviation expansion,
    punctuation removal, tokenization, stop-word removal, lemmatization.

    Args:
        raw_sentence: The raw input text.

    Returns:
        The output of the final step, ``lemmatization`` (a list of lemmas).
    """
    pipeline = (
        lower_casing,
        expand_abbriviation,
        punctuation_removal,
        tokenization,
        stopword_removal,
        lemmatization,
        # spell_correction,  # Spelling check
    )
    result = raw_sentence
    for step in pipeline:
        result = step(result)
    return result

# 3. Let's have a try

In [20]:
# A common text pre-processing procedure is as follows:
# Raw text -> Lower casing -> Expand abbr -> Punctuation removal -> ...
# ... -> Tokenization -> Stop word removal -> Lemmatization
# All the functions are defined in the code cells above. Please feel free to go through them
# and change some of the code.

# Let's assume we have a raw sentence (the misspelling 'yesterdsy' is part of the example).
raw_sentence = 'He said, "we\'d have eaten more than 100 hamburgers from yesterdsy."'

# Can you guess the result of the procedure?
sentence = text_preprocessing(raw_sentence)
print(sentence)

['eat', '100', 'hamburger', 'yesterdsy']


In [21]:
# Or if we only want to use some of the preprocessing techniques, we can call the functions separately.
# NOTE: expand_abbriviation has to run BEFORE punctuation_removal. punctuation_removal
# strips apostrophes, so "we'd" would become "wed" and no contraction pattern could match.
sentence = lower_casing(raw_sentence)
sentence = expand_abbriviation(sentence)
sentence = punctuation_removal(sentence)
sentence = tokenization(sentence)
sentence = stopword_removal(sentence)
sentence = lemmatization(sentence)
# sentence = spell_correction(sentence) # Spelling check
print(sentence)

['eat', '100', 'hamburger', 'yesterdsy']


In [22]:
# In the workshop folder, we have a database called questionbase_raw.txt. Import the text file
# into Python and preprocess every sentence. See what happens!

with open('./questionbase_raw.txt') as file:
    raw_sentences = [sentence.replace('\n', '') for sentence in file.readlines()]

i = 1
for raw_sentence in raw_sentences:
    # Lines that are exactly 'Q' or 'A' are never printed (presumably question/answer
    # markers -- confirm against the data file), so skip them before doing any
    # preprocessing work on them.
    if raw_sentence == 'Q' or raw_sentence == 'A':
        continue
    processed_sentence = text_preprocessing(raw_sentence)
    print(i, raw_sentence)
    print(i, processed_sentence)
    i += 1
    print()

1 Hello
1 []

2 Hello, I am ASD knowledge bot. Feel free to ask me anything about autism spectrum disorder (ASD).
2 ['asd', 'knowledge', 'bot', 'feel', 'free', 'autism', 'spectrum', 'disorder', 'asd']

3 What is definition of Autistic Spectrum Disorder?
3 ['definition', 'autistic', 'spectrum', 'disorder']

4 Autism, or autism spectrum disorder (ASD), refers to a broad range of conditions characterized by challenges with social skills, repetitive behaviors, speech and nonverbal communication. According to the Centers for Disease Control, autism affects an estimated 1 in 54 children in the United States today.
4 ['autism', 'autism', 'spectrum', 'disorder', 'asd', 'refers', 'broad', 'range', 'condition', 'characterize', 'challenge', 'social', 'skill', 'repetitive', 'behavior', 'speech', 'nonverbal', 'communication', 'center', 'disease', 'control', 'autism', 'estimate', '1', '54', 'child', 'united']

5 What are the symptoms of Autistic Spectrum Disorder?
5 ['symptom', 'autistic', 'spectrum

# 4. Text Preprocessing [Workshop]
### spaCy (a Python package) is a modern and powerful NLP tool.
### You can use spaCy functions to do most of the preprocessing work.

In [23]:
# Create the nlp tool by loading spaCy's 'en_core_web_sm' pipeline
# (installed by the commented-out download command in section 0).
nlp = spacy.load('en_core_web_sm')

# Raw sentence: the same example text that was fed to text_preprocessing above
raw_sentence = 'He said, "we\'d have eaten more than 100 hamburgers from yesterdsy."'
print("raw_sentence : ", raw_sentence)

raw_sentence :  He said, "we'd have eaten more than 100 hamburgers from yesterdsy."


In [24]:
# Use the spaCy nlp tool to process the raw sentence
token_sentence = nlp(raw_sentence)
# Print one row per token with these columns: text, lemma, coarse part of speech,
# fine-grained tag, dependency label, word shape, is-alphabetic flag, is-stop-word flag.
for token in token_sentence:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

He -PRON- PRON PRP nsubj Xx True True
said say VERB VBD ROOT xxxx True False
, , PUNCT , punct , False False
" " PUNCT `` punct " False False
we -PRON- PRON PRP nsubj xx True True
'd 'd VERB MD aux 'x False True
have have AUX VB aux xxxx True True
eaten eat VERB VBN ccomp xxxx True False
more more ADJ JJR amod xxxx True True
than than SCONJ IN quantmod xxxx True True
100 100 NUM CD nummod ddd False False
hamburgers hamburger NOUN NNS dobj xxxx True False
from from ADP IN prep xxxx True True
yesterdsy yesterdsy PROPN NNP pobj xxxx True False
. . PUNCT . punct . False False
" " PUNCT '' punct " False False


In [25]:
# The processed sentence can be indexed like a list; .text is the token's text.
token_sentence[0].text

'He'

In [26]:
# .pos_ is the coarse part-of-speech label of the same token.
token_sentence[0].pos_

'PRON'

### Your hands-on time now:

In [27]:
# Use the given information to implement your own text preprocessing
def my_preprocessing(raw_sentence, nlp_tool):
    """Preprocess ``raw_sentence`` with a spaCy pipeline.

    A token is dropped when it is whitespace, a pronoun placeholder
    (lemma '-PRON-'), punctuation (``pos_ == 'PUNCT'``) or a stop word.
    Every remaining token contributes its lower-cased lemma.

    Args:
        raw_sentence: The raw input text.
        nlp_tool: A callable (e.g. the object returned by ``spacy.load``) that
            turns text into an iterable of tokens exposing ``lemma_``, ``pos_``,
            ``is_stop`` and ``is_space``.

    Returns:
        A list of lower-cased lemmas in document order; empty if nothing survives.
    """
    token_sentence = nlp_tool(raw_sentence)

    #########################
    # Your code here
    # You should ignore the abbreviation expanding part here since those abbr words
    # are identified as stop words in spacy.
    #
    # But also consider how to process abbr like U.K. and US such words.
    # (hint: word2vec, not mentioned in this course)

    preprocessed_sentence = []
    for token in token_sentence:
        # spaCy emits separate tokens for runs of whitespace (double spaces,
        # newlines); they carry no content and must not end up in the result.
        if token.is_space:
            continue
        # '-PRON-' is the lemma the captured spaCy output shows for pronouns.
        if token.lemma_ == '-PRON-' or token.pos_ == 'PUNCT' or token.is_stop:
            continue
        preprocessed_sentence.append(token.lemma_.lower())

    #########################

    return preprocessed_sentence

In [28]:
# Similarly, call my_preprocessing to process all the text data and
# see what's the difference between the two functions

# Read the file first and close it; none of the processing below needs it open.
with open('./questionbase_raw.txt') as file:
    raw_sentences = [sentence.replace('\n', '') for sentence in file.readlines()]

#########################
# Your code here
nlp_tool = spacy.load('en_core_web_sm')

# Lines that are exactly 'Q' or 'A' are skipped everywhere below, so filter them once.
content_sentences = [sentence for sentence in raw_sentences
                     if sentence != 'Q' and sentence != 'A']

preprocessed_sentences = []        # result from 'my_preprocessing'
given_preprocessed_sentences = []  # result by running the 'text_preprocessing'

for raw_sentence in content_sentences:
    given_preprocessed_sentences.append(text_preprocessing(raw_sentence))
    preprocessed_sentences.append(my_preprocessing(raw_sentence, nlp_tool))
#########################

# Print each sentence followed by the output of both pipelines.
results = zip(content_sentences, given_preprocessed_sentences, preprocessed_sentences)
for i, (raw_sentence, given_result, my_result) in enumerate(results, start=1):
    print(i, raw_sentence)
    print(i, '=>> text prep output:\n', given_result)
    print(i, '=>> my spacy  output:\n', my_result)
    print()

1 Hello
1 =>> text prep output:
 []
1 =>> my spacy  output:
 ['hello']

2 Hello, I am ASD knowledge bot. Feel free to ask me anything about autism spectrum disorder (ASD).
2 =>> text prep output:
 ['asd', 'knowledge', 'bot', 'feel', 'free', 'autism', 'spectrum', 'disorder', 'asd']
2 =>> my spacy  output:
 ['hello', 'asd', 'knowledge', 'bot', 'feel', 'free', 'ask', 'autism', 'spectrum', 'disorder', 'asd']

3 What is definition of Autistic Spectrum Disorder?
3 =>> text prep output:
 ['definition', 'autistic', 'spectrum', 'disorder']
3 =>> my spacy  output:
 ['definition', 'autistic', 'spectrum', 'disorder']

4 Autism, or autism spectrum disorder (ASD), refers to a broad range of conditions characterized by challenges with social skills, repetitive behaviors, speech and nonverbal communication. According to the Centers for Disease Control, autism affects an estimated 1 in 54 children in the United States today.
4 =>> text prep output:
 ['autism', 'autism', 'spectrum', 'disorder', 'asd', '

In [None]:
# Compare the two results and explain which one is better and why?
# Provide your answer here
# 1. Lower casing: both methods do well on this part, as Python already supports this operation.
# 2. Punctuation removal: spaCy does better than the function in text_preprocessing, because punctuation_removal is built on a customized punctuation set which might not contain all punctuation marks.
# 3. Expand abbreviation: both of them do well in this case, but spaCy might be better when processing large amounts of text data, because expand_abbriviation only defines the abbreviation patterns needed for this case.
# 4. Tokenization: both of them are doing well.
# 5. Stop word removal: both of them are doing well, but spaCy might be better for processing large data, as the stopword list used by text_preprocessing has to be updated manually.
# 6. Lemmatization: both are doing well.
# Summary : When processing the text data in a certain area which has a strong demand on customization, the 1st method (text_preprocessing) might be better. When processing massive data without strong demand on customization, we can use the 2nd method.

---
`The end is called a new start.` --- ISS: **I** **S**(elf) **S**(tudy)

---