# Nested Summarization With Heading Hierarchy: Lexical Processing
### AI 829: NLP Mandate 2 Contribution
By: Rachna S Kedigehalli (IMT2019069)
    Nandakishore S Menon (IMT2019057)

### Abstract
In this mandate contribution, we have performed basic preprocessing and exploratory data analysis preceding text summarisation and title generation.

In [51]:
from os import listdir
from string import punctuation
punctuation+='\n'
import re
from bs4 import BeautifulSoup

# Loading Data

In [52]:
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file as a single string.
    """
    # The context manager closes the handle even when read() raises (for
    # example on a decoding error); the manual close() was skipped then.
    with open(filename, encoding='utf-8') as file:
        return file.read()

In [53]:
def split_story(doc):
    """Split a raw story document into its body and its highlights.

    The body is everything before the first ``@highlight`` marker; each
    following ``@highlight``-delimited segment is one highlight.

    Args:
        doc: Full text of one story file.

    Returns:
        A ``(story, highlights)`` tuple where ``story`` is the body text and
        ``highlights`` is a list of stripped, non-empty highlight strings.
    """
    story, marker, remainder = doc.partition('@highlight')
    if not marker:
        # No marker at all: find() returned -1 in the original, which cut the
        # last character off the story and reported it as a highlight.
        return doc, []
    # Strip before filtering so whitespace-only segments are dropped instead
    # of surviving as empty strings.
    stripped = [segment.strip() for segment in remainder.split('@highlight')]
    highlights = [segment for segment in stripped if segment]
    return story, highlights

In [54]:
def load_stories(directory):
    """Load and split every story file found directly in *directory*.

    Args:
        directory: Path of the folder containing the story files.

    Returns:
        A list of dicts, one per file, with the keys ``'story'`` (body text)
        and ``'highlights'`` (list of highlight strings).
    """
    collected = []
    for entry in listdir(directory):
        path = directory + '/' + entry
        body, summary_points = split_story(load_doc(path))
        record = {'story': body, 'highlights': summary_points}
        collected.append(record)
    return collected

In [55]:
# Relative path of the folder holding the CNN story files.
directory = '../input/cnn-dailymail/cnn/stories'
# Each element of `data` is a dict with 'story' and 'highlights' keys.
data = load_stories(directory)
print('Loaded Stories %d' % len(data))

In [56]:
# Keep only the article bodies for the lexical processing steps below.
stories = [story['story'] for story in data]

In [61]:
# data
len(stories)

# Lexical Processing

In [78]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import FreqDist
nltk.download('stopwords')

In [79]:
nltk.download('punkt')

In [80]:
punctuation

In [81]:
# Build the English stop-word set used by the filtering helpers.
# The corpus reader is imported under a private alias so this cell can be
# re-run: the original rebound the name `stopwords` from the corpus reader to
# a set, so a second execution failed with
# "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('english'))

In [82]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Zipf Distribution
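$$f(w)\cdot r(w)\approx z$$
where $f(w)$ is the frequency of word $w$, $r(w)$ is its rank when words are sorted by decreasing frequency, and $z$ is a constant.

The Zipf distribution is plotted after the following preprocessing: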
- Removing punctuation
- Removing stopwords
- Lower case

In [83]:
def remove_punctuation(text):
    """Return *text* without the characters contained in ``punctuation``.

    ``punctuation`` is the module-level string of punctuation characters;
    every other character is kept in its original order.
    """
    kept_chars = []
    for char in text:
        if char in punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [84]:
def zipf_distribution(document):
    """Plot the 30 most frequent non-stop-words of *document* as a bar chart.

    The text is tokenized with ``word_tokenize``; stop words are dropped, the
    remaining tokens are lower-cased and stripped of punctuation characters,
    and the result is counted with ``FreqDist``. The frequency distribution
    is also printed.

    Args:
        document: Raw text of a single document.

    Returns:
        The object returned by ``sns.barplot`` for the frequency chart.
    """
    filtered_words = []
    for word in word_tokenize(document):
        lowered = word.lower()
        if lowered in stopwords:
            continue
        # Strip punctuation inline instead of calling remove_punctuation():
        # that name is redefined later in this file with a token-list
        # signature, which would break this function once that cell has run.
        cleaned = "".join(char for char in lowered if char not in punctuation)
        # Tokens made only of punctuation (",", "``", "--", ...) are empty
        # after stripping; skip them so "" is never counted as a word.
        if cleaned:
            filtered_words.append(cleaned)

    word_frequencies = FreqDist(filtered_words)
    print(word_frequencies)
    # Query the ranking once and unzip it into labels and counts.
    top_words = word_frequencies.most_common(30)
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    plt.figure(figsize=(15,5))
    plt.title("Zipf distribution")
    plt.ylabel("Count")
    plt.xlabel("Word")
    # x/y are passed by keyword: current seaborn releases reject them as
    # positional arguments.
    plot = sns.barplot(x=labels, y=counts)
    return plot

In [85]:
# Plot the word-frequency chart for the first story only.
zipf_distribution(stories[0])

## Removing URLs, HTML, brackets, digits and underscores

In [86]:
def remove_url(data):
    """Remove URLs from every string in *data*.

    A URL is taken to be ``http://``, ``https://`` or ``www.`` followed by
    any run of non-whitespace characters. The original pattern removed only
    the literal ``https://`` prefix and left the rest of the address (and
    all ``http://`` links) in the text.

    Args:
        data: Iterable of strings.

    Returns:
        A list with the URL-free version of each input string.
    """
    url_pattern = re.compile(r'(?:https?://|www\.)\S*')
    return [url_pattern.sub('', sentence) for sentence in data]
def remove_html(data):
    """Return the text content of each string in *data*.

    Every string is parsed with BeautifulSoup's ``'html.parser'`` and
    replaced by the result of ``get_text()``.
    """
    plain_texts = []
    for fragment in data:
        soup = BeautifulSoup(fragment, 'html.parser')
        plain_texts.append(soup.get_text())
    return plain_texts
def remove_bracket(data):
    """Delete round, square and curly bracket characters from each string.

    Only the bracket characters themselves are removed; the text between
    them is kept.
    """
    without_brackets = []
    for sentence in data:
        without_brackets.append(re.sub(r'[\([{})\]]', '', sentence))
    return without_brackets
def remove_digit(data):
    """Delete every ASCII digit (0-9) from each string in *data*."""
    def strip_digits(sentence):
        return re.sub('[0-9]', '', sentence)

    return list(map(strip_digits, data))
def remove_underscore(data):
    """Delete every underscore character from each string in *data*."""
    cleaned = []
    for sentence in data:
        cleaned.append(sentence.replace("_",""))
    return cleaned

## Expanding Contractions
Eg: he's -> he is

In [87]:
!pip install contractions
import contractions

In [88]:
def expand_contractions(sentence):
    """Expand contractions word by word and re-join with single spaces.

    The sentence is split on whitespace, each piece is passed through
    ``contractions.fix``, and the results are joined with one space, so runs
    of whitespace in the input are collapsed.
    """
    expanded_words = []
    for token in sentence.split():
        expanded_words.append(contractions.fix(token))
    return ' '.join(expanded_words)

## Tokenize
Splits the document into a list of word tokens.

In [89]:
def tokenize(document):
    """Split *document* into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(document)
    return tokens

## Converting to lower case, removing punctuation and stopwords

In [90]:
def lower_case(tokens):
    """Return a new list with every token converted to lower case."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered
def remove_punctuation(tokens):
    """Strip every character that is neither a word character nor whitespace.

    Operates on a list of tokens and returns a list of the same length; a
    token made only of punctuation becomes the empty string. Note that this
    definition replaces the earlier string-based ``remove_punctuation`` once
    it has been executed.
    """
    stripped = []
    for word in tokens:
        stripped.append(re.sub(r'[^\w\s]', '', word))
    return stripped
def remove_stopwords(tokens):
    """Drop stop words and empty strings from *tokens*.

    A token is kept only when it is not a member of the module-level
    ``stopwords`` collection and is non-empty.
    """
    kept = []
    for word in tokens:
        if word not in stopwords and word:
            kept.append(word)
    return kept

# Canonicalization

- Lemmatization 
- Stemming
- Soundex
- Edit distance

Canonicalization maps each word to its base form. We chose lemmatization over stemming because stemming can produce truncated forms that are not real words. For example, for the word “scaling”, lemmatization gives “scale” while stemming gives “scal”.

Finding the base or root of a word depends heavily on its part of speech. NLTK's WordNet lemmatizer therefore needs the part-of-speech (POS) tag of a word in order to lemmatize it correctly.

In [91]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [92]:
def pos_tags(tokenized):
    """Tag a list of tokens with ``nltk.tag.pos_tag`` and return the result."""
    tagged = nltk.tag.pos_tag(tokenized)
    return tagged

def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The decision is made on the tag's prefix: ``J`` -> adjective, ``V`` ->
    verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def to_wordnet(tokenized_pos):
    """Convert ``(word, tag)`` pairs into ``(word, wordnet_pos)`` pairs.

    Each tag is translated with ``get_wordnet_pos``; the words themselves
    and their order are unchanged.
    """
    converted = []
    for word, pos_tag in tokenized_pos:
        converted.append((word, get_wordnet_pos(pos_tag)))
    return converted

In [93]:
def lemmatizer(tokenized_stories):
    """Lemmatize every token of every story using its part-of-speech tag.

    Each story is POS-tagged with ``pos_tags``, the tags are converted with
    ``to_wordnet``, and every word is lemmatized with ``WordNetLemmatizer``
    using its converted tag.

    Args:
        tokenized_stories: Iterable of token lists, one list per story.

    Returns:
        A list containing one list of lemmas per input story, in input order.
    """
    # One lemmatizer instance serves all stories; the original constructed a
    # new one on every loop iteration.
    wnl = WordNetLemmatizer()
    lemmatized_stories = []
    for story in tokenized_stories:
        tagged = to_wordnet(pos_tags(story))
        lemmatized_stories.append([wnl.lemmatize(word, tag) for word, tag in tagged])
    return lemmatized_stories

In [94]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [95]:
def stemming(text):
    """Return the stem of each word in *text* using ``porter_stemmer``.

    *text* is iterated item by item, so it is expected to be a sequence of
    words rather than one string.
    """
    stems = []
    for word in text:
        stems.append(porter_stemmer.stem(word))
    return stems

In [96]:
def get_soundex(word):
    """Return the four-character American Soundex code of *word*.

    The first character is kept (upper-cased); the following letters are
    mapped to digits by consonant group, adjacent letters with the same
    digit are coded once, vowels are dropped, and the result is truncated
    or zero-padded to four characters.

    Fixes over the previous implementation:
      * an empty string returns "" instead of raising ``IndexError``;
      * the first letter's own digit takes part in duplicate suppression
        ("Pfister" -> "P236", previously "P123");
      * H and W no longer separate two letters of the same group
        ("Ashcraft" -> "A261", previously "A226").

    Args:
        word: The word to encode.

    Returns:
        The Soundex code as a string of length four, or "" for empty input.
    """
    if not word:
        return ""

    groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
    digit_of = {letter: digit for letters, digit in groups.items() for letter in letters}

    word = word.upper()
    soundex = word[0]
    # Seed with the first letter's digit so an immediately following letter
    # of the same group is not coded a second time.
    previous = digit_of.get(word[0], "")

    for char in word[1:]:
        if char in "HW":
            # Transparent letters: they do not reset the previous digit.
            continue
        if char in "AEIOUY":
            # Vowels are not coded but do separate equal digits.
            previous = ""
            continue
        digit = digit_of.get(char)
        if digit is None:
            # Characters outside A-Z are ignored, as before.
            continue
        if digit != previous:
            soundex += digit
        previous = digit

    return soundex[:4].ljust(4, "0")

In [97]:
def get_edit_distance(w1,w2):
    """Return ``nltk.edit_distance`` between *w1* and *w2*.

    Transpositions are disabled (``transpositions=False``).
    """
    distance = nltk.edit_distance(w1, w2, transpositions=False)
    return distance

# Combining into a pipeline

In [98]:
def process(stories):
    """Run the full lexical-processing pipeline over a list of raw stories.

    Corpus-level cleanup (URLs, HTML, brackets, digits, underscores) is
    applied to the whole list first. Each story is then contraction-expanded,
    tokenized, lower-cased, stripped of punctuation and stop words, and the
    resulting token lists are lemmatized together.

    ``remove_punctuation`` is called here with a token list, i.e. the
    token-based definition must be the one in effect.

    Args:
        stories: List of raw story strings.

    Returns:
        The value returned by ``lemmatizer`` for the processed token lists.
    """
    cleaned = stories
    # Same cleanup order as the individual steps were originally written in.
    for cleanup in (remove_url, remove_html, remove_bracket,
                    remove_digit, remove_underscore):
        cleaned = cleanup(cleaned)

    token_lists = []
    for text in cleaned:
        tokens = tokenize(expand_contractions(text))
        tokens = lower_case(tokens)
        tokens = remove_punctuation(tokens)
        tokens = remove_stopwords(tokens)
        token_lists.append(tokens)

    return lemmatizer(token_lists)

In [102]:
# Run the full pipeline on the first 10 stories only.
processed_stories = process(stories[:10])
# Bare expression: displays the result as the notebook cell output.
processed_stories

# Word2Vec

In [100]:
import gensim
from gensim.models import Word2Vec

In [101]:
# Earlier attempt, kept for reference (trained on the raw first story):
# model1 = gensim.models.Word2Vec(stories[0], min_count = 1,size = 100, window = 5, sg=0)
# Train on the lemmatized token lists in `processed_stories`: 50-dimensional
# vectors, context window of 3, every word kept (min_count=1), 3 worker
# threads; sg=1 selects skip-gram in gensim's API.
model = Word2Vec(processed_stories, min_count=1,vector_size= 50,workers=3, window =3, sg = 1)

# References
- [Soundex](https://amitg0161.medium.com/phonetic-hashing-and-soundex-in-python-60d4ca7a2843)
- [Loading CNN DailyMail data](https://github.com/santhoshkolloju/Cnn-Articles-Summarization-Pipeline-using-Dataset-API)