<a href="https://colab.research.google.com/github/NouamaneTazi/NLP/blob/master/topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('gutenberg')
nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [0]:
import nltk
import re
import html
import string
from pprint import pprint
from bs4 import BeautifulSoup

# Data Preprocessing and Normalization

## Tokenizing


In [0]:
corpus = ["The brown fox wasn't that quick and he couldn't win the race",
          "Hey that's a great deal! I just bought a phone for $199",
          "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"]


def tokenize_sentence(text): # tokenize every sentence in list
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences] 
    return word_tokens

#[word_tokenize(text) for text in corpus]

## HTML Stripping

In [0]:
def remove_html(text):
  soup= BeautifulSoup(text, 'lxml')
  html_free = soup.get_text()
  return html_free
  
remove_html('&pound;682m')

'£682m'

## Removing accented characters

In [0]:
import unicodedata
def remove_accented_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

## Removing digits

In [0]:
import re
re.sub(r'\d+', '', '123hello 456world')

'hello world'

## Removing special characters

In [0]:
corpus[0]

"The brown fox wasn't that quick and he couldn't win the race"

In [0]:
from nltk.tokenize import RegexpTokenizer

def remove_special_characters(text):
    tokenizer = RegexpTokenizer(r'\w+')
    return ' '.join(tokenizer.tokenize(text))

remove_special_characters("The brown fox wasn't that quick and he couldn't win the race")

'The brown fox wasn t that quick and he couldn t win the race'

## Expanding contractions: I'd > I would

In [0]:
from contractions import CONTRACTION_MAP
import re

def expand_contractions(sentence, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence
    
expand_contractions("The brown fox wasn't that quick and he couldn't win the race", CONTRACTION_MAP)

'The brown fox was not that quick and he could not win the race'

## Removing stopwords (a, the...)

In [0]:
def remove_stopwords(expanded_text):
    stopword_list = nltk.corpus.stopwords.words('english')
    return ' '.join([token for token in word_tokenize(expanded_text) if token not in stopword_list])
    
remove_stopwords('The brown fox was not that quick and he could not win the race') 

'The brown fox quick could win race'

## Removing repeated characters

In [0]:
from nltk.corpus import wordnet
from nltk import word_tokenize

def remove_repeated_characters(text):
    tokens = word_tokenize(text)
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
            
    correct_tokens = [replace(word) for word in tokens]
    return ' '.join(correct_tokens)

print (remove_repeated_characters("My schooool is realllllyyy amaaazingggg"))    


My school is really amazing


## Stemming (root stem)

In [0]:
# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print (ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'))
print (ps.stem('lying'))
print (ps.stem('strange'))

jump jump jump
lie
strang


In [0]:
# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
print (ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'))
print (ls.stem('lying'))
print (ls.stem('strange'))

jump jump jump
lying
strange


In [0]:
# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)
print (rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'))
print (rs.stem('lying'))
print (rs.stem('strange'))

jump jump jump
ly
strange


In [0]:
# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("english")
print ('Supported Languages:', SnowballStemmer.languages)
print (ss.stem('jumping'), ss.stem('jumps'), ss.stem('jumped'))
print (ss.stem('lying'))
print (ss.stem('strange'))

Supported Languages: ('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
jump jump jump
lie
strang


In [0]:
from nltk.stem import SnowballStemmer

def stem_text(text):
    ss = SnowballStemmer("english")
    return ' '.join([ss.stem(token) for token in word_tokenize(text)])

## Lemmatization (root word)

In [0]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# lemmatize nouns
print (wnl.lemmatize('cars', 'n'))
print (wnl.lemmatize('men', 'n'))

# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print (wnl.lemmatize('ate', 'v'))

# lemmatize adjectives
print (wnl.lemmatize('saddest', 'a'))
print (wnl.lemmatize('fancier', 'a'))

# ineffective lemmatization
print (wnl.lemmatize('ate', 'n'))
print (wnl.lemmatize('fancier', 'v'))

car
men
run
eat
sad
fancy
ate
fancier


"the brown fox was n't that quicker and he could n't win the race lie jump !"

In [0]:
# !pip install spacy
# import spacy
import en_core_web_sm
nlp = en_core_web_sm.load(parse=True, tag=True, entity=True)
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
print("Lemmatization :")
print(lemmatize_text("The brown fox wasn't that quicker and he couldn't win the race lying jumps!"))
print("Stemming :")
print(stem_text("The brown fox wasn't that quicker and he couldn't win the race lying jumps!"))

Lemmatization :
the brown fox be not that quick and he could not win the race lie jump !
Stemming :
the brown fox was n't that quicker and he could n't win the race lie jump !


In [0]:
def normalize_corpus(corpus, html_stripping=True,
contraction_expansion=True, accented_char_removal=True,
repeated_char_removal=True, text_lemmatization=True,
text_stemming=False, special_char_removal=True,
remove_digits=True, stopword_removal=True,
stopwords=None, tokenize=False): 
    normalized_corpus = []  
    for text in corpus:
        if html_stripping:
            text = remove_html(text)
        text = text.lower()
        if accented_char_removal:
            text = remove_accented_chars(text)
        if contraction_expansion:
            text = expand_contractions(text, CONTRACTION_MAP)            
        if special_char_removal:
            text = remove_special_characters(text)
        if repeated_char_removal:
            text = remove_repeated_characters(text) # better remove special chars
        if text_lemmatization:
            text = lemmatize_text(text) 
        if text_stemming:
            text = stem_text(text)
        if remove_digits:
            text = re.sub(r'\d+', '', text)        
        if stopword_removal:
            text = remove_stopwords(text) # better be expanded and lower_cased before
        # remove extra whitespace and newlines
        text = re.sub(' +', ' ', text)
        text = re.sub(r'[\r|\n|\r\n]+', ' ', text) 
        if tokenize:
            text = word_tokenize(text)
            text = list(filter(None, text))
        normalized_corpus.append(text)
    return normalized_corpus

In [0]:
sample_text="""US unveils world's most powerful supercomputer, beats
China. The US has unveiled the world's most powerful supercomputer called
'Summit', beating the previous record-holder China's Sunway TaihuLight.
With a peak performance of 200,000 trillion calculations per second,
it is over twice as fast as Sunway TaihuLight, which is capable of
93,000 trillion calculations per second. Summit has 4,608 servers, which
reportedly take up the size of two tennis courts."""

print(normalize_corpus([sample_text],contraction_expansion=False))

['us unveil world powerful supercomputer beat china us unveil world powerful supercomputer call summit beat previous record holder china sunway taihulight peak performance trillion calculation per second twice fast sunway taihulight capable trillion calculation per second summit server reportedly take size two tennis court']


# Keyphrase Extraction


In [0]:
from nltk.corpus import gutenberg
from operator import itemgetter
# load corpus
alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]


In [0]:
norm_alice = list(filter(None, normalize_corpus(alice, text_lemmatization=False)))
# print and compare first line
print(alice[0], '\n', norm_alice[0])

[ Alice ' s Adventures in Wonderland by Lewis Carroll 1865 ] 
 alice adventures wonderland lewis carroll


## Compute n-grams

In [0]:
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_documents([item.split() for item in norm_alice])
finder.nbest(bigram_measures.raw_freq, 10) 

[('said', 'alice'),
 ('mock', 'turtle'),
 ('march', 'hare'),
 ('said', 'king'),
 ('thought', 'alice'),
 ('said', 'hatter'),
 ('white', 'rabbit'),
 ('said', 'mock'),
 ('said', 'caterpillar'),
 ('said', 'gryphon')]

In [0]:
finder.nbest(bigram_measures.pmi, 10)

[('abide', 'figures'),
 ('acceptance', 'elegant'),
 ('accounting', 'tastes'),
 ('accustomed', 'usurpation'),
 ('act', 'crawling'),
 ('adjourn', 'immediate'),
 ('adoption', 'energetic'),
 ('affair', 'trusts'),
 ('agony', 'terror'),
 ('alarmed', 'proposal')]

# Feature Engineering

  ### Bag of Words Model


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# get bag of words features in sparse format
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix

NameError: ignored

# Topic modeling methods
 ## LDA Analysis
 

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk