## Topic Modelling - Data cleaning

We have downloaded some speeches given by B. Obama and M. Romney during the 2012 US presidential election. The speeches are uploaded on GitHub, and we will try to find the important topics that each speaker's speeches covered. We will do topic modelling using Gensim and visualize the topics using a wonderful package called pyLDAvis.

In this notebook we will concentrate on cleaning the text, which is a very important step towards modelling.

In [5]:
# Find folders and text under a given path
import os

# NLTK package
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Spacy
import spacy

# Gensim
from gensim import corpora
from gensim import models



In [23]:
# Load the spaCy English pipeline used by clean_text().
# The 'en' shortcut link only exists in spaCy 2.x; spaCy 3 removed shortcut
# links and raises OSError for it, so fall back to the full package name
# that the shortcut used to point at.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# 1. Word-tokenize the text
# 2. Ignore stopwords and non-alphabetic tokens
#    (clean_text_nltk additionally drops words of 4 characters or fewer)
# 3. Convert to lower case (clean_text_nltk only)
# 4. Lemmatize the words

def clean_text(text):
    """Return the lemmas of the content words found in *text*.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if it is not a stop word, is not punctuation and consists
    solely of alphabetic characters; each kept token contributes its
    ``lemma_``. No explicit lower-casing or minimum-length filter is applied
    here.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmas of the surviving tokens, in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Guard clause: discard anything that is not a plain content word.
        if token.is_stop or token.is_punct or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas



def clean_text_nltk(text):
    """Tokenize, filter and lemmatize *text* using NLTK.

    A token is kept only if it is purely alphabetic, longer than four
    characters and not an English stop word. Kept tokens are lower-cased and
    then lemmatized with WordNet.

    The stop-word test is done on the lower-cased token: NLTK's English
    stop-word list is lower case, so testing the raw token would let
    capitalised stop words (e.g. "There", "Because") through.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased lemmas of the surviving tokens, in document order.
    """
    # Stop words set
    ignorewords = set(stopwords.words("english"))

    # Lemmatizer
    lemmatize = WordNetLemmatizer().lemmatize

    tokenlist = []
    for word in word_tokenize(text):
        if not word.isalpha() or len(word) <= 4:
            continue
        lowered = word.lower()
        if lowered in ignorewords:
            continue
        tokenlist.append(lemmatize(lowered))

    return tokenlist

In [24]:
# Create corpus

# 1. Update/Add to Dictionary
# 2. Create Bag of Words

def convert_to_corpus(texts, dictionary):
    """Add *texts* to *dictionary* and convert them to bag-of-words vectors.

    Parameters
    ----------
    texts : list of list of str
        Tokenized documents. Must be re-iterable (e.g. a list), because it is
        traversed twice.
    dictionary : gensim.corpora.Dictionary
        Dictionary to extend in place with the tokens of *texts*.

    Returns
    -------
    tuple
        ``(corpus, dictionary)`` where ``corpus`` holds one bag-of-words
        vector per document and ``dictionary`` is the same, updated object
        that was passed in.
    """
    # Registers every token and updates the document statistics exactly once.
    dictionary.add_documents(texts)
    # All tokens are known at this point, so a plain lookup is enough.
    # Passing allow_update=True here would count each document a second time
    # and double the dictionary's document counts/frequencies.
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, dictionary

In [25]:
# Text document folder. Each sub-folder (presumably one per speaker) is
# processed into its own corpus.
folder = 'speeches'

# Directory to save dictionary and corpus
savedir = os.path.join(os.getcwd(), 'tmp')
os.makedirs(savedir, exist_ok=True)

# Single dictionary shared by, and grown across, all sub-folders
dictionary = corpora.Dictionary()

# Read, clean and convert the speeches into a corpus. Save corpus and dictionary.
# Listings are sorted so that document order and token ids are reproducible
# from run to run (os.listdir returns entries in arbitrary order).
for fld in sorted(os.listdir(folder)):
    fld_path = os.path.join(folder, fld)
    # Skip stray files sitting next to the speech folders; listing them
    # as directories would raise.
    if not os.path.isdir(fld_path):
        continue

    texts = []
    for fname in sorted(os.listdir(fld_path)):
        txtfile = os.path.join(fld_path, fname)
        # Only regular files can be opened and read as speeches.
        if not os.path.isfile(txtfile):
            continue
        # Explicit encoding so the result does not depend on the platform
        # default; undecodable bytes are still dropped rather than raising.
        with open(txtfile, 'r', encoding='utf-8', errors='ignore') as fobj:
            texts.append(clean_text(fobj.read()))

    # Detect frequent token pairs and merge them into single bigram tokens
    bigram = models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = models.phrases.Phraser(bigram)
    texts = [bigram_mod[text] for text in texts]

    corpus, dictionary = convert_to_corpus(texts, dictionary)

    corpus_dir = os.path.join(savedir, fld)
    os.makedirs(corpus_dir, exist_ok=True)
    # Save the corpus
    corpora.MmCorpus.serialize(os.path.join(corpus_dir, 'corpus.mm'), corpus)


# Store the dictionary
dictionary.save(os.path.join(savedir, 'corpus_dict.dict'))