# Latent Dirichlet Allocation
- LDA
- topic modeling algorithm (statistical method)

## - Fetches the data names as 20newsgroup from sklearn dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
# newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

## - Storing the fetched data into corpus_temp

In [2]:
corpus_temp = newsgroups_train.data

# - Manual pre-processing techniques
 *Commented out because it is too time consuming for large dataset*

In [3]:
# from nltk.corpus import stopwords
# from string import punctuation
# from gensim import corpora, models, similarities
# from nltk.stem import WordNetLemmatizer
# # from nltk.stem import SnowballStemmer

# # stemmer = SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

# stoplist = stopwords.words('english')
# numbers_punc = [str(i) for i in range(10)] + list(punctuation)

# preprocessed_corpus = []
# i = 1
# for sentence in corpus:
#     word_list = sentence.lower().split()
#     temp = []
#     for word in word_list :
#         if word not in stoplist and len(word)>3:
#             letter_temp = []
#             for letter in word:
#                 if letter not in numbers_punc :
                    
#                         letter_temp.append(letter)
#             if len(letter_temp)!=0 :   
#                 temp.append("".join(letter_temp))
#             temp = [lemmatizer.lemmatize(word,pos='v') for word in temp]
#     preprocessed_corpus.append(temp)
#     print(i)
#     i+=1

# #initialize a dictionary (value,key)=(5,'elon') means word('elon') is repitated 5 times.
# dictionary = corpora.Dictionary(preprocessed_corpus)
# preprocessed_corpus[:5]

## Automatic pre-processing using Gensim Library
*Snowball Stemmer and Wordnet Lemmatizer from nltk library is used for stemming and lemmatization purpose. Stopwords are are removed and the length of words less than 3 are discarded.* 

In [4]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
import gensim
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result
    
processed_docs = []

for doc in newsgroups_train.data:
    processed_docs.append(preprocess(doc))

print(processed_docs[:2])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

## - Creating the dictionary and Filtering the extremes
Creating the dictionary: We create the dictionary of (v:k) pair where v is the number of repitition of words and k is the word.

Filtering the extremese: This is an optional step that removes the extreme less and extreme high frequent words.

In [5]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
dictionary = gensim.corpora.Dictionary(processed_docs)
'''
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

## - Document to BOW(Bag Of Words):
We create (token_id, token_count) form for each document using doc2bow.

In [6]:
# for each document, (token_id,token_count) is maintained
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

## Creating LDA Model
The common parameters used in LDA are:
- num_topics: number of latent topics to be extracted from the corpus.
- passes: number of training passes through the corpus
- workers: number of extra processes to use for parallelization. Uses all available cores by default.
- id2word: mapping from word ids (integers) to words (strings).

In [7]:
from gensim import models
lda_model = models.LdaMulticore(bow_corpus,
                            num_topics = 8,
                            id2word = dictionary,
                            passes = 20,
                            workers = 2)

## Printing words occuring in the topic and its relative weight:
For each topic, we will explore the words occuring in that topic and its relative weight.

In [8]:
lda_model.print_topics()

[(0,
  '0.007*"presid" + 0.006*"american" + 0.005*"govern" + 0.004*"clinton" + 0.004*"money" + 0.004*"nation" + 0.004*"health" + 0.004*"cleveland" + 0.003*"talk" + 0.003*"happen"'),
 (1,
  '0.015*"game" + 0.013*"team" + 0.010*"space" + 0.009*"play" + 0.009*"nasa" + 0.008*"player" + 0.006*"hockey" + 0.005*"season" + 0.005*"toronto" + 0.005*"orbit"'),
 (2,
  '0.010*"armenian" + 0.008*"israel" + 0.007*"kill" + 0.007*"isra" + 0.006*"govern" + 0.006*"turkish" + 0.005*"jew" + 0.005*"weapon" + 0.005*"arab" + 0.004*"crime"'),
 (3,
  '0.011*"christian" + 0.007*"jesus" + 0.006*"exist" + 0.005*"moral" + 0.004*"bibl" + 0.004*"word" + 0.004*"religion" + 0.004*"life" + 0.004*"church" + 0.004*"evid"'),
 (4,
  '0.008*"drive" + 0.006*"bike" + 0.006*"power" + 0.005*"wire" + 0.005*"engin" + 0.004*"car" + 0.004*"light" + 0.004*"speed" + 0.003*"turn" + 0.003*"littl"'),
 (5,
  '0.022*"window" + 0.020*"file" + 0.011*"program" + 0.010*"drive" + 0.008*"card" + 0.007*"scsi" + 0.007*"version" + 0.007*"disk" + 0.

## Interpreting results:
- Topic 0: Politics
    It contains word like president, american, govern, clinton,... This might be Politics.
- Topic 1: Sports
    It contains word like game, team, play, player,... This might be Sports.
- Topic 2: Crime and Violence
    It contains word like kill, weapon, crime,... This might be Violence.
- Topic 3: Religion
    It contains word like christian, jesus, bible, religion,... This might be Religion.
- Topic 4: Automobile
    It contains word like drive, bike, engine, car,... This might be Automobile.
- Topic 5: Graphic Cards
    It contains word like file, program, drive, card, version, driver,... This might be Graphic Cards. 
- Topic 6: Security
    It contains word like encrypt, chip, secure, clipper, key, privacy,... This might be Security.
- Topic 7: Technology
    It contains words like software, image, data, program,... This might be Technology.