# Importing Libraries

In [None]:
import warnings

# Blanket-suppress every warning for the rest of the session so that library
# notices do not clutter the cell outputs. Both calls install an "ignore"
# action with no message, category or module restriction.
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore')


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# 1. Read the .csv file using Pandas. Take a look at the top few records

In [None]:
# Load the review export into a DataFrame and preview its first five rows.
ReviewData = pd.read_csv(filepath_or_buffer='K8 Reviews v0.2.csv')
ReviewData.head(n=5)

# 2. Normalize casings for the review text and extract the text into a list for easier manipulation.

In [None]:
def Normalize(reviews):
    """Return the lower-cased form of every review.

    Args:
        reviews : iterable of review strings
    Return:
        A new list with one lower-cased string per input review, in input order.
    """
    return [text.lower() for text in reviews]

In [None]:
# Normalize casings for the review text and extract the text into a list for easier manipulation.
NormalizeReviewText = Normalize(ReviewData['review'].values)
# Preview only the first few entries; echoing the entire list floods the cell output.
NormalizeReviewText[:5]

# 4. Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.

# 5. For the topic model, we want to include only nouns.

    Find out all the POS tags that correspond to nouns.

    Limit the data to only terms with these tags.

In [None]:
def Tokenize_POS(reviews):
    """Reduce every review to the nouns it contains.

    Each review is word-tokenized and POS-tagged with NLTK; only tokens whose
    tag is one of the noun tags (NN, NNS, NNP, NNPS) are kept.

    Args:
        reviews : iterable of review strings
    Return:
        List of strings, one per input review and in the same order: the
        review's noun tokens joined by single spaces, or an empty string when
        no token was tagged as a noun.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    TokenizeReviews = []
    for review in reviews:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(review))
        nouns = [word for word, pos in tagged_tokens if pos in noun_tags]
        # Emit exactly one entry per review that holds only its nouns.
        # Appending the full review text for each noun found would duplicate
        # reviews and leave every non-noun term in the data.
        TokenizeReviews.append(" ".join(nouns))
    return TokenizeReviews

In [None]:
# Tokenize the reviews using NLTK's word_tokenize function.
# Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.
TokenizeReviews = Tokenize_POS(NormalizeReviewText)
# Preview only the first few entries; echoing the entire list floods the cell output.
TokenizeReviews[:5]

# 6. Lemmatize. 

    Different forms of the terms need to be treated as one.
    No need to provide POS tag to lemmatizer for now.

# 7. Remove stopwords and punctuation (if there are any). 

In [None]:
# function to remove stopwords
def Remove_Stopwords(word_list, lang='english'):
    """Drop stopwords from a list of words.

    Args:
        word_list  : list of words
        lang       : name of the NLTK stopword list to use (default 'english')
    Return:
        List of the words from ``word_list`` whose lower-cased form is not in
        the stopword list, in their original order and casing.
    """
    # A set gives constant-time membership tests; the raw list returned by
    # NLTK would be scanned linearly for every single word.
    stopwords_set = set(stopwords.words(lang))
    return [w for w in word_list if w.lower() not in stopwords_set]
            

In [None]:
# function to simplify repeated punctuation
def Simplify_Punctuation(text):
    """
    Collapse runs of repeated punctuation in ``text`` (converted with ``str``).

    A run of the same character among ``! ? , ;`` shrinks to a single
    occurrence, while any run of two or more dots becomes exactly '...'.
    """
    collapsed = re.sub(r'([!?,;])\1+', r'\1', str(text))
    return re.sub(r'\.{2,}', r'...', collapsed)

In [None]:
# function to lemmatize using WordNetLemmatizer
def Lemmatize_WordNet(words_list, pos="n"):
    """Lemmatize every word with NLTK's WordNetLemmatizer.

    Args:
        words_list : list of words
        pos        : WordNet part-of-speech code handed to the lemmatizer.
                     Defaults to 'n' (noun) because the topic model in this
                     notebook is meant to be built from noun terms; treating
                     them as verbs leaves noun plurals unmerged and rewrites
                     nouns such as "bit" to "bite". Pass 'v' to lemmatize as verbs.
    Return:
        List with one lemma per input word, in input order.
    """
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(word, pos=pos) for word in words_list]

In [None]:
def tokenize(txt):
    """Split a document into sentences and each sentence into lower-cased words.

    Apostrophes are removed first, the text is split with ``sent_tokenize``,
    and for every sentence only the ``\\w+`` runs are kept (which drops
    punctuation), re-joined with spaces, lower-cased and word-tokenized.

    Args:
        txt  : text document
    Return:
        List of sentences, each one a list of word tokens.
    """
    tokenized_sentences = []
    for sentence in sent_tokenize(txt.replace("'", "")):
        word_runs = re.findall(r'\w+', sentence, flags=re.UNICODE)
        cleaned_sentence = " ".join(word_runs).lower()
        tokenized_sentences.append(word_tokenize(cleaned_sentence))
    return tokenized_sentences

In [None]:
def Apply_Stopwords_punctuation_lemmatize(reviews):
    """Clean every review: simplify punctuation, drop stopwords, lemmatize.

    Each review is passed through ``Simplify_Punctuation`` and ``tokenize``;
    every resulting sentence has its stopwords removed and its words
    lemmatized. Sentences left with fewer than 3 words are discarded.

    Args:
        reviews : iterable of review texts
    Return:
        List of strings, one per input review: the surviving sentences of the
        review joined by single spaces (an empty string when none survive).
    """
    PreprocessReviews = []
    for raw_review in reviews:
        kept_sentences = []
        for sentence_tokens in tokenize(Simplify_Punctuation(raw_review)):
            lemmas = Lemmatize_WordNet(Remove_Stopwords(sentence_tokens))
            # Only sentences with at least 3 remaining words are kept.
            if len(lemmas) >= 3:
                kept_sentences.append(" ".join(lemmas))
        PreprocessReviews.append(" ".join(kept_sentences))
    return PreprocessReviews

Lemmatize. 
Different forms of the terms need to be treated as one.
No need to provide POS tag to lemmatizer for now.
Remove stopwords and punctuation (if there are any). 

In [None]:
PreProcessReviews = Apply_Stopwords_punctuation_lemmatize(TokenizeReviews)
# Preview only the first few entries; echoing the entire list floods the cell output.
PreProcessReviews[:5]

# 8. Create a topic model using LDA on the cleaned-up data with 12 topics.

    Print out the top terms for each topic.
    What is the coherence of the model with the c_v metric?

In [None]:
# Split every cleaned review into a list of word tokens for gensim.
# NOTE: this rebinds TokenizeReviews, which previously held the output of
# Tokenize_POS, to a list of token lists.
TokenizeReviews = [nltk.word_tokenize(cleaned_review) for cleaned_review in PreProcessReviews]

In [None]:
# Build the gensim dictionary from the tokenized reviews.
id2word = corpora.Dictionary(TokenizeReviews)

# The tokenized reviews are the documents of the corpus.
texts = TokenizeReviews

# Encode every document as a bag of words via the dictionary.
corpus = [id2word.doc2bow(document) for document in texts]

# Inspect the first encoded document and the dictionary entry with id 0.
print(corpus[:1])
print(id2word[0])

# Same first document, with each id replaced by its dictionary entry.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]


In [None]:
# Train an LDA topic model with 12 topics on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=12,
    random_state=100,       # fixed seed so that reruns are repeatable
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Print the keywords of the 12-topic model
pprint(lda_model.print_topics())
# Index the model with the whole corpus and keep the result in doc_lda
# (doc_lda is not read again in the cells shown here).
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (higher, i.e. closer
# to zero, is better), not the perplexity itself. The perplexity is
# 2 ** (-bound), and for that number lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=TokenizeReviews, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Analyze the topics through the business lens.


Here are the possible topic headers

  0 - Possible Topic - Lenovo Note K8 (1)
  
  1 - Possible Topic - First Touch Phone (2)
  
  2 - Possible Topic - Charging Review (3)
  
  3 - Possible Topic - Review on sensor time (4)
  
  4 - Possible Topic - Positive Mobile Review (5) 
  
  5 - Possible Topic - Picture quality (6)
  
  6 - Possible Topic - Positive Review (5)
  
  7 - Possible Topic - Review on Processor (7)
  
  8 - Possible Topic - Positive Review (5)
  
  9 - Possible Topic - Negative Review (8)
  
  10 - Possible Topic - Review on Return policy (9)
  
  11 - Possible Topic - Review on software update (10)
  
  # Determine which of the topics can be combined.

  
  After combining the overlapping topics above, 10 distinct topics remain.

# 10. Create a topic model using LDA with what you think is the optimal number of topics

    What is the coherence of the model?

In [None]:
# Train the LDA topic model again, this time with 10 topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,       # fixed seed so that reruns are repeatable
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Print the keywords of the 10-topic model
pprint(lda_model.print_topics())
# Index the model with the whole corpus and keep the result in doc_lda
# (doc_lda is not read again in the cells shown here).
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (higher, i.e. closer
# to zero, is better), not the perplexity itself. The perplexity is
# 2 ** (-bound), and for that number lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=TokenizeReviews, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# The business should be able to interpret the topics.

    Name each of the identified topics.

    Create a table with the topic name and the top 10 terms in each to present to the business.


Here are the possible topics and the top words for each topic:

(Topic 1: General Review, 

  Words: "heat" , "product" , "update" , "days" , "1" , "play" , "software" , "need" , "user" , "ok"
  ),
  
 (Topic 2: Review on Lenovo Note K8,
 
  Words: "lenovo" , "note" , "k8" , "first" , "u" , "previous" , "mobiles" , "still" , "face" , "office"
  ),
  
 (
  Topic 3: Review on Charging time ,
  
  Words: "work" , "use" , "charge" , "get" , "take" , "4" , "2" , "5" , "like" , "charger"
  ),
  
 (
  Topic 4: Review on Sensor time,
  
  Words: "time" , "bite" , "sensor" , "back" , "android" , "image" , "dedicate" , "stock" , "lot" , "music" 
  ),
  
 (
  Topic 5: Negative Review,
  
  Words: "phone" , "buy" , "dont" , "better" , "get" , "compare" , "one" , "worst" , "last" , "service"
  ),
  
 (
  Topic 6: Review on redmi ,
  
  Words: "poor" , "dual" , "much" , "make" , "life" , "8" , "purchase" , "provide" , "redmi" , "two"
  ),
  
 (
  Topic 7: Review on camera,
  
  Words: "good" , "camera" , "quality" , "issue" , "game" , "also" , "clarity" , "average" , "screen" , "light"
  ),
  
 (
  Topic 8: Review on network,
  
  Words: "doesnt" , "call" , "even" , "bad" , "network" , "many" , "cant" , "support" , "full" , "find"
  ),
 
 (
  Topic 9: Review on battery life,
  
  Words: "battery" , "feature" , "mode" , "fast" , "drain" , "great" , "speed" , "nice" , "device" , "really"
  ),
  
 (
   Topic 10: Review on price, 
   
   Words: "mobile" , "amazon" , "problem" , "price" , "awesome" , "hai" , "return" , "properly" , "best" , "hang"
  )

# By:Abdullah Alwabel