Imports

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from textblob import TextBlob
from wordcloud import WordCloud 
from cleantext import clean

Data pipeline

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

class Text_analysis():
    """Clean one piece of text and derive sentiment features from it.

    The instance carries the working text in ``self.txt``; every cleaning or
    normalisation step rewrites that attribute in place.  ``Pipeline`` runs
    all steps in order and returns the collected features.

    Attributes:
        txt: The current (progressively cleaned) text.
        polarity: TextBlob sentiment polarity, 0 until ``get_polarity`` runs.
        subjectivity: TextBlob sentiment subjectivity, 0 until
            ``get_subjectivity`` runs.
        helpfulness: Constant 1; nothing in this class updates it.
        tokens: After tokenisation, the list of word tokens.  Before that,
            the name still refers to the legacy ``tokens()`` method.
    """

    # Class-level defaults; the instance attributes shadow them once computed.
    polarity = 0
    subjectivity = 0
    helpfulness = 1

    def __init__(self, txt):
        """Store the raw text to be analysed."""
        self.txt = txt

    def clean_emoji(self):
        """Remove emoji from ``self.txt`` via ``cleantext.clean``."""
        self.txt = clean(self.txt, no_emoji=True)
        return

    def clean_text(self):
        """Strip mentions, '#', retweet markers and URLs, then lower-case."""
        txt = self.txt
        txt = re.sub(r'@[A-Za-z0-9]+', '', txt)  # remove mentions
        txt = re.sub(r'#', '', txt)  # remove the hashtag sign, keep the word
        # Word-bounded and case-insensitive so the marker is removed whether
        # or not the text was lower-cased earlier, without eating words that
        # merely end in "rt" (e.g. "smart").
        txt = re.sub(r'\bRT\s+', '', txt, flags=re.IGNORECASE)
        # \S+ (non-whitespace) consumes the rest of the link; the previous
        # \s+ required whitespace right after "//" and so never matched.
        txt = re.sub(r'https?://\S+', '', txt, flags=re.IGNORECASE)
        self.txt = txt.lower()
        return

    def get_polarity(self):
        """Store the TextBlob sentiment polarity of ``self.txt``."""
        self.polarity = TextBlob(self.txt).sentiment.polarity
        return

    def get_subjectivity(self):
        """Store the TextBlob sentiment subjectivity of ``self.txt``."""
        self.subjectivity = TextBlob(self.txt).sentiment.subjectivity
        return

    def tokenize(self):
        """Tokenise ``self.txt`` and store the result in ``self.tokens``."""
        self.tokens = word_tokenize(self.txt)
        return

    def tokens(self):
        """Legacy alias for ``tokenize``.

        The first call replaces this method on the instance with the token
        list (same as before), so it can only be called once per instance;
        use ``tokenize`` when the step may need to run again.
        """
        self.tokenize()
        return

    def _ensure_tokens(self):
        """Tokenise on demand if ``self.tokens`` is not yet a token list."""
        if not isinstance(self.tokens, list):
            self.tokenize()

    def Stemming(self):
        """Porter-stem every token and rebuild ``self.txt`` from the stems."""
        self._ensure_tokens()
        ps = PorterStemmer()
        # Slice assignment keeps the same list object, as the original
        # index-wise update did.
        self.tokens[:] = [ps.stem(token) for token in self.tokens]
        self.txt = " ".join(self.tokens)
        return

    def Lemmatization(self):
        """Lemmatise every token and rebuild ``self.txt`` from the lemmas."""
        self._ensure_tokens()
        lm = WordNetLemmatizer()
        self.tokens[:] = [lm.lemmatize(token) for token in self.tokens]
        self.txt = " ".join(self.tokens)
        return

    def get_data(self):
        """Return ``[txt, polarity, subjectivity, tokens, helpfulness]``.

        ``tokens`` is an empty list when the text has not been tokenised yet.
        """
        token_list = self.tokens if isinstance(self.tokens, list) else []
        return [self.txt, self.polarity, self.subjectivity,
                token_list, self.helpfulness]

    def Pipeline(self):
        """Run cleaning, tokenisation, lemmatisation and sentiment scoring.

        Returns:
            The feature list produced by ``get_data``.
        """
        self.clean_emoji()
        self.clean_text()

        self.tokenize()
        self.Lemmatization()

        self.get_polarity()
        self.get_subjectivity()

        return self.get_data()



Vocabulary

In [None]:
class Vocabulary():
    """Bidirectional word <-> index mapping with word and sentence counts.

    Indices 0-2 are reserved for the padding, start-of-sentence and
    end-of-sentence markers; real words are numbered from 3 in order of
    first appearance.
    """

    PAD_token = 0   # Used for padding short sentences
    SOS_token = 1   # Start-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Class attributes are not visible as bare names inside a method, so
        # they have to be reached through ``self``.
        self.index2word = {
            self.PAD_token: "PAD",
            self.SOS_token: "SOS",
            self.EOS_token: "EOS",
        }
        self.num_words = 3  # the three reserved tokens above
        self.num_sentences = 0
        self.longest_sentence = 0

    def add_word(self, word):
        """Register ``word``, or bump its count if it is already known."""
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1

    def add_sentence(self, sentence):
        """Add every whitespace-separated word of ``sentence``.

        Also updates ``longest_sentence`` (in words) and ``num_sentences``.
        """
        # split() without an argument collapses runs of whitespace, so
        # repeated spaces no longer register the empty string as a word.
        words = sentence.split()
        for word in words:
            self.add_word(word)
        self.longest_sentence = max(self.longest_sentence, len(words))
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored at ``index``; raises KeyError if unknown."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index of ``word``; raises KeyError if unknown."""
        return self.word2index[word]

Text Rank

In [None]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Shared spaCy pipeline, loaded once at import time; TextRank4Keyword below
# reads it as a module-level global (``nlp(text)`` and ``nlp.vocab``).
nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text by ranking words in a co-occurrence graph.

    Call ``analyze`` first to compute the per-word weights, then
    ``get_keywords`` to read the highest-ranked words.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # word -> weight, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag spaCy's stop words plus ``stopwords`` in the shared vocab.

        Note that this mutates the module-level ``nlp`` object, so the extra
        stop words stay in effect for later calls as well.
        """
        for word in STOP_WORDS.union(set(stopwords or ())):
            nlp.vocab[word].is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of ``doc``, the words worth ranking.

        A token is kept when its POS tag is in ``candidate_pos`` and it is
        not a stop word; kept words are lower-cased when ``lower`` is truthy.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map each distinct word to an index in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Return the distinct ordered word pairs co-occurring in a window.

        Each word is paired with the up to ``window_size - 1`` words that
        follow it in the same sentence.  Pairs keep first-seen order.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        for sentence in sentences:
            for i, word in enumerate(sentence):
                stop = min(i + window_size, len(sentence))
                for j in range(i + 1, stop):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        # NOTE(review): if both (x, y) and (y, x) are set in ``a`` the
        # resulting edge weight is 2 rather than 1 - confirm this is intended.
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalised co-occurrence matrix.

        Args:
            vocab: Mapping word -> row/column index.
            token_pairs: Iterable of ``(word1, word2)`` edges.

        Returns:
            A ``len(vocab) x len(vocab)`` float array whose non-empty columns
            sum to 1; columns of words without any edge are all zeros.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column.  ``out`` must be supplied together with
        # ``where``: otherwise the skipped (zero-sum) columns are left as
        # uninitialised memory instead of zeros.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Return the top ``number`` keywords as ``'word - weight'`` strings.

        Returns an empty list when ``analyze`` has not produced any weights
        yet or when ``number`` is not positive.
        """
        if not self.node_weight or number <= 0:
            return []
        ranked = sorted(self.node_weight.items(),
                        key=lambda item: item[1], reverse=True)
        return [key + ' - ' + str(value) for key, value in ranked[:number]]

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=None):
        """Rank the words of ``text`` and store the result in ``node_weight``.

        Args:
            text: Raw text to analyse.
            candidate_pos: POS tags of the words eligible as keywords.
            window_size: Co-occurrence window, in words.
            lower: Lower-case the candidate words when truthy.
            stopwords: Extra stop words on top of spaCy's defaults.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or steps run out
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def Cosine_Similarity(documents=None):
    """Return the pairwise cosine-similarity matrix of TF-IDF vectors.

    Args:
        documents: Iterable of text documents to compare.  When omitted, the
            notebook-level global ``data`` is used, as before.

    Returns:
        The square similarity matrix produced by
        ``sklearn.metrics.pairwise.cosine_similarity``, one row and column
        per document.
    """
    if documents is None:
        # NOTE(review): ``data`` must hold raw text documents at call time -
        # confirm against the cell that defines it.
        documents = data

    tfidf_vect = TfidfVectorizer()
    vector_matrix = tfidf_vect.fit_transform(documents)

    # The previous ``return name = value`` was a SyntaxError.
    return cosine_similarity(vector_matrix)
    


Trending Score

In [None]:
def _keyword_average(frame, keyword, column):
    """Return the mean of ``column`` over the rows whose Keyword matches.

    The sum is divided by the number of matching rows.  The previous code
    divided by ``len(frame['Mobile'] == keyword)``, which is the length of
    the boolean mask (every row in the frame), not the number of matches.
    Returns 0.0 when no row matches.
    """
    # NOTE(review): rows are selected on 'Keyword' only; the old denominator
    # referenced a 'Mobile' column - confirm both identify the same rows.
    values = frame.loc[frame['Keyword'] == keyword, column]
    if len(values) == 0:
        return 0.0
    return float(values.sum() / len(values))


# One row per mobile:
# [name, avg views, avg likes, avg comments, avg polarity,
#  avg subjectivity, avg sentiment]
data = []

for name in mobile:
    # Count-like metrics are truncated to whole numbers, as before.
    view_count = int(_keyword_average(viewframe, name, 'view_count'))
    like_count = int(_keyword_average(viewframe, name, 'like_count'))
    comment_count = int(_keyword_average(viewframe, name, 'comment_count'))

    pol = _keyword_average(commframe, name, 'Polarity')
    subj = _keyword_average(commframe, name, 'Subjectivity')
    sent = _keyword_average(commframe, name, 'Sentiment')

    data.append([name, view_count, like_count, comment_count, pol, subj, sent])

In [None]:
# Trend score = views x likes x average sentiment, computed row by row.
engagement = df['view_count'] * df['like_count']
df['Trend_score'] = engagement * df['Avg_sentiment']