In [656]:
import pandas as pd
import matplotlib.pylab as plt
import string                            #导入字符串模块

from itertools import chain

# 数据预处理

In [657]:
tab1 = "./hair_dryer.tsv"

df = pd.read_csv(tab1, sep='\t', header=0)

## 删除无效信息

In [658]:
df = df[~(df['vine'].str.contains("N") & df['verified_purchase'].str.contains("N"))]

In [659]:
# df = df[(df['total_votes'] > 0) & (df['helpful_votes'] > 0)]

In [660]:
df = df.groupby('product_parent').filter(lambda x: len(x) > 1)

In [661]:
print(len(df))

9754


# 整体文本分析

In [662]:
pattern = r"\&\#[0-9]+\;"

df["preprocessed"] = df["review_body"].str.replace(pat=pattern, repl="", regex=True)

In [663]:
import re
import nltk

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet

#import nltk resources
resources = ["wordnet", "stopwords", "punkt", \
             "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]

for resource in resources:
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        nltk.download(resource)

#create Lemmatizer object
lemma = WordNetLemmatizer()

def lemmatize_word(tagged_token):
    """ Returns lemmatized word given its tag"""
    root = []
    for token in tagged_token:
        tag = token[1][0]
        word = token[0]
        if tag.startswith('J'):
            root.append(lemma.lemmatize(word, wordnet.ADJ))
        elif tag.startswith('V'):
            root.append(lemma.lemmatize(word, wordnet.VERB))
        elif tag.startswith('N'):
            root.append(lemma.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('R'):
            root.append(lemma.lemmatize(word, wordnet.ADV))
        else:          
            root.append(word)
    return root




[nltk_data] Downloading package wordnet to /home/alphonse/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alphonse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alphonse/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/alphonse/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!


In [664]:
def lemmatize_doc(document):
    """ Tags words then returns sentence with lemmatized words"""
    lemmatized_list = []
    tokenized_sent = sent_tokenize(document)
    for sentence in tokenized_sent:
        no_punctuation = re.sub(r"[`'\",.!?()]", " ", sentence)
        tokenized_word = word_tokenize(no_punctuation)
        tagged_token = pos_tag(tokenized_word)
        lemmatized = lemmatize_word(tagged_token)
        lemmatized_list.extend(lemmatized)
    return " ".join(lemmatized_list)

In [665]:
#apply our functions
df["preprocessed"] = df["preprocessed"].apply(lambda row: lemmatize_doc(row))

print(df["preprocessed"].iloc[45])

Very powerful Dries my hair quickly


In [666]:
from unicodedata import normalize

remove_accent = lambda text: normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")

df["preprocessed"] = df["preprocessed"].apply(remove_accent)

print(df["preprocessed"].iloc[465])

For the price it be not bad


In [667]:
vocab_lenpattern = r"[^\w\s]"

df["preprocessed"] = df["preprocessed"].str.replace(pat=pattern, repl=" ", regex=True)

print(df["preprocessed"].iloc[465])

For the price it be not bad


In [668]:
df["preprocessed"] = df["preprocessed"].str.lower()

print(df["preprocessed"].iloc[465])

for the price it be not bad


In [669]:
# from nltk.corpus import stopwords

# stop_words = stopwords.words("english")

# stop_words = [word.replace("\'", "") for word in stop_words]

# print(f"sample stop words: {stop_words[:15]} \n")

# remove_stop_words = lambda row: " ".join([token for token in row.split(" ") \
#                                           if token not in stop_words])
# df["preprocessed"] = df["preprocessed"].apply(remove_stop_words)

# print(df["preprocessed"].iloc[465])

In [670]:
pattern = r"[\s]+"

df["preprocessed"] = df["preprocessed"].str.replace(pat=pattern, repl=" ", regex=True)

print(df["preprocessed"].iloc[465])

for the price it be not bad


In [671]:
corpora = df["preprocessed"].values
tokenized = [corpus.split(" ") for corpus in corpora]

for review in tokenized:
    while '' in review:
        review.remove('')

# text = list(chain(*tokenized))

text = []
for i in range(0, int(len(df) / 5)):
    text = text + tokenized[i]


print(text)


['works', 'great', 'this', 'dry', 'my', 'hair', 'faster', 'that', 'big', 'more', 'powerful', 'model', 'i', 'love', 'travel', 'blow', 'dryer', 'because', 'they', 'be', 'easy', 'to', 'lift', 'and', 'they', 'usually', 'come', 'in', '1600', 'w', 'or', 'less', 'bigger', 'dryer', 'be', 'heavy', 'and', 'blow', 'my', 'hair', 'everywhere', 'this', 'have', 'a', 'surprising', 'amount', 'of', 'power', 'and', 'be', 'very', 'compact', 'i', 'would', 'give', 'it', 'a', 'five', 'except', 'that', 'the', 'switch', 'be', 'not', 'easy', 'to', 'turn', 'on', 'and', 'off', 'with', 'one', 'hand', 'and', 'it', 's', 'noisy', 'than', 'i', 'anticipate', 'love', 'this', 'dryer', 'style', 'hair', 'in', 'style', 'excellent', 'dryer', 'i', 'find', 'everything', 'go', 'well', 'except', 'the', 'plug', 'why', 'the', 'left', 'and', 'right', 'be', 'opposite', 'and', 'i', 'have', 'to', 'put', 'the', 'plug', 'upside', 'down', 'for', 'charge', 'another', 'flaw', 'be', 'the', 'big', 'noise', 'perfect', 'i', 'really', 'like', '

### POS Tagging For Lemmatization

NLTK is again used for <b>POS tagging</b> the input text so that the words can be lemmatized based on their POS tags.

In [672]:
POS_tag = nltk.pos_tag(text)

print ("Tokenized Text with POS tags: \n")
print (POS_tag)

Tokenized Text with POS tags: 

[('works', 'VBZ'), ('great', 'JJ'), ('this', 'DT'), ('dry', 'JJ'), ('my', 'PRP$'), ('hair', 'NN'), ('faster', 'RBR'), ('that', 'WDT'), ('big', 'JJ'), ('more', 'JJR'), ('powerful', 'JJ'), ('model', 'NN'), ('i', 'NN'), ('love', 'VBP'), ('travel', 'NN'), ('blow', 'NN'), ('dryer', 'NN'), ('because', 'IN'), ('they', 'PRP'), ('be', 'VB'), ('easy', 'JJ'), ('to', 'TO'), ('lift', 'VB'), ('and', 'CC'), ('they', 'PRP'), ('usually', 'RB'), ('come', 'VBP'), ('in', 'IN'), ('1600', 'CD'), ('w', 'NN'), ('or', 'CC'), ('less', 'JJR'), ('bigger', 'JJR'), ('dryer', 'NN'), ('be', 'VB'), ('heavy', 'JJ'), ('and', 'CC'), ('blow', 'JJ'), ('my', 'PRP$'), ('hair', 'NN'), ('everywhere', 'RB'), ('this', 'DT'), ('have', 'VBP'), ('a', 'DT'), ('surprising', 'JJ'), ('amount', 'NN'), ('of', 'IN'), ('power', 'NN'), ('and', 'CC'), ('be', 'VB'), ('very', 'RB'), ('compact', 'JJ'), ('i', 'NN'), ('would', 'MD'), ('give', 'VB'), ('it', 'PRP'), ('a', 'DT'), ('five', 'CD'), ('except', 'IN'), ('th

### Lemmatization

The tokenized text (mainly the nouns and adjectives) is normalized by <b>lemmatization</b>.
In lemmatization different grammatical counterparts of a word will be replaced by single
basic lemma. For example, 'glasses' may be replaced by 'glass'. 

In [673]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

adjective_tags = ['JJ','JJR','JJS']

lemmatized_text = []

for word in POS_tag:
    if word[1] in adjective_tags:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
    else:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun
        
print ("Text tokens after lemmatization of adjectives and nouns: \n")
print (lemmatized_text)

Text tokens after lemmatization of adjectives and nouns: 

['work', 'great', 'this', 'dry', 'my', 'hair', 'faster', 'that', 'big', 'more', 'powerful', 'model', 'i', 'love', 'travel', 'blow', 'dryer', 'because', 'they', 'be', 'easy', 'to', 'lift', 'and', 'they', 'usually', 'come', 'in', '1600', 'w', 'or', 'less', 'big', 'dryer', 'be', 'heavy', 'and', 'blow', 'my', 'hair', 'everywhere', 'this', 'have', 'a', 'surprising', 'amount', 'of', 'power', 'and', 'be', 'very', 'compact', 'i', 'would', 'give', 'it', 'a', 'five', 'except', 'that', 'the', 'switch', 'be', 'not', 'easy', 'to', 'turn', 'on', 'and', 'off', 'with', 'one', 'hand', 'and', 'it', 's', 'noisy', 'than', 'i', 'anticipate', 'love', 'this', 'dryer', 'style', 'hair', 'in', 'style', 'excellent', 'dryer', 'i', 'find', 'everything', 'go', 'well', 'except', 'the', 'plug', 'why', 'the', 'left', 'and', 'right', 'be', 'opposite', 'and', 'i', 'have', 'to', 'put', 'the', 'plug', 'upside', 'down', 'for', 'charge', 'another', 'flaw', 'be', 'th

### POS tagging for Filtering

The <b>lemmatized text</b> is <b>POS tagged</b> here. The tags will be used for filtering later on.

In [674]:
POS_tag = nltk.pos_tag(lemmatized_text)

print ("Lemmatized text with POS tags: \n")
print (POS_tag)

Lemmatized text with POS tags: 

[('work', 'NN'), ('great', 'JJ'), ('this', 'DT'), ('dry', 'JJ'), ('my', 'PRP$'), ('hair', 'NN'), ('faster', 'RBR'), ('that', 'WDT'), ('big', 'JJ'), ('more', 'JJR'), ('powerful', 'JJ'), ('model', 'NN'), ('i', 'NN'), ('love', 'VBP'), ('travel', 'NN'), ('blow', 'NN'), ('dryer', 'NN'), ('because', 'IN'), ('they', 'PRP'), ('be', 'VB'), ('easy', 'JJ'), ('to', 'TO'), ('lift', 'VB'), ('and', 'CC'), ('they', 'PRP'), ('usually', 'RB'), ('come', 'VBP'), ('in', 'IN'), ('1600', 'CD'), ('w', 'NN'), ('or', 'CC'), ('less', 'JJR'), ('big', 'JJ'), ('dryer', 'NN'), ('be', 'VB'), ('heavy', 'JJ'), ('and', 'CC'), ('blow', 'JJ'), ('my', 'PRP$'), ('hair', 'NN'), ('everywhere', 'RB'), ('this', 'DT'), ('have', 'VBP'), ('a', 'DT'), ('surprising', 'JJ'), ('amount', 'NN'), ('of', 'IN'), ('power', 'NN'), ('and', 'CC'), ('be', 'VB'), ('very', 'RB'), ('compact', 'JJ'), ('i', 'NN'), ('would', 'MD'), ('give', 'VB'), ('it', 'PRP'), ('a', 'DT'), ('five', 'CD'), ('except', 'IN'), ('that', 

## POS Based Filtering

Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here
considered as a <b>stopword</b> (non-content). This is based on the assumption that usually keywords are noun,
adjectives or gerunds. 

Punctuations are added to the stopword list too.

In [675]:
stopwords = []

# wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 
wanted_POS = ['NN', 'NNS', 'NNP', 'NNPS']
for word in POS_tag:
    if word[1] not in wanted_POS:
        stopwords.append(word[0])

punctuations = list(str(string.punctuation))

stopwords = stopwords + punctuations

### Complete stopword generation

Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may
remain which are very bad candidates for being keywords (or part of it). 

An external file constituting a long list of stopwords is loaded and all the words are added with the previous
stopwords to create the final list 'stopwords-plus' which is then converted into a set. 

(Source of stopwords data: https://www.ranks.nl/stopwords)

Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. 

(The contents of this set will be later used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords, and work with a 'bag-of-words' approach. I will be developing the graph using unigram texts as vertices)

In [676]:
stopword_file = open("long_stopwords.txt", "r")
#Source = https://www.ranks.nl/stopwords

lots_of_stopwords = []

for line in stopword_file.readlines():
    lots_of_stopwords.append(str(line.strip()))

stopwords_plus = []
stopwords_plus = stopwords + lots_of_stopwords
stopwords_plus = set(stopwords_plus)

#Stopwords_plus contain total set of all stopwords

### Removing Stopwords 

Removing stopwords from lemmatized_text. 
Processeced_text condtains the result.

In [677]:
processed_text = []
for word in lemmatized_text:
    if word not in stopwords_plus:
        processed_text.append(word)
print (processed_text)

['amount', 'power', 'hand', 'style', 'style', 'year', 'month', 'day', 'week', 'isnt', 'didnt', 'style', 'voltage', 'product', 'client', 'time', 'lot', 'power', 'month', 'price', 'morning', 'thing', 'hand', 'mousse', 'oil', 'struggle', 'groove', 'year', 'smoking', 'fire', 'button', 'yr', 'weekend', 'attempt', 'refund/repair', 'unit', 'company', 'space', 'bathroom', 'blowdryer', '8-10years', '*very', 'time', '10-12minutes', 'thing', 'texture', 'blowdry', 'time', 'week', 'job', 'job', 'lot', 'thing', 'job', 'buildup', 'ability', 'hand', 'hook', 'blowdryer', 'job', 'thing', 'comb', 'blowdry', 'amount', 'wife', 'time', 'money', 'stream', 'item', 'order', 'blowdryer', 'photo', 'charm', 'voltage', 'makeover', 'instance', 'replica', 'adjustment', 'fan', 'order', 'button', 'person', 'finger', 'button', 'sliding', 'bathroom', 'watt', 'power', 'controller', 'investment', 'time', 'dust', 'year', 'style', 'comb', 'frill', 'lot', 'fan', 'button', 'product', 'appointment', 'button', 'time', 'piece', 

## Vocabulary Creation
Vocabulary will only contain unique words from processed_text.

In [678]:
vocabulary = list(set(processed_text))
print (vocabulary)

['summer', 'health', 'problem', 'cast', 'strand', 'freakinglyyy', 'heart', 'year', 'druers', 'dey', 'fan', 'guard', 'rusk', 'force', 'sieve', 'appointment', 'momma', 'lint', 'energy', 'prodcut', 'sliding', 'record', 'availability', 'w8less', 'suggestion', 'afterward', 'number', 'minipro', 'fill', 'basis', 'jojoba', 'gripe', 'short-length', 'dog', 'fuss', 'print', 'comb', 'sign', 'space', 'regard', 'class', 'glory', 'practicality', 'digress', 'device', 'refund/repair', 'amount/use', 'miracle', 'favor', 'resolve', 'mode', 'bummer', 'spec', 'fond', 'tha', 'friziness', 'shin', 'gem', 'dreds', 'button', 'transaction', 'addition', 'pole', 'makeup', 'aim', 'opening', 'pillow', 'headband', 'fr', 'grinding', 'job', 'rite', 'gimmick', 'miniature', 'spring', 'movie', 'goody', 'selector', 'time', 'film', 'humidity', 'bleach', 'steamer', 'choice', 'rimini', 'mhd', 'rvdr5033', 'charm', '-independently-', 'hour', 'shelf', 'distribute', 'pic', 'breaking', 'twinturbo', 'cable', 'back-up', '4-stars', 'b

### Building Graph

TextRank is a graph based model, and thus it requires us to build a graph. Each words in the vocabulary will serve as a vertex for graph. The words will be represented in the vertices by their index in vocabulary list.  

The weighted_edge matrix contains the information of edge connections among all vertices.
I am building wieghted undirected edges.

weighted_edge[i][j] contains the weight of the connecting edge between the word vertex represented by vocabulary index i and the word vertex represented by vocabulary j.

If weighted_edge[i][j] is zero, it means no edge connection is present between the words represented by index i and j.

There is a connection between the words (and thus between i and j which represents them) if the words co-occur within a window of a specified 'window_size' in the processed_text.

The value of the weighted_edge[i][j] is increased by (1/(distance between positions of words currently represented by i and j)) for every connection discovered between the same words in different locations of the text. 

The covered_coocurrences list (which is contain the list of pairs of absolute positions in processed_text of the words whose coocurrence at that location is already checked) is managed so that the same two words located in the same positions in processed_text are not repetitively counted while sliding the window one text unit at a time.

The score of all vertices are intialized to one. 

Self-connections are not considered, so weighted_edge[i][i] will be zero.

In [679]:
import numpy as np
import math
vocab_len = len(vocabulary)

weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

score = np.zeros((vocab_len),dtype=np.float32)
window_size = 3
covered_coocurrences = []

for i in range(0,vocab_len):
    score[i]=1
    for j in range(0,vocab_len):
        if j==i:
            weighted_edge[i][j]=0
        else:
            for window_start in range(0,(len(processed_text)-window_size)):
                
                window_end = window_start+window_size
                
                window = processed_text[window_start:window_end]
                
                if (vocabulary[i] in window) and (vocabulary[j] in window):
                    
                    index_of_i = window_start + window.index(vocabulary[i])
                    index_of_j = window_start + window.index(vocabulary[j])
                    
                    # index_of_x is the absolute position of the xth term in the window 
                    # (counting from 0) 
                    # in the processed_text
                      
                    if [index_of_i,index_of_j] not in covered_coocurrences:
                        weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
                        covered_coocurrences.append([index_of_i,index_of_j])


### Calculating weighted summation of connections of a vertex

inout[i] will contain the sum of all the undirected connections\edges associated withe the vertex represented by i.

In [680]:
inout = np.zeros((vocab_len),dtype=np.float32)

for i in range(0,vocab_len):
    for j in range(0,vocab_len):
        inout[i]+=weighted_edge[i][j]

### Scoring Vertices

The formula used for scoring a vertex represented by i is:

score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ] where j belongs to the list of vertieces that has a connection with i. 

d is the damping factor.

The score is iteratively updated until convergence. 

In [681]:
MAX_ITERATIONS = 50
d=0.85
threshold = 0.0001 #convergence threshold

for iter in range(0,MAX_ITERATIONS):
    prev_score = np.copy(score)
    
    for i in range(0,vocab_len):
        
        summation = 0
        for j in range(0,vocab_len):
            if weighted_edge[i][j] != 0:
                summation += (weighted_edge[i][j]/inout[j])*score[j]
                
        score[i] = (1-d) + d*(summation)
    
    if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
        print("Converging at iteration "+str(iter)+"....")
        break

Converging at iteration 42....


In [682]:
for i in range(0,vocab_len):
    print("Score of "+vocabulary[i]+": "+str(score[i]))

Score of summer: 1.2996584
Score of health: 0.3496778
Score of problem: 8.976176
Score of cast: 0.42226073
Score of strand: 0.37223458
Score of freakinglyyy: 0.36460793
Score of heart: 0.32495236
Score of year: 35.935574
Score of druers: 0.36084834
Score of dey: 0.29017007
Score of fan: 5.5315537
Score of guard: 0.3351998
Score of rusk: 1.1477805
Score of force: 1.4817696
Score of sieve: 0.37827507
Score of appointment: 0.3179331
Score of momma: 0.31390032
Score of lint: 1.7964054
Score of energy: 0.3144197
Score of prodcut: 0.31876722
Score of sliding: 0.325093
Score of record: 0.47977546
Score of availability: 0.3623059
Score of w8less: 0.58691466
Score of suggestion: 0.36424565
Score of afterward: 0.48228794
Score of number: 1.3013717
Score of minipro: 0.3754834
Score of fill: 0.3886885
Score of basis: 0.89302135
Score of jojoba: 0.31259298
Score of gripe: 0.33096007
Score of short-length: 0.3689216
Score of dog: 1.8485204
Score of fuss: 0.33526725
Score of print: 0.32273552
Score o

### Phrase Partiotioning

Paritioning lemmatized_text into phrases using the stopwords in it as delimeters.
The phrases are also candidates for keyphrases to be extracted. 

In [649]:
phrases = []
unit_word = ['time', 'price', 'product', 'style', 'year', 'month']

phrase = " "
for word in lemmatized_text:
    # 处理一些常见词    
    if word in unit_word:
        phrases.append([word])
        phrase = " "
    elif word in stopwords_plus:
        if phrase!= " ":
            phrases.append(str(phrase).strip().split())
        phrase = " "
    elif word not in stopwords_plus:
        phrase+=str(word)
        phrase+=" "

print("Partitioned Phrases (Candidate Keyphrases): \n")
print(phrases)

Partitioned Phrases (Candidate Keyphrases): 

[['model'], ['amount'], ['power'], ['hand'], ['style'], ['style'], ['left'], ['flaw'], ['hairdryer'], ['year'], ['month'], ['day'], ['week'], ['isnt'], ['didnt'], ['style'], ['voltage'], ['product'], ['client'], ['time'], ['lot'], ['power'], ['month'], ['price'], ['morning'], ['thing'], ['ear'], ['drawback'], ['hand'], ['mousse'], ['oil'], ['struggle'], ['groove'], ['hairdryer'], ['year'], ['smoking'], ['fire'], ['flow'], ['button'], ['flow'], ['yr'], ['weekend'], ['attempt'], ['refund/repair'], ['unit'], ['wonder'], ['company'], ['space'], ['bathroom'], ['blowdryer'], ['8-10years'], ['*very'], ['time'], ['minute'], ['10-12minutes'], ['thing'], ['texture'], ['blowdry'], ['time'], ['week'], ['job'], ['job'], ['lot'], ['thing'], ['bonnet'], ['job'], ['buildup'], ['ability'], ['hand'], ['hook'], ['diffuser'], ['blowdryer'], ['job'], ['thing'], ['comb'], ['blowdry'], ['amount'], ['wife'], ['time'], ['money'], ['stream'], ['item'], ['order'], ['

### Create a list of unique phrases.
Repeating phrases\keyphrase candidates has no purpose here, anymore.

In [650]:
unique_phrases = []

for phrase in phrases:
    if phrase not in unique_phrases:
        unique_phrases.append(phrase)

print("Unique Phrases (Candidate Keyphrases): \n")
print(unique_phrases)

Unique Phrases (Candidate Keyphrases): 

[['model'], ['amount'], ['power'], ['hand'], ['style'], ['left'], ['flaw'], ['hairdryer'], ['year'], ['month'], ['day'], ['week'], ['isnt'], ['didnt'], ['voltage'], ['product'], ['client'], ['time'], ['lot'], ['price'], ['morning'], ['thing'], ['ear'], ['drawback'], ['mousse'], ['oil'], ['struggle'], ['groove'], ['smoking'], ['fire'], ['flow'], ['button'], ['yr'], ['weekend'], ['attempt'], ['refund/repair'], ['unit'], ['wonder'], ['company'], ['space'], ['bathroom'], ['blowdryer'], ['8-10years'], ['*very'], ['minute'], ['10-12minutes'], ['texture'], ['blowdry'], ['job'], ['bonnet'], ['buildup'], ['ability'], ['hook'], ['diffuser'], ['comb'], ['wife'], ['money'], ['stream'], ['item'], ['order'], ['photo'], ['charm'], ['makeover'], ['instance'], ['replica'], ['adjustment'], ['fan'], ['person'], ['finger'], ['sliding'], ['watt'], ['controller'], ['investment'], ['dust'], ['suitcase'], ['frill'], ['appointment'], ['piece'], ['shade'], ['matte', 'ton

### Thinning the list of candidate-keyphrases.
Removing single word keyphrases-candidates that are present multi-word alternatives.

In [651]:
for word in vocabulary:
    #print word
    for phrase in unique_phrases:
        if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
            #if len(phrase)>1 then the current phrase is multi-worded.
            #if the word in vocabulary is present in unique_phrases as a single-word-phrase
            # and at the same time present as a word within a multi-worded phrase,
            # then I will remove the single-word-phrase from the list.
            unique_phrases.remove([word])
            
print("Thinned Unique Phrases (Candidate Keyphrases): \n")
print(unique_phrases)  

Thinned Unique Phrases (Candidate Keyphrases): 

[['amount'], ['style'], ['left'], ['flaw'], ['year'], ['month'], ['isnt'], ['didnt'], ['product'], ['client'], ['time'], ['lot'], ['price'], ['morning'], ['thing'], ['ear'], ['drawback'], ['mousse'], ['struggle'], ['groove'], ['smoking'], ['yr'], ['weekend'], ['attempt'], ['refund/repair'], ['unit'], ['wonder'], ['8-10years'], ['*very'], ['minute'], ['10-12minutes'], ['texture'], ['blowdry'], ['job'], ['buildup'], ['ability'], ['stream'], ['order'], ['photo'], ['charm'], ['makeover'], ['instance'], ['replica'], ['adjustment'], ['person'], ['sliding'], ['controller'], ['investment'], ['dust'], ['suitcase'], ['frill'], ['appointment'], ['shade'], ['matte', 'tone'], ['type'], ['slippery'], ['snug'], ['motor'], ['spot'], ['case'], ['delivery'], ['high/hot'], ['salonsalon'], ['friend', 'house'], ['budget'], ['sassoon'], ['load'], ['kind'], ['shin'], ['walmart', 'bargain'], ['bang'], ['blow-drying'], ['winter'], ['blaze'], ['glory'], ['refuse'

### Scoring Keyphrases

Scoring the phrases (candidate keyphrases) and building up a list of keyphrases\keywords
by listing untokenized versions of tokenized phrases\candidate-keyphrases.
Phrases are scored by adding the score of their members (words\text-units that were ranked by the graph algorithm)


In [652]:
phrase_scores = []
keywords = []
for phrase in unique_phrases:
    phrase_score=0
    keyword = ''
    for word in phrase:
        keyword += str(word)
        keyword += " "
        phrase_score+=score[vocabulary.index(word)]
    phrase_scores.append(phrase_score)
    keywords.append(keyword.strip())

i=0
for keyword in keywords:
    print ("Keyword: '"+str(keyword)+"', Score: "+str(phrase_scores[i]))
    i+=1

Keyword: 'amount', Score: 2.1058425903320312
Keyword: 'style', Score: 7.2099289894104
Keyword: 'left', Score: 0.3726743757724762
Keyword: 'flaw', Score: 0.9532197117805481
Keyword: 'year', Score: 25.97850799560547
Keyword: 'month', Score: 12.686555862426758
Keyword: 'isnt', Score: 0.3624672293663025
Keyword: 'didnt', Score: 0.8624895215034485
Keyword: 'product', Score: 25.55835723876953
Keyword: 'client', Score: 0.5462518930435181
Keyword: 'time', Score: 25.011762619018555
Keyword: 'lot', Score: 8.329447746276855
Keyword: 'price', Score: 17.667083740234375
Keyword: 'morning', Score: 1.1640429496765137
Keyword: 'thing', Score: 10.026325225830078
Keyword: 'ear', Score: 0.8045633435249329
Keyword: 'drawback', Score: 0.8815819025039673
Keyword: 'mousse', Score: 0.4094676375389099
Keyword: 'struggle', Score: 0.4453640878200531
Keyword: 'groove', Score: 0.41618090867996216
Keyword: 'smoking', Score: 0.5469945073127747
Keyword: 'yr', Score: 0.612798810005188
Keyword: 'weekend', Score: 0.63632

### Ranking Keyphrases

Ranking keyphrases based on their calculated scores. Displaying top keywords_num no. of keyphrases.

In [653]:
sorted_index = np.flip(np.argsort(phrase_scores),0)

keywords_num = 100

print("Keywords:\n")

for i in range(0,keywords_num):
    print(str(keywords[sorted_index[i]])+", Score: "+str(phrase_scores[sorted_index[i]]) )

Keywords:

year, Score: 25.97850799560547
product, Score: 25.55835723876953
time, Score: 25.011762619018555
power wife, Score: 20.85944700241089
hairdryer model, Score: 18.951599597930908
power level, Score: 18.245778560638428
price, Score: 17.667083740234375
blowing power, Score: 16.16311851143837
volumizing finger diffuser, Score: 14.751794248819351
allure button, Score: 13.12741768360138
diffuser combo, Score: 12.76504573225975
month, Score: 12.686555862426758
button position, Score: 12.592267632484436
rocker button, Score: 12.587506115436554
button interfere, Score: 12.407795757055283
speed/temperature button, Score: 12.352531522512436
button delivers, Score: 12.351224601268768
hand piece, Score: 10.67069387435913
thing, Score: 10.026325225830078
star customer service, Score: 9.622106790542603
hand today, Score: 9.54112434387207
customer service problem, Score: 9.490748167037964
mother day, Score: 9.397869229316711
bathroom space, Score: 9.199326276779175
salon brand, Score: 8.7408

## 写入文件进行保存

In [654]:
import csv

In [655]:
with open("keyword_5_noun.csv","w") as csvfile: 
    writer = csv.writer(csvfile)
    writer.writerow(["keyword","score"])
    for i in range(0,keywords_num):
        writer.writerow([str(keywords[sorted_index[i]]), phrase_scores[sorted_index[i]]])