In [50]:
import pandas as pd
from nltk.stem.porter import PorterStemmer
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')
nltk.download('wordnet')

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
% matplotlib inline



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/viveckh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/viveckh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
UsageError: Line magic function `%` not found.


In [51]:
dataset = pd.read_csv("./reviews.csv", delimiter=',')
dataset.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,38585,129120,2010-10-28,55877,Ritchie,Evelyne is an accommodating host who lives in ...
1,38585,147273,2010-11-30,279973,Cathy,Evelyne was very welcoming to her home; my fri...
2,38585,198797,2011-03-14,411638,,I really enjoyed Evelyne's welcoming and bubbl...
3,38585,201932,2011-03-17,441855,Bill,Very gracious host and was helpful in all aspe...
4,38585,341616,2011-06-28,657560,Joakim,Evelyn was very friendly and easy to comunicat...


In [52]:
dataset_df = dataset[dataset['comments'] != ''].reset_index(drop=True)
dataset_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,38585,129120,2010-10-28,55877,Ritchie,Evelyne is an accommodating host who lives in ...
1,38585,147273,2010-11-30,279973,Cathy,Evelyne was very welcoming to her home; my fri...
2,38585,198797,2011-03-14,411638,,I really enjoyed Evelyne's welcoming and bubbl...
3,38585,201932,2011-03-17,441855,Bill,Very gracious host and was helpful in all aspe...
4,38585,341616,2011-06-28,657560,Joakim,Evelyn was very friendly and easy to comunicat...


In [53]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171351 entries, 0 to 171350
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     171351 non-null  int64 
 1   id             171351 non-null  int64 
 2   date           171351 non-null  object
 3   reviewer_id    171351 non-null  int64 
 4   reviewer_name  171350 non-null  object
 5   comments       171301 non-null  object
dtypes: int64(3), object(3)
memory usage: 7.8+ MB


In [54]:
paragraphs_df = dataset_df[['listing_id', 'reviewer_id', 'comments']]

## Word Counts

In [55]:
paragraphs_df['word_count'] = paragraphs_df['comments'].apply(lambda x: len(str(x).split(" ")))
paragraphs_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,listing_id,reviewer_id,comments,word_count
0,38585,55877,Evelyne is an accommodating host who lives in ...,33
1,38585,279973,Evelyne was very welcoming to her home; my fri...,21
2,38585,411638,I really enjoyed Evelyne's welcoming and bubbl...,52
3,38585,441855,Very gracious host and was helpful in all aspe...,18
4,38585,657560,Evelyn was very friendly and easy to comunicat...,31


### Most Common and Uncommon Words

In [56]:
str_of_abstracts = ' '.join(str(abstract) for abstract in paragraphs_df['comments'])
frequency = pd.Series(str_of_abstracts.split(' ')).value_counts()[:20]
frequency

and      393838
to       256341
the      256085
a        223607
was      201584
         126996
in       116978
is       115533
for       99457
The       93376
of        91747
very      76200
we        75533
place     70253
We        66647
I         66474
great     65735
with      65454
stay      62157
our       55460
dtype: int64

In [57]:
frequency = pd.Series(str_of_abstracts.split(' ')).value_counts()[-20:]
frequency

Band-Aid                 1
driveway.\n\nOverall,    1
(bouchon                 1
basics.\n\nAnd           1
cercanos.\nCama          1
trails.\nWould           1
recomandation            1
offer--it's              1
(complimentary!),        1
surface!                 1
\nthank                  1
outrunning               1
\n\nDescription          1
Petanque                 1
representatives          1
\r\n\r\nDavid's          1
fire-stick               1
clincher                 1
successfully).           1
pillows\n-snacks         1
dtype: int64

### Removing stopwords
### Normalizing the words using Stemming and Lemmatization

Stemming normalizes text by removing suffixes.

Lemmatization normalizes text based on the root of the word (more advanced).

Probably better focusing on stemming for colloquial words

In [58]:
stop_words = set(stopwords.words("english"))
contextual_stop_words = ['any']
stop_words = stop_words.union(contextual_stop_words)

In [59]:
def prep_the_corpus(text, stop_word_list):

    text = str(text)
    #Remove punctuations (Match everything not in the alphabet range)
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    #Convert to lowercase
    text = text.lower()
    
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    
    ##Convert to list from string
    text = text.split()
    
    ##Stemming
    ps=PorterStemmer()
    
    #Lemmatisation
    lem = WordNetLemmatizer()
    
    text = [lem.lemmatize(word) for word in text if not word in stop_words] 
    text = " ".join(text)
    return text



In [60]:
paragraphs_df['corpus'] = paragraphs_df['comments'].apply(prep_the_corpus, args=(stop_words,))
paragraphs_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,listing_id,reviewer_id,comments,word_count,corpus
0,38585,55877,Evelyne is an accommodating host who lives in ...,33,evelyne accommodating host life beautiful rura...
1,38585,279973,Evelyne was very welcoming to her home; my fri...,21,evelyne welcoming home friend enjoyed company ...
2,38585,411638,I really enjoyed Evelyne's welcoming and bubbl...,52,really enjoyed evelyne welcoming bubbly person...
3,38585,441855,Very gracious host and was helpful in all aspe...,18,gracious host helpful aspect finding home info...
4,38585,657560,Evelyn was very friendly and easy to comunicat...,31,evelyn friendly easy comunicate hous clean mad...


### Vectorization

`max_df` ignoring terms that have a an occurence higher than the given threshold - corpus specific stop words

In [61]:
cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10000, ngram_range=(1,3))
count_vector = cv.fit_transform(paragraphs_df['corpus'])

In [62]:
# Peepin into the vocabs
list(cv.vocabulary_.keys())[:20]

['evelyne',
 'accommodating',
 'host',
 'life',
 'beautiful',
 'rural',
 'area',
 'outside',
 'asheville',
 'recommend',
 'staying',
 'looking',
 'peace',
 'quiet',
 'natural',
 'setting',
 'accommodating host',
 'area outside',
 'outside asheville',
 'asheville recommend']

### Most frequent topics

In [63]:
def get_top_n_grams(corpus, n, no_of_entries=None):
    """
    n refers to no of grams - uni-grams, bi-grams, tri-grams, etc
    So pass the numbers like 1, 2, 3, etc
    """
    
    vec = CountVectorizer(ngram_range=(n, n), max_features=2500).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return pd.DataFrame(words_freq[:no_of_entries])

In [64]:
top_bigrams_df = get_top_n_grams(paragraphs_df['corpus'], 2, 25)
top_bigrams_df

Unnamed: 0,0,1
0,great location,18179
1,would definitely,14652
2,downtown asheville,13867
3,highly recommend,13378
4,place stay,12300
5,great place,11843
6,definitely stay,10932
7,great host,8432
8,close downtown,8398
9,west asheville,8019


## TF-IDF

TF  = freq of term in doc / total number of terms in the doc

IDF = log(total number of docs) / Number of docs with the term

In [65]:
count_vector

<171351x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 5580811 stored elements in Compressed Sparse Row format>

In [66]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(count_vector)

# get feature names
feature_names=cv.get_feature_names()

In [67]:
def sort_coo(coo_matrix):
    """To sort the tf_idf result in descending order""" 
    
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [68]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [69]:
def extract_keywords(tfidf_transformer, countVectorizer, document):
    
    #generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(countVectorizer.transform([document]))
    
    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    
    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,5)
    
    return document, keywords 
    

### Make the final call

In [78]:
resp_doc, resp_keywords = extract_keywords(tfidf_transformer, cv, paragraphs_df['comments'].values[34521])
print(resp_doc, "\n\n", resp_keywords)

House was great, super comfortable and nicely designed! We had a wonderful weekend, less than 5 minute Uber ride to downtown! Chris was very accommodating and there right on time to let us in the house. Would definitely recommend the Montford house! 

 {'house': 0.263, 'comfortable nicely': 0.263, 'house would definitely': 0.262, 'minute uber ride': 0.233, 'wonderful weekend': 0.232}
