# **Amazon: Reviews (Sports and Outdoors)**

In [2]:
# import libraries
import nltk
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns

# spacy for basic preprocessing, optional, can use nltk as well (lemmatisation etc.)
# import spacy

# gensim for LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
#from pyLDAvis import gensim_models as pg
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import pandas as pd

In [3]:


data = ('Sports_and_Outdoors_5.json.gz')

FileNotFoundError: [Errno 2] No such file or directory: 'Sports_and_Outdoors_5.json.gz'

In [None]:
print(data.head(2))
print(len(data))
print('Unique Products')
print(len(data.groupby('asin')))
print('Unique Users')
print(len(data.groupby('reviewerID')))

   overall  verified  reviewTime      reviewerID        asin reviewerName  \
0      5.0      True  06 3, 2015  A180LQZBUWVOLF  0000032034   Michelle A   
1      1.0      True  04 1, 2015   ATMFGKU5SVEYY  0000032034    Crystal R   

                                          reviewText  \
0            What a spectacular tutu! Very slimming.   
1  What the heck? Is this a tutu for nuns? I know...   

                     summary  unixReviewTime style vote image  
0                 Five Stars      1433289600   NaN  NaN   NaN  
1  Is this a tutu for nuns?!      1427846400   NaN  NaN   NaN  
2839940
Unique Products
104687
Unique Users
332447


# Preprocessing and cleaning the data

In [None]:
## Creating a copy
process_reviews=data.copy()

## Checking for null values
process_reviews.isnull().sum()

overall                 0
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          279
reviewText           1114
summary               611
unixReviewTime          0
style             1242772
vote              2461060
image             2775405
dtype: int64

In [None]:
## Clear NAs in reviewText
process_reviews['reviewText']=process_reviews['reviewText'].fillna('Missing')

In [None]:
## Combine review text and summary column
process_reviews['reviews']=process_reviews['reviewText']+process_reviews['summary']
process_reviews=process_reviews.drop(['reviewText', 'summary'], axis=1)
process_reviews = process_reviews.drop(columns=['style', 'vote', 'image'])

In [None]:
## Figuring out the distribution of categories
process_reviews['overall'].value_counts()

5.0    1921398
4.0     495533
3.0     210215
1.0     111157
2.0     101637
Name: overall, dtype: int64

In [None]:
def f(row):
    if row['overall']== 3.0:
        val = 'Neutral'
    elif row['overall']==1.0 or row['overall']==2.0:
        val = 'Negative'
    elif row['overall'] ==4.0 or row['overall']==5.0:
        val = 'Positive'
    else:
        val = -1
    return val

process_reviews['Sentiment'] = process_reviews.apply(f, axis = 1)
process_reviews.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,unixReviewTime,reviews,Sentiment
0,5.0,True,"06 3, 2015",A180LQZBUWVOLF,32034,Michelle A,1433289600,What a spectacular tutu! Very slimming.Five Stars,Positive
1,1.0,True,"04 1, 2015",ATMFGKU5SVEYY,32034,Crystal R,1427846400,What the heck? Is this a tutu for nuns? I know...,Negative
2,5.0,True,"01 13, 2015",A1QE70QBJ8U6ZG,32034,darla Landreth,1421107200,Exactly what we were looking for!Five Stars,Positive
3,5.0,True,"12 23, 2014",A22CP6Z73MZTYU,32034,L. Huynh,1419292800,I used this skirt for a Halloween costume and ...,Positive
4,4.0,True,"12 15, 2014",A22L28G8NRNLLN,32034,McKenna,1418601600,This is thick enough that you can't see throug...,Positive


In [None]:
## New data frame which has date and year
new = process_reviews["reviewTime"].str.split(",", n = 1, expand = True)

## Separate date column from new data frame
process_reviews["date"]= new[0]

## Separate year column from new data frame
process_reviews["year"]= new[1]
process_reviews=process_reviews.drop(['reviewTime'], axis=1)

## Splitting the date
new1 = process_reviews["date"].str.split(" ", n = 1, expand = True)

## Adding month to the main dataset
process_reviews["month"]= new1[0]

## Adding day to the main dataset
process_reviews["day"]= new1[1]
process_reviews=process_reviews.drop(['date'], axis=1)

In [None]:
## Removing unnecessary columns
process_reviews=process_reviews.drop(['reviewerName','unixReviewTime','reviewerID','asin'], axis=1)

## Creating a copy
clean_reviews=process_reviews.copy()

In [None]:
import re
import string

def review_cleaning(text):

    text = str(text).lower() # lowercase text
    text = re.sub('\[.*?\]', '', text) # remove text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove links
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text) # remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    return text

process_reviews['reviews']=process_reviews['reviews'].apply(lambda x:review_cleaning(x))
process_reviews.head()

Unnamed: 0,overall,verified,reviews,Sentiment,year,month,day
0,5.0,True,what a spectacular tutu very slimmingfive stars,Positive,2015,6,3
1,1.0,True,what the heck is this a tutu for nuns i know y...,Negative,2015,4,1
2,5.0,True,exactly what we were looking forfive stars,Positive,2015,1,13
3,5.0,True,i used this skirt for a halloween costume and ...,Positive,2014,12,23
4,4.0,True,this is thick enough that you cant see through...,Positive,2014,12,15


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiaotianyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
## Remove all the stop words in the review column
stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each',
             'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
             'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above',
             'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't",
             'very', 'should', 'any', 'y', 'isn', 'who',  'a', 'they', 'to', 'too', "should've", 'has', 'before',
             'into', 'yours', "it's", 'do', 'against', 'on',  'now', 'her', 've', 'd', 'by', 'am', 'from',
             'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
             'his', 'himself', 'ourselves',  'was', 'through', 'out', 'below', 'own', 'myself', 'theirs',
             'me', 'why', 'once',  'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
             'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
             'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']

process_reviews['reviews'] = process_reviews['reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# Plotting some visualizations

In [None]:
## Year vs Sentiment count

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))

process_reviews.groupby(['year', 'sentiment'])['sentiment'].count().unstack().plot(legend=True, ax=ax)
ax.set_title('Year vs Sentiment count')
ax.set_xlabel('Year')
ax.set_ylabel('Sentiment count')
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
## Review Rating Distribution

import plotly.graph_objs as go
import plotly.offline as pyo

data = [go.Histogram(x=process_reviews['overall'])]

layout = go.Layout(
    title='Review Rating Distribution',
    xaxis=dict(title='Rating'),
    yaxis=dict(title='Count'),
)

fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

In [None]:
## Filtering data

review_pos = process_reviews[process_reviews["sentiment"]=='Positive'].dropna()
review_neg = process_reviews[process_reviews["sentiment"]=='Negative'].dropna()

In [None]:
## Wordcloud-Positive reviews

from wordcloud import WordCloud
from wordcloud import WordCloud, STOPWORDS

text = review_pos["reviews"]
wordcloud = WordCloud(
    width = 1000,
    height = 500,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
## Wordcloud-Negative reviews

from wordcloud import WordCloud
from wordcloud import WordCloud, STOPWORDS

text = review_neg["reviews"]
wordcloud = WordCloud(
    width = 1000,
    height = 500,
    background_color = 'black',
    stopwords = stop_words).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
## Bigram analysis

from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

def horizontal_bar_chart(df, color):
    trace = go.Bar(
        y=df["word"][::-1],
        x=df["wordcount"][::-1],
        showlegend=False,
        orientation = 'h',
        marker=dict(
            color=color,
        ),
    )
    return trace

## Custom function for ngram generation
def generate_ngrams(text, n_gram=1):
    token = [token for token in text.lower().split(" ") if token != "" if token not in STOPWORDS]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [" ".join(ngram) for ngram in ngrams]

## Get the bar chart from positive reviews
freq_dict = defaultdict(int)
for sent in review_pos["reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(25), 'green')

## Get the bar chart from negative reviews
freq_dict = defaultdict(int)
for sent in review_neg["reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(25), 'blue')

## Creating two subplots
fig = make_subplots(rows=2, cols=1, vertical_spacing=0.04,horizontal_spacing=0.25,
                          subplot_titles=["Bigram plots of Positive reviews",
                                          "Bigram plots of Negative reviews"
                                          ])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)

## Plotting the graphs
fig['layout'].update(height=1000, width=800, paper_bgcolor='rgb(233,233,233)', title="Bigram Plots")
iplot(fig, filename='word-plots')

# Topic Modeling

In [None]:

process_reviews['Num_words_text'] = process_reviews['reviews'].apply(lambda x:len(str(x).split()))
print('-------process_reviewsset --------')
print(process_reviews['overall'].value_counts())
print(len(process_reviews))
print('-------------------------')
max_review_process_reviews_sentence_length  = process_reviews['Num_words_text'].max()

mask = (process_reviews['Num_words_text'] < 100) & (process_reviews['Num_words_text'] >15)
process_reviews_short_reviews = process_reviews[mask]
process_reviews_sampled = process_reviews_short_reviews.groupby('overall').apply(lambda x: x.sample(n=20000)).reset_index(drop = True)

print('No of Short reviews')
print(len(process_reviews_short_reviews))

-------process_reviewsset --------
5.0    1921398
4.0     495533
3.0     210215
1.0     111157
2.0     101637
Name: overall, dtype: int64
2839940
-------------------------
No of Short reviews
1498634


In [None]:
process_reviews_short_reviews.head()

Unnamed: 0,overall,verified,reviews,Sentiment,year,month,day,Num_words_text
1,1.0,True,what the heck is this a tutu for nuns i know y...,Negative,2015,4,1,44
3,5.0,True,i used this skirt for a halloween costume and ...,Positive,2014,12,23,92
4,4.0,True,this is thick enough that you cant see through...,Positive,2014,12,15,37
8,3.0,True,more of a road map than a useful topographical...,Neutral,2016,12,12,45
9,5.0,True,comprehensive atlas very happy with how much d...,Positive,2016,8,13,39


In [None]:
process_reviews_short_reviews.to_csv('process_reviews_short_reviews.csv',index=True)


In [None]:
process_reviews_short_reviews.head()

Unnamed: 0,overall,verified,reviews,Sentiment,year,month,day,Num_words_text
1,1.0,True,what the heck is this a tutu for nuns i know y...,Negative,2015,4,1,44
3,5.0,True,i used this skirt for a halloween costume and ...,Positive,2014,12,23,92
4,4.0,True,this is thick enough that you cant see through...,Positive,2014,12,15,37
8,3.0,True,more of a road map than a useful topographical...,Neutral,2016,12,12,45
9,5.0,True,comprehensive atlas very happy with how much d...,Positive,2016,8,13,39


In [None]:
process_reviews_short_reviews['year'] = pd.to_datetime(process_reviews_short_reviews['year'])
before = process_reviews_short_reviews[process_reviews_short_reviews['year'] < '2012']
after = process_reviews_short_reviews[(process_reviews_short_reviews['year'] > '2013') & (process_reviews_short_reviews['year'] < '2015')]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  process_reviews_short_reviews['year'] = pd.to_datetime(process_reviews_short_reviews['year'])


In [None]:
before.head()

Unnamed: 0,overall,verified,reviews,Sentiment,year,month,day,Num_words_text
27,5.0,True,arizona is a spectacular state there is so mu...,Positive,2008-01-01,9,6,64
67,1.0,True,waste of money in my opinion fails to show im...,Negative,2010-01-01,5,18,16
68,5.0,True,i have already made great use of this gazeteer...,Positive,2010-01-01,2,6,42
70,5.0,False,ok i am new to the world of fly fishing but ...,Positive,2008-01-01,8,8,63
119,4.0,True,these bands are high quality and very easy to ...,Positive,2011-01-01,3,25,43


In [None]:
after.head()

Unnamed: 0,overall,verified,reviews,Sentiment,year,month,day,Num_words_text
3,5.0,True,i used this skirt for a halloween costume and ...,Positive,2014-01-01,12,23,92
4,4.0,True,this is thick enough that you cant see through...,Positive,2014-01-01,12,15,37
25,5.0,True,these gazetters are amazing i own them for ny ...,Positive,2014-01-01,11,9,80
60,4.0,True,i have several of these they all have proven v...,Positive,2014-01-01,4,13,23
61,5.0,True,i hunt and fish in arizona and i need a map th...,Positive,2014-01-01,3,21,69


In [None]:
## Before 2012

In [None]:
# before 2012
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
before['reviews']=before['reviews'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  before['reviews']=before['reviews'].apply(remove_stopwords)


In [None]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']):
       output = []
       for sent in texts:
                doc = nlp(sent)
                output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

In [None]:
text_list=before['reviews'].tolist()
print(text_list[1])
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

waste money opinion  fails show important mountains ridges terraindisapointing detail
['money', 'opinion', 'important', 'mountain', 'ridge', 'detail']


In [None]:
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]

In [None]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix,
                id2word=dictionary,
                num_topics=10,
                random_state=100,
                chunksize=1000,
                passes=50,
                iterations=100)

In [None]:
lda_model.print_topics(10)

[(0,
  '0.058*"good" + 0.044*"great" + 0.041*"price" + 0.037*"quality" + 0.035*"product" + 0.017*"gun" + 0.016*"case" + 0.012*"high" + 0.011*"nice" + 0.011*"money"'),
 (1,
  '0.153*"light" + 0.036*"battery" + 0.036*"bright" + 0.026*"tent" + 0.023*"road" + 0.018*"chain" + 0.018*"basket" + 0.018*"helmet" + 0.013*"leak" + 0.011*"dark"'),
 (2,
  '0.033*"great" + 0.025*"time" + 0.022*"good" + 0.019*"product" + 0.018*"year" + 0.015*"use" + 0.014*"easy" + 0.012*"old" + 0.012*"day" + 0.010*"last"'),
 (3,
  '0.034*"short" + 0.021*"sight" + 0.017*"pad" + 0.016*"comfortable" + 0.013*"front" + 0.011*"scope" + 0.011*"ring" + 0.011*"padding" + 0.010*"little" + 0.010*"mat"'),
 (4,
  '0.029*"band" + 0.027*"tool" + 0.019*"great" + 0.018*"easy" + 0.018*"use" + 0.017*"workout" + 0.017*"kit" + 0.017*"wrist" + 0.016*"work" + 0.016*"product"'),
 (5,
  '0.094*"bike" + 0.028*"tire" + 0.021*"easy" + 0.021*"ride" + 0.020*"bar" + 0.017*"mile" + 0.016*"seat" + 0.015*"great" + 0.014*"tube" + 0.012*"mountain"'),
 (

In [None]:
## 2013 - 2015

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
after['reviews']=after['reviews'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after['reviews']=after['reviews'].apply(remove_stopwords)


In [None]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']):
       output = []
       for sent in texts:
                doc = nlp(sent)
                output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

In [None]:
text_list=after['reviews'].tolist()
print(text_list[1])
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

thick enough cant see long sure check dimensions ended cutting shorterthis thick enough cant see 
['thick', 'long', 'sure', 'check', 'dimension', 'thick']


In [None]:
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]

In [None]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix,
                id2word=dictionary,
                num_topics=10,
                random_state=100,
                chunksize=1000,
                passes=50,
                iterations=100)

In [None]:
lda_model.print_topics(10)

[(0,
  '0.071*"knife" + 0.038*"glove" + 0.038*"gun" + 0.034*"ball" + 0.028*"hand" + 0.024*"sharp" + 0.023*"blade" + 0.022*"good" + 0.019*"belt" + 0.018*"great"'),
 (1,
  '0.066*"size" + 0.064*"fit" + 0.043*"small" + 0.033*"large" + 0.027*"short" + 0.026*"comfortable" + 0.024*"nice" + 0.022*"big" + 0.021*"perfect" + 0.020*"tight"'),
 (2,
  '0.047*"bright" + 0.035*"sight" + 0.032*"shirt" + 0.025*"shoe" + 0.022*"kid" + 0.019*"cover" + 0.017*"replacement" + 0.017*"rifle" + 0.017*"front" + 0.015*"arrow"'),
 (3,
  '0.030*"warm" + 0.028*"time" + 0.019*"band" + 0.019*"battery" + 0.015*"cap" + 0.013*"first" + 0.013*"tube" + 0.012*"week" + 0.012*"lightweight" + 0.012*"towel"'),
 (4,
  '0.031*"water" + 0.029*"year" + 0.026*"great" + 0.026*"bottle" + 0.024*"old" + 0.014*"good" + 0.013*"time" + 0.012*"target" + 0.011*"use" + 0.011*"son"'),
 (5,
  '0.086*"good" + 0.064*"price" + 0.064*"quality" + 0.048*"great" + 0.041*"star" + 0.026*"money" + 0.022*"grip" + 0.018*"high" + 0.017*"nice" + 0.015*"cheap

In [None]:
#sample

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
process_reviews_sampled['reviews']=process_reviews_sampled['reviews'].apply(remove_stopwords)

In [None]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']):
       output = []
       for sent in texts:
                doc = nlp(sent)
                output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

In [None]:
text_list=process_reviews_sampled['reviews'].tolist()
print(text_list[1])
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

fits maxpedition falcon ii holds water like accurate description amazon give low rating come dust cap already installedbuilt ini buy separately feel camelbak make money hate greedit holds water like
['water', 'accurate', 'description', 'low', 'rating', 'dust', 'cap', 'ini', 'camelbak', 'money', 'hate', 'greedit', 'water']


In [None]:
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]

In [None]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix,
                id2word=dictionary,
                num_topics=10,
                random_state=100,
                chunksize=1000,
                passes=50,
                iterations=100)

In [None]:
lda_model.print_topics(10)

[(0,
  '0.032*"easy" + 0.030*"gun" + 0.020*"scope" + 0.019*"holster" + 0.019*"good" + 0.018*"sight" + 0.017*"clean" + 0.016*"range" + 0.015*"rifle" + 0.015*"target"'),
 (1,
  '0.077*"water" + 0.049*"bottle" + 0.022*"tire" + 0.016*"top" + 0.015*"room" + 0.015*"cap" + 0.015*"mile" + 0.015*"inch" + 0.013*"backpack" + 0.013*"tube"'),
 (2,
  '0.073*"good" + 0.062*"great" + 0.061*"product" + 0.055*"price" + 0.051*"quality" + 0.017*"excellent" + 0.015*"nice" + 0.012*"strong" + 0.012*"high" + 0.011*"work"'),
 (3,
  '0.080*"bike" + 0.039*"grip" + 0.038*"sturdy" + 0.034*"tool" + 0.024*"sharp" + 0.022*"solid" + 0.017*"seat" + 0.017*"kit" + 0.016*"bar" + 0.016*"clip"'),
 (4,
  '0.040*"old" + 0.037*"year" + 0.027*"comfortable" + 0.027*"sock" + 0.026*"glove" + 0.024*"warm" + 0.021*"pair" + 0.020*"color" + 0.017*"son" + 0.017*"shirt"'),
 (5,
  '0.052*"fit" + 0.050*"small" + 0.049*"size" + 0.038*"great" + 0.035*"perfect" + 0.035*"knife" + 0.028*"big" + 0.027*"large" + 0.026*"good" + 0.024*"little"'),


In [None]:
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
vis

  default_term_info = default_term_info.sort_values(
  -13.51983232]
 [-14.5925232  -14.58750121 -14.40312398 ... -13.60918516 -13.67080549
  -13.51383097]
 [ -5.9229034  -14.59494209 -14.41771546 ... -13.6171076  -13.6717497
  -13.52049366]
 ...
 [-14.57561075 -14.5950509  -14.39171327 ... -13.58607696 -13.67184984
  -13.49984466]
 [-14.57561075 -14.5950509  -14.39171327 ... -13.58607696 -13.67184984
  -13.49984466]
 [-14.57561075 -14.5950509  -14.39171327 ... -13.58607696 -13.67184984
  -13.51983232]
 [-14.5925232  -14.58750121 -14.40312398 ... -13.60918516 -13.67080549
  -13.51383097]
 [ -5.9229034  -14.59494209 -14.41771546 ... -13.6171076  -13.6717497
  -13.52049366]
 ...
 [-14.57561075 -14.5950509  -14.39171327 ... -13.58607696 -13.67184984
  -13.49984466]
 [-14.57561075 -14.5950509  -14.39171327 ... -13.58607696 -13.67184984
  -13.49984466]
 [-14.57561075 -14.5950509  -14.39171327 ... -13.58607696 -13.67184984
  -13.51983232]
 [-14.5925232  -14.58750121 -14.40312398 ... -13.6091

# Word2Vec

In [None]:
from gensim.models.word2vec import Word2Vec
# import libraries
import nltk
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns

# spacy for basic preprocessing, optional, can use nltk as well (lemmatisation etc.)
import spacy

# gensim for LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

!pip install pyLDAvis
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
#from pyLDAvis import gensim_models as pg
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import pandas as pd

In [None]:
model = Word2Vec(sentences=process_reviews_short_reviews['review_processed'].tolist(), vector_size=200, sg=1,min_count=5,window=5,workers=90,seed=10,epochs=128)
model.wv.vectors.shape
#vector_size (int, optional) – Dimensionality of the word vectors.
#min_count (int, optional) – Ignores all words with total frequency lower than this.
#window (int, optional) – Maximum distance between the current and predicted word within a sentence.
#workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
#sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
#epochs (int, optional) – Number of iterations (epochs) over the corpus. (Formerly: iter)

#The meaning of most of the parameters are beyond the scope of this class. If interested, please check the official documentations: https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
model.save('w2v_dr.w2v')

In [None]:
model=Word2Vec.load('w2v_dr.w2v')

In [None]:
vocab = model.wv.index_to_key

In [None]:
len(vocab)

In [None]:
model.wv.most_similar('footwear','use', topn=10)

In [None]:
model.wv.vectors.shape

In [None]:
outdata=pd.DataFrame(model.wv.vectors)

In [None]:
outdata

In [None]:
outdata.to_csv('social.tsv',sep='\t',index=False,header=False)

In [None]:
pd.DataFrame(model.wv.index_to_key).to_csv('social project.tsv',sep='\t',index=False,header=False)