In [11]:
import re
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.decomposition import NMF

In [3]:
!pip install vaderSentiment



In [4]:
## Importing data csv file

df = pd.read_csv('Musical_instruments_reviews.csv')

df.reviewText

0        Not much to write about here, but it does exac...
1        The product does exactly as it should and is q...
2        The primary job of this device is to block the...
3        Nice windscreen protects my MXL mic and preven...
4        This pop filter is great. It looks and perform...
                               ...                        
10256              Great, just as expected.  Thank to all.
10257    I've been thinking about trying the Nanoweb st...
10258    I have tried coated strings in the past ( incl...
10259    Well, MADE by Elixir and DEVELOPED with Taylor...
10260    These strings are really quite good, but I wou...
Name: reviewText, Length: 10261, dtype: object

In [5]:
## Cleaning review section

## Removing capital letters

df.reviewText = df.reviewText.str.lower()

## Removing punctuation
df.reviewText = df.reviewText.map(lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', str(x)))
df['reviewText'] = df['reviewText'].map(str)


In [9]:
data = list(df.reviewText)
cv = CountVectorizer(stop_words = 'english', min_df = 0.001)
doc_term = cv.fit_transform(data)

doc_term_df = pd.DataFrame(doc_term.toarray(), columns=cv.get_feature_names())
doc_term_df



Unnamed: 0,00,000,02,09,10,100,1000,10s,11,12,...,yesterday,yeti,young,youtube,zero,zip,zipper,zippers,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10257,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10258,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10259,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0


In [14]:
nmf = NMF(2, init = 'nndsvda')
nmf.fit(doc_term)

nmf

NMF(init='nndsvda', n_components=2)

In [15]:
topic_term = nmf.components_.round(3)
topic_term.shape

(2, 3643)

In [17]:
topic_term_df = pd.DataFrame(topic_term.round(3),
                            index = ['component_1', 'component2'],
                            columns = cv.get_feature_names())
topic_term_df



Unnamed: 0,00,000,02,09,10,100,1000,10s,11,12,...,yesterday,yeti,young,youtube,zero,zip,zipper,zippers,zone,zoom
component_1,0.134,0.02,0.024,0.013,0.365,0.415,0.019,0.0,0.069,0.251,...,0.002,0.094,0.017,0.117,0.082,0.004,0.004,0.0,0.034,0.095
component2,0.047,0.014,0.0,0.007,0.217,0.087,0.026,0.018,0.072,0.208,...,0.021,0.011,0.022,0.034,0.022,0.002,0.015,0.012,0.009,0.012


In [21]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print('\nTopic ', ix + 1)
        else:
            print('\nTopic: ', topic_names[ix])
        print(', '.join([feature_names[i] for i in topic.argsort()[:-no_top_words -1 : -1]]))
    print('\n')
    return model, feature_names, no_top_words

In [22]:
output = display_topics(nmf, cv.get_feature_names(), 10)
output


Topic  1
amp, pedal, sound, like, use, great, just, good, tone, pedals

Topic  2
guitar, strings, just, string, like, good, sound, use, great, guitars




(NMF(init='nndsvda', n_components=2),
 ['00',
  '000',
  '02',
  '09',
  '10',
  '100',
  '1000',
  '10s',
  '11',
  '12',
  '120',
  '12ax7',
  '13',
  '14',
  '15',
  '150',
  '16',
  '17',
  '18',
  '19',
  '1980',
  '1st',
  '1x12',
  '20',
  '200',
  '2000',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '21',
  '22',
  '24',
  '25',
  '250',
  '27',
  '28',
  '2i2',
  '2nd',
  '2x',
  '2x12',
  '30',
  '300',
  '32',
  '335',
  '34',
  '35',
  '3rd',
  '40',
  '400',
  '42',
  '45',
  '46',
  '48',
  '49',
  '4th',
  '4x12',
  '50',
  '500',
  '57',
  '58',
  '5mm',
  '5th',
  '60',
  '600',
  '60mm',
  '62',
  '64',
  '65',
  '6th',
  '70',
  '75',
  '80',
  '802',
  '808',
  '80s',
  '8217',
  '88',
  '90',
  '95',
  '99',
  '9v',
  'aa',
  'aaa',
  'ability',
  'able',
  'ableton',
  'absolute',
  'absolutely',
  'abuse',
  'abused',
  'ac',
  'ac30',
  'accept',
  'acceptable',
  'access',
  'accessible',
  'accessories',
  'accessory',
  'accidentally',
  'accommodate',

In [24]:
display_topics(nmf, cv.get_feature_names(), 5, ['sound', 'instrument'])


Topic:  sound
amp, pedal, sound, like, use

Topic:  instrument
guitar, strings, just, string, like




(NMF(init='nndsvda', n_components=2),
 ['00',
  '000',
  '02',
  '09',
  '10',
  '100',
  '1000',
  '10s',
  '11',
  '12',
  '120',
  '12ax7',
  '13',
  '14',
  '15',
  '150',
  '16',
  '17',
  '18',
  '19',
  '1980',
  '1st',
  '1x12',
  '20',
  '200',
  '2000',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '21',
  '22',
  '24',
  '25',
  '250',
  '27',
  '28',
  '2i2',
  '2nd',
  '2x',
  '2x12',
  '30',
  '300',
  '32',
  '335',
  '34',
  '35',
  '3rd',
  '40',
  '400',
  '42',
  '45',
  '46',
  '48',
  '49',
  '4th',
  '4x12',
  '50',
  '500',
  '57',
  '58',
  '5mm',
  '5th',
  '60',
  '600',
  '60mm',
  '62',
  '64',
  '65',
  '6th',
  '70',
  '75',
  '80',
  '802',
  '808',
  '80s',
  '8217',
  '88',
  '90',
  '95',
  '99',
  '9v',
  'aa',
  'aaa',
  'ability',
  'able',
  'ableton',
  'absolute',
  'absolutely',
  'abuse',
  'abused',
  'ac',
  'ac30',
  'accept',
  'acceptable',
  'access',
  'accessible',
  'accessories',
  'accessory',
  'accidentally',
  'accommodate',