## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import spacy
from tqdm import tqdm

### Read reviews data

In [2]:
# Load the raw Samsung reviews into a single string; the cells below split it
# on "\n" to get one review per line.
# (Point the path at "../data/Samsung.txt" if the file lives in the data directory.)
# The `with` block closes the handle even if read() raises, which the manual
# open()/close() pair did not guarantee.
with open("Samsung.txt", 'r', encoding="utf-8") as con:
    samsung_reviews = con.read()

### Can we reduce the time taken?
[Pipelines (Spacy)](https://spacy.io/usage/processing-pipelines)


<img src='./images/spacy_pipeline.png'>

In [3]:
# Shorten pipeline loading and per-review processing: the 'parser' and 'ner'
# components are switched off because the loops in this notebook read only
# tok.pos_ and tok.lemma_.
nlp=spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [5]:
# Timing probe: gather the lowercased lemma of every NOUN token from the
# first 1000 reviews only, with tqdm reporting the elapsed time.
nouns = []
for text in tqdm(samsung_reviews.split("\n")[:1000]):
    # One nlp() call per review; matching lemmas are appended as they are found
    nouns.extend(
        token.lemma_.lower()
        for token in nlp(text)
        if token.pos_ == "NOUN"
    )

100%|██████████| 1000/1000 [00:07<00:00, 135.66it/s]


In [6]:
# Number of newline-separated reviews: splitting on "\n" always yields one more
# piece than there are newline characters, so count them directly.
samsung_reviews.count("\n") + 1

46355

In [7]:
# Rough projection of the seconds needed for all 46355 reviews, scaled up from
# the 1000-review sample run.
# NOTE(review): the factor 6 is presumably the sample's runtime in seconds, but
# the recorded progress bar shows 00:07 — confirm which figure is intended.
(46355/1000)*6

278.13

In [8]:
# Convert the projected ~278 seconds into minutes
278/60

4.633333333333334

### Let's process all the reviews now and see whether the time taken is lower

In [9]:
# Full run: gather the lowercased lemma of every NOUN token across all reviews,
# with tqdm reporting the total elapsed time.
nouns = []
for text in tqdm(samsung_reviews.split("\n")):
    # One nlp() call per review; matching lemmas are appended as they are found
    nouns.extend(
        token.lemma_.lower()
        for token in nlp(text)
        if token.pos_ == "NOUN"
    )

100%|██████████| 46355/46355 [04:37<00:00, 167.03it/s]


### Does the hypothesis of nouns capturing `product features` hold?

In [11]:
# Wrap the collected lemmas in a Series so pandas can tally them; later cells
# call nouns.value_counts(), so the name is rebound to the Series here.
nouns = pd.Series(nouns)
# Frequency table of lemmas, most common first
noun_counts = nouns.value_counts()
# Show the five most frequent nouns
noun_counts.head(5)

Unnamed: 0,count
phone,43507
battery,4334
product,3992
screen,3838
time,3810


In [12]:
# Tally the noun lemmas (most common first) and show the ten most frequent
noun_frequencies = nouns.value_counts()
noun_frequencies.head(10)

Unnamed: 0,count
phone,43507
battery,4334
product,3992
screen,3838
time,3810
card,3384
price,3149
problem,3137
camera,2936
app,2593


### We now know that people mention `battery`, `product`, `screen`, etc., but we still don't know in what context they mention these keywords.

### Summary:
 - The most frequently used lemmatised nouns tell us which product features people are talking about in product reviews.
 - To process the review data faster, spaCy lets us switch off parts of the model's inference pipeline via the `disable` parameter of `spacy.load()`.