In [None]:
pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.1 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993241 sha256=65ed6daf97646faae38450859447a597373f2dcd66e1de0d8ea89e7359341909
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy

# Extend the module search path with the local ./utils directory so that the
# project helper imported on the next line can be resolved.
import sys
sys.path.append('./utils')
from utils import review_feature
# Single shared instance of the project's feature helper; the feature-engineering
# loop later in the notebook calls its scoring methods on every review.
# NOTE(review): the constructor's cost and side effects are not visible here — confirm.
rf = review_feature()

from pandas_profiling import ProfileReport

In [None]:
# Load the labelled reviews and order the rows by product name.
# `ignore_index=True` renumbers the rows 0..n-1 after sorting, so the index
# labels used later follow the sorted order.
df = (
    pd.read_csv('train.csv')
    .sort_values(by=['product'], ignore_index=True)
)
df

Unnamed: 0,product,answer_option,label
0,Accucheck,Fast and accurate delivery,0
1,Accucheck,Expected a longer expiry date. Your Product Li...,0
2,Accucheck,I liked the prompt service,0
3,Accucheck,Good product,0
4,Accucheck,I not needed,0
...,...,...,...
1671,shampoo,Liked it very nicely working now my scalp is a...,1
1672,shampoo,It's my regular choice,0
1673,shampoo,Works well with my hair oil to decrease dandruff,1
1674,shampoo,It really helps to relieve dandruff and itching,1



**Noun Strength (Rn)**: Nouns name the subjects of a sentence and are considered the most informative part of a language. The share of nouns indicates how important a review is, because the nouns describe the prime factors of the review (they tell us what the review is about). We used POS tagging to find the nouns in a review and computed the score as:
Score(Rn) = TFIDF(noun) / TFIDF(all words)

**Review Polarity (Rp):** Its value lies between -1 and +1 and tells whether a review expresses positive or negative sentiment.

**Review Subjectivity (Rs):** Subjectivity measures where the sentiment falls on the scale from objective to subjective and ranges from 0 to 1. Objective expressions are facts, while subjective expressions are opinions that describe a person’s feelings. Consider the following expressions:
Bournvita tastes very good with milk: Subjective
Bournvita is brown in color: Objective

**Review Complexity (Rc):** Evaluates how good and complex a review is in terms of the unique words within the review relative to the unique words across the entire review corpus of a particular product. Rc = Number of unique words in a Review / Number of unique words in entire Corpus

**Review Word Length (Rw):** Word count of a Review



**Compound Score (Rsc):** To improve the efficiency of the system, we compute the compound score using VaderSentimentAnalyser. This library is taken from VADER (Valence Aware Dictionary and sEntiment Reasoner), a lexicon- and rule-based sentiment analysis tool that is specifically tuned to determine sentiments expressed in social media content. It is able to score the sentiment of slang (e.g. SUX!), emoji (😩, 😂) and emoticons ( :), :D ), and it distinguishes capitalized from lower-case expressions ("I am SAD" and "I am sad" are treated as different expressions).
Rsc ≥ 0.5 (Positive Sentiment)
-0.5 < Rsc < +0.5 (Neutral Sentiment)
Rsc≤ -0.5 (Negative Sentiment)

In [None]:
# Create one float column per engineered feature, initialised to 0.0, so the
# feature-engineering loop can fill the values in row by row.
# NOTE(review): the saved output of the next cell also shows an 'Rd' column that
# this cell does not create — confirm whether its initialisation was dropped.
for feature_column in ('Rn', 'Rp', 'Rs', 'Rc', 'Rsc'):
    df[feature_column] = 0.0

In [None]:
# Display the frame to confirm the feature columns exist and start at 0.0.
df


Unnamed: 0,product,answer_option,label,Rn,Rp,Rs,Rc,Rd,Rsc
0,Accucheck,Fast and accurate delivery,0,0.0,0.0,0.0,0.0,0.0,0.0
1,Accucheck,Expected a longer expiry date. Your Product Li...,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Accucheck,I liked the prompt service,0,0.0,0.0,0.0,0.0,0.0,0.0
3,Accucheck,Good product,0,0.0,0.0,0.0,0.0,0.0,0.0
4,Accucheck,I not needed,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1671,shampoo,Liked it very nicely working now my scalp is a...,1,0.0,0.0,0.0,0.0,0.0,0.0
1672,shampoo,It's my regular choice,0,0.0,0.0,0.0,0.0,0.0,0.0
1673,shampoo,Works well with my hair oil to decrease dandruff,1,0.0,0.0,0.0,0.0,0.0,0.0
1674,shampoo,It really helps to relieve dandruff and itching,1,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Distinct product names, in order of first appearance in the frame; the
# per-product feature computation iterates over this array.
product_list = df.loc[:, 'product'].unique()
product_list



array(['Accucheck', 'Becadexamin', 'Evion', 'Neurobion',
       'SevenseascodLiverOil', 'Shelcal', 'Supradyn', 'shampoo'],
      dtype=object)

In [None]:
# Fill in the per-review feature columns one product at a time, because Rc (and
# the noun score Rn) are computed relative to the reviews of a single product.
for product in product_list:
    data = df[df['product'] == product]

    # Vocabulary of this product's corpus: lower-cased, whitespace-split tokens.
    # `update` grows the set in place instead of re-copying it for every review.
    unique_bag = set()
    for review in data['answer_option']:
        unique_bag.update(review.lower().split())
    print(unique_bag)

    # Loop-invariant denominator for Rc, hoisted out of the per-review loop.
    corpus_vocab_size = float(len(unique_bag))

    for indx in data.index:
        review = data.at[indx, 'answer_option']
        df.at[indx, 'Rp'] = rf.polarity_sentiment(review)
        # Fixed: the attribute name previously contained a stray space
        # ("subjectivi ty_sentiment"), which made the whole cell a SyntaxError.
        df.at[indx, 'Rs'] = rf.subjectivity_sentiment(review)

        df.at[indx, 'Rsc'] = rf.slang_emoji_polarity_compoundscore(review)

        # Rc = unique words in the review / unique words in the product corpus.
        # The review is tokenised exactly like the corpus vocabulary (lower-cased)
        # so that numerator and denominator count the same kind of token.
        review_vocab_size = float(len(set(review.lower().split())))
        # An empty vocabulary (only blank reviews) would otherwise divide by zero.
        df.at[indx, 'Rc'] = review_vocab_size / corpus_vocab_size if corpus_vocab_size else 0.0

    # Noun score is computed over all of the product's reviews in one call.
    # NOTE(review): assumes noun_score returns one value per input review, in
    # input order, on an object exposing `.values` — confirm against the helper.
    df.loc[df['product'] == product, 'Rn'] = rf.noun_score(data['answer_option'].values).values

{'email', 'torn', 'e5', 'external', 'price', 'utter', 'keep', 'found', 'pathetic', 'a', '120', 'machine', 'right', 'user,', 'clear', 'time.', 'satisfaction.', 'maximum', 'reason', 'requests.', 'trusted', 'reasonable', 'along', 'glucometer', 'returned', 'appreciate...', 'prompt', 'medicine', 'try', 'mentioned', 'simplified', 'recording', 'test', 'courier', 'package', 'prompt.staff', 'refund', 'delivery,packing', 'were', 'in', "wasn't", 'usual.', 'downloaded.intimation', 'getting', '....fast', 'offer.', 'outdated', 'approx', 'communication', 'local', 'delivered.', 'remedy', 'too.', 'item', 'monitoring', 'we', 'genuine', 'supply', 'least', 'paytm,as', 'fine.', 'while', 'strip.', 'going', 'insert', 'yr.', 'delivery', 'active', 'long', 'physiable', 'information', 'little', 'levels', 'sla,', 'energy.', 'market', 'box.', 'packet.', 'year', 'either', 'work', 'comfortable', 'contains', 'promt', 'validity.', 'seconds.', 'division', 'online.', 'facilities', 'to.use', 'rs', 'high.', 'too', 'recent

In [None]:
# Display the frame again to inspect the feature values written by the loop above.
df

Unnamed: 0,product,answer_option,label,Rn,Rp,Rs,Rc,Rd,Rsc
0,Accucheck,Fast and accurate delivery,0,0.234180,0.30,0.616667,0.004854,0.0,0.0000
1,Accucheck,Expected a longer expiry date. Your Product Li...,0,0.399916,-0.10,0.400000,0.015777,0.0,0.0000
2,Accucheck,I liked the prompt service,0,0.319806,0.60,0.800000,0.006068,0.0,0.4215
3,Accucheck,Good product,0,0.546220,0.70,0.600000,0.002427,0.0,0.4404
4,Accucheck,I not needed,0,0.000000,0.00,0.000000,0.003641,0.0,0.0000
...,...,...,...,...,...,...,...,...,...
1671,shampoo,Liked it very nicely working now my scalp is a...,1,0.165386,0.69,0.900000,0.024609,0.0,0.5709
1672,shampoo,It's my regular choice,0,0.500000,0.00,0.076923,0.008949,0.0,0.0000
1673,shampoo,Works well with my hair oil to decrease dandruff,1,0.565045,0.00,0.000000,0.020134,0.0,0.2732
1674,shampoo,It really helps to relieve dandruff and itching,1,0.134413,0.20,0.200000,0.017897,0.0,0.6865
