# Key Phrase and Entity Extraction

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import spacy
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config

# Show scikit-learn estimators as diagrams when a cell displays them.
set_config(display="diagram")

# Hook tqdm into pandas (enables the progress_* variants of apply/map).
tqdm.pandas()

# Load the spaCy pipeline once; the extractor classes below read this global.
nlp = spacy.load('en_core_web_lg')

In [2]:
# Path is reproduced exactly as given, including the space before ".csv"
# (presumably the dataset's real file name -- verify before "fixing" it).
reviews_csv = '../input/amazon-kindle-book-review-for-sentiment-analysis/all_kindle_review .csv'
wanted_columns = ['reviewText', 'rating']

# Read only the two columns used below, then replace missing values with ''.
df = pd.read_csv(reviews_csv, usecols=wanted_columns)
df = df.fillna('')

print(df.shape)
df.head()

(12000, 2)


Unnamed: 0,rating,reviewText
0,3,"Jace Rankin may be short, but he's nothing to ..."
1,5,Great short read. I didn't want to put it dow...
2,3,I'll start by saying this is the first of four...
3,3,Aggie is Angela Lansbury who carries pocketboo...
4,4,I did not expect this type of book to be in li...


In [3]:
class EntityExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to its named entities.

    Every input text is run through the module-level spaCy pipeline ``nlp``
    and replaced by a single string made of the entity texts joined with
    spaces (an empty string when no entity is found).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return one space-joined entity string per text in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw texts.
        y : ignored

        Returns
        -------
        list of str
            Same length and order as ``X``.
        """
        # nlp.pipe yields a lazy stream, so pass the total explicitly to keep
        # the progress bar sized; generators without a length get no total.
        total = len(X) if hasattr(X, '__len__') else None
        # Batched processing avoids the per-call overhead of nlp(text).
        docs = nlp.pipe(X)
        return [' '.join(ent.text for ent in doc.ents) for doc in tqdm(docs, total=total)]
    
    
class KeyPhaseExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to a list of key phrases.

    Each text is lower-cased, parsed with the module-level spaCy pipeline
    ``nlp`` and reduced to the phrases found by three dependency-based rules:
    subject + verb lemma + direct object, modifier(s) + noun, and
    noun + preposition + following noun(s).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the estimator API."""
        return self

    def __noun_verb_noun(self, doc):
        """Collect "<subject> <verb lemma> <object>" phrases from ``doc``.

        A phrase is only emitted when the verb has both a subject on its left
        (``nsubj``/``nsubjpass`` tagged NOUN, PROPN or PRON) and a direct
        object on its right (``dobj`` tagged NOUN or PROPN).
        """
        phrases = []
        for token in doc:
            if token.pos_ != 'VERB':
                continue
            phrase = ''
            for subject in token.lefts:
                if (subject.dep_ in ['nsubj', 'nsubjpass']) and (subject.pos_ in ['NOUN', 'PROPN', 'PRON']):
                    phrase += subject.text
                    phrase += ' ' + token.lemma_
                    for obj in token.rights:
                        if (obj.dep_ in ['dobj']) and (obj.pos_ in ['NOUN', 'PROPN']):
                            # phrase is not reset between objects: a second
                            # direct object extends the previously emitted one.
                            phrase += ' ' + obj.text
                            phrases.append(phrase)
        return phrases

    def __adjective_noun(self, doc):
        """Collect "<modifiers> <noun>" phrases from ``doc``.

        Considers nouns acting as ``dobj``, ``pobj``, ``nsubj`` or
        ``nsubjpass`` and prefixes them with every child that is an adjective
        or a ``compound`` dependent. Nouns without such children are skipped.
        """
        phrases = []
        for token in doc:
            if (token.pos_ == 'NOUN') and (token.dep_ in ['dobj', 'pobj', 'nsubj', 'nsubjpass']):
                modifiers = ''
                for child in token.children:
                    if (child.pos_ == 'ADJ') or (child.dep_ == 'compound'):
                        modifiers += child.text + ' '
                if modifiers:
                    phrases.append(modifiers + token.text)
        return phrases

    def __prepositions(self, doc):
        """Collect "<noun> <preposition> [<noun> ...]" phrases from ``doc``.

        For every adposition whose head is a NOUN, the phrase is the head
        text, the adposition, and any NOUN/PROPN tokens among the
        adposition's right children.
        """
        phrases = []
        for token in doc:
            if token.pos_ == 'ADP' and token.head.pos_ == 'NOUN':
                phrase = token.head.text + ' ' + token.text
                for right_tok in token.rights:
                    if right_tok.pos_ in ['NOUN', 'PROPN']:
                        phrase += ' ' + right_tok.text
                if len(phrase) > 2:
                    phrases.append(phrase)
        return phrases

    def transform(self, X, y=None):
        """Return, for each text in ``X``, the concatenated phrase lists.

        Parameters
        ----------
        X : iterable of str
            Raw texts; each one is lower-cased before parsing.
        y : ignored

        Returns
        -------
        list of list of str
            One list per input text: subject-verb-object phrases first, then
            modifier-noun phrases, then prepositional phrases.
        """
        results = []
        for text in tqdm(X):
            # Parse once and share the Doc between the three rules instead of
            # running the full spaCy pipeline three times per text.
            doc = nlp(text.lower())
            results.append(
                self.__noun_verb_noun(doc)
                + self.__adjective_noun(doc)
                + self.__prepositions(doc)
            )
        return results

In [4]:
def _identity_tokenizer(tokens):
    """Return ``tokens`` unchanged.

    KeyPhaseExtractor already yields a list of phrases per document, so the
    vectorizer must not split them further. A named function (unlike a
    lambda) keeps the fitted pipeline picklable.
    """
    return tokens


def get_model():
    """Build a fresh, unfitted classification pipeline.

    Features are the union of two TF-IDF style encodings of the raw text:
    one over the named entities and one over the extracted key phrases.
    They feed a logistic-regression classifier.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    entity_features = Pipeline([
        ('extractor', EntityExtractor()),
        ('vectorizer', TfidfVectorizer(use_idf=False, sublinear_tf=True)),
    ])
    phrase_features = Pipeline([
        ('extractor', KeyPhaseExtractor()),
        # token_pattern=None: the pattern is unused once a tokenizer is given;
        # leaving the default makes scikit-learn emit a warning on every fit.
        ('vectorizer', TfidfVectorizer(tokenizer=_identity_tokenizer, token_pattern=None,
                                       sublinear_tf=True, lowercase=False)),
    ])
    return Pipeline([
        ('vectorizer', FeatureUnion([
            ('entities', entity_features),
            ('kphrases', phrase_features),
        ])),
        ('estimator', LogisticRegression(max_iter=100_000, random_state=19)),
    ])


get_model()

In [5]:
# Features are the raw review texts, labels the star ratings.
x = df['reviewText']
y = df['rating']

# Stratified 10-fold cross-validation with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=19)
scores = []
# Size the progress bar from the splitter so it cannot drift from n_splits.
for train_index, valid_index in tqdm(skf.split(x, y), total=skf.get_n_splits()):
    x_train, x_valid = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # A fresh pipeline per fold so nothing fitted leaks between folds.
    model = get_model().fit(x_train, y_train)
    # Pipeline.score delegates to the classifier (mean accuracy on the fold).
    scores.append(model.score(x_valid, y_valid))
print(np.mean(scores))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

0.37124999999999997
