# Significant Collocations

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config

from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

tqdm.pandas()
set_config(display="diagram")

In [2]:
df = pd.read_csv('../input/amazon-kindle-book-review-for-sentiment-analysis/all_kindle_review .csv', usecols=['reviewText', 'rating']).fillna('')
print(df.shape)
df.head()

(12000, 2)


Unnamed: 0,rating,reviewText
0,3,"Jace Rankin may be short, but he's nothing to ..."
1,5,Great short read. I didn't want to put it dow...
2,3,I'll start by saying this is the first of four...
3,3,Aggie is Angela Lansbury who carries pocketboo...
4,4,I did not expect this type of book to be in li...


In [3]:
class SignificantCollocations(BaseEstimator, TransformerMixin):
    def __init__(self, ngram_class=QuadgramCollocationFinder, metric=QuadgramAssocMeasures.pmi):
        self.ngram_class = ngram_class
        self.metric = metric
    
    def fit(self, X, y=None):
        ngrams = self.ngram_class.from_documents(X)
        self.scored_ = dict(ngrams.score_ngrams(self.metric))
        return self
    
    def transform(self, X, y=None):
        for x in tqdm(X):
            ngrams = self.ngram_class.from_words(x)
            yield {
                ngram: self.scored_.get(ngram, 0.0) for ngram in ngrams.nbest(QuadgramAssocMeasures.raw_freq, 50)
            }
    

In [4]:
get_model = lambda: Pipeline([
    ('vectorizer', Pipeline([('extractor', SignificantCollocations()), ('vectorizer', DictVectorizer())])),
    ('estimator', LogisticRegression(max_iter=100_000, random_state=19, class_weight="balanced"))
])
get_model()

In [5]:
x = df['reviewText']
y = df['rating']

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=19)
scores = []
for train_index, valid_index in tqdm(skf.split(x, y), total=10):
    x_train, x_valid = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    model = get_model().fit(x_train, y_train)
    scores.append(model.score(x_valid, y_valid))
print(np.mean(scores))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/10800 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

0.3744166666666667
