In [None]:
# import sys
import argparse
from functools import partial

import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import ngrams as nltk_ngrams
nltk.download('punkt')

In [None]:
# Sample sentence shared by the n-gram demo cells.
text = 'The latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations.'
words = word_tokenize(text)  # word-level tokens of the sample sentence

In [None]:
def ngrams(words, n=2):
    """
    Generate every contiguous run of ``n`` items from ``words`` as a tuple.

    e.g. [w0, w1, w2, w3] with n=2 -> (w0, w1), (w1, w2), (w2, w3)
    """
    window_count = len(words) - n + 1
    start = 0
    while start < window_count:
        yield tuple(words[start:start + n])
        start += 1

# Symbols inserted before the first and after the last token of a sequence.
LPAD_SYMBOL = "<s>"
RPAD_SYMBOL = "</s>"

# Rebind nltk_ngrams to a partial that always pads both ends of the sequence
# with the symbols above, so callers only pass the sequence and n.
nltk_ngrams = partial(nltk_ngrams,
    pad_right=True, pad_left=True,
    right_pad_symbol=RPAD_SYMBOL, left_pad_symbol=LPAD_SYMBOL
    )

def ngrams2(text, n=2):
    """
    Split raw ``text`` into sentences, tokenize each sentence into words and
    yield the n-grams that ``nltk_ngrams`` produces, sentence by sentence.
    """
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        yield from nltk_ngrams(tokens, n)
            
def ngrams3(words, n=2):
    """
    Yield the n-grams that ``nltk_ngrams`` produces for an already tokenized
    sequence of ``words``.
    """
    yield from nltk_ngrams(words, n)

In [None]:
# Bigrams of the sample tokens from the hand-written generator.
for ngram in ngrams(words, n=2):
    print(ngram)

In [None]:
# ngrams2 sentence-splits and tokenizes its input itself, so it must be given
# the raw string `text`, not the pre-tokenized `words` list.
for ngram in ngrams2(text, n=2):
    print(ngram)

In [None]:
# Trigrams from the raw text; ngrams2 splits sentences and tokenizes internally.
for ngram in ngrams2(text, n=3):
    print(ngram)

In [None]:
# Trigrams from the already tokenized word list.
for ngram in ngrams3(words, n=3):
    print(ngram)

In [8]:
import nltk
import json
import pickle

from nltk.tokenize import WordPunctTokenizer

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

# Relative corpus paths of the form "<category>/<hex id>.json" that do not
# start with a dot; the category part may contain lower-case letters,
# underscores and whitespace.
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
# Same layout, for the pickled documents.
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
# Captures the leading directory name of a file id as its category.
CAT_PATTERN = r'([a-z_\s]+)/.*'


class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Corpus reader for documents stored as pickle files, one document per file.

    Each unpickled document is iterated as paragraphs, each paragraph as
    sentences, and each sentence as (token, tag) tuples.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # The keyword dict is handed over as a single positional argument,
        # matching NLTK's CategorizedCorpusReader.__init__(self, kwargs).
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle')

    def _resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.

        Raises ValueError when both ``fileids`` and ``categories`` are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def feeds(self):
        """
        Load and return the parsed contents of ``feeds.json`` from the corpus.
        """
        data = self.open('feeds.json')
        try:
            return json.load(data)
        finally:
            # Always release the stream, even when the JSON is malformed.
            data.close()

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to achieve memory safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        # NOTE: unpickling can execute arbitrary code; only read corpora
        # that come from a trusted source.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            for paragraph in doc:
                yield paragraph

    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids, categories):
            for sentence in paragraph:
                yield sentence

    def tagged(self, fileids=None, categories=None):
        """
        Returns a generator of the individual items of every sentence,
        i.e. the (token, tag) tuples.
        """
        for sent in self.sents(fileids, categories):
            for token in sent:
                yield token

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of tokens; the tag of each (token, tag) tuple
        is dropped.
        """
        for sentence in self.sents(fileids, categories):
            for token, _ in sentence:
                yield token

    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.

        Keys: ``words`` (total token count), ``vocab`` (distinct token count)
        and ``lexdiv`` (words / vocab, or 0.0 when the corpus is empty).
        """
        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            for sent in para:
                for word, _ in sent:
                    counts['words'] += 1  # total word count
                    tokens[word] += 1  # incidents of each word

        word_total = counts['words']
        vocab_size = len(tokens)

        # Return data structure with information
        return {
            'words':  word_total,  # word count
            'vocab':  vocab_size,  # unique word count
            # Guard the ratio so an empty selection does not divide by zero.
            'lexdiv': float(word_total) / float(vocab_size) if vocab_size else 0.0,
        }

if __name__ == '__main__':
    from collections import Counter

    # Tally every token of the corpus, then report the number of distinct
    # tokens followed by the total number of tokens.
    corpus = PickledCorpusReader('corpus')
    words = Counter(corpus.words())

    summary = "{:,} vocabulary {:,} word count"
    print(summary.format(len(words), sum(words.values())))

58,748 vocabulary 1,624,862 word count


# grammar

In [39]:
import re
import nltk

# Toy context-free grammar: bare symbols are non-terminals, quoted strings are
# terminal words, and `|` separates alternative expansions.
GRAMMAR = """
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen' | 'George'
    V -> 'looks' | 'burns'
    P -> 'in' | 'for'
    DT -> 'the'
    N -> 'castle' | 'ocean'
    """

# Parse the grammar text into an nltk CFG object.
cfg = nltk.CFG.fromstring(GRAMMAR)

print(cfg)  # summary line plus one production per line
print(cfg.start())  # start symbol
print(cfg.productions())  # list of all productions

Grammar with 13 productions (start state = S)
    S -> NNP VP
    VP -> V PP
    PP -> P NP
    NP -> DT N
    NNP -> 'Gwen'
    NNP -> 'George'
    V -> 'looks'
    V -> 'burns'
    P -> 'in'
    P -> 'for'
    DT -> 'the'
    N -> 'castle'
    N -> 'ocean'
S
[S -> NNP VP, VP -> V PP, PP -> P NP, NP -> DT N, NNP -> 'Gwen', NNP -> 'George', V -> 'looks', V -> 'burns', P -> 'in', P -> 'for', DT -> 'the', N -> 'castle', N -> 'ocean']


# collocation

In [6]:
from operator import itemgetter
from collections import defaultdict

from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

In [7]:
def rank_quadgrams(corpus, metric, path=None):
    """
    Score every quadgram found in ``corpus.words()`` with ``metric``.

    When ``path`` is empty or omitted the scored list is returned. Otherwise a
    tab-separated report (a header row, then one quadgram and its score per
    line) is written to ``path`` and nothing is returned.
    """
    # Collocation finder built from the corpus word stream.
    finder = QuadgramCollocationFinder.from_words(corpus.words())

    # (quadgram, score) pairs as ranked by the association metric.
    scored = finder.score_ngrams(metric)

    if not path:
        return scored

    with open(path, 'w') as report:
        report.write("Collocation\tScore ({})\n".format(metric.__name__))
        for quadgram, value in scored:
            report.write("{}\t{}\n".format(repr(quadgram), value))

In [None]:


corpus = PickledCorpusReader('corpus_small')
# score_ngrams needs a scoring function, not the measures class itself, so
# pass one of its association measures.
rank_quadrams_small = rank_quadgrams(corpus, QuadgramAssocMeasures.likelihood_ratio)



    # # Group quadgrams by first word
    # prefixed = defaultdict(list)
    # for key, score in scored:
    #     prefixed[key[0]].append((key[1:], score))
    #
    # # Sort keyed quadgrams by strongest association
    # for key in prefixed:
    #     prefixed[key].sort(key=itemgetter(1), reverse=True)

In [None]:
rank_quadrams_small[1: 10]

In [None]:
rank_quadrams_small[-10:-1]

# transformers

In [29]:
from nltk import ne_chunk
from itertools import groupby
from nltk.corpus import wordnet as wn
from nltk.chunk import tree2conlltags
from nltk.probability import FreqDist
from nltk.chunk.regexp import RegexpParser
from unicodedata import category as unicat
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

In [30]:
# Chunk grammar for keyphrases (label KT): an optional prefix of adjectives,
# nouns and one <IN> token, followed by any adjectives and at least one noun.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
# Adjective and noun part-of-speech tags.
GOODTAGS = frozenset(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])
# Named-entity labels that EntityExtractor keeps by default.
GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

In [31]:
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.

    Each document is turned into the list of keyphrases found by chunking
    its tagged sentences with a regular-expression grammar.
    """
    def __init__(self, grammar=GRAMMAR):
        # Use the grammar the caller supplied; the module-level GRAMMAR is
        # only the default.
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.

        Returns a list of (lowercased token, tag) tuples.
        """
        def is_punct(word):
            # True when every character is in a Unicode punctuation category.
            return all(unicat(char).startswith('P') for char in word)

        return [(t[0].lower(), t[1]) for t in sent if not is_punct(t[0])]

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent:
                    continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                # Each chunk item ends with its chunk tag. Consecutive items
                # whose tag is not 'O' are grouped and joined into one phrase.
                for in_chunk, group in groupby(chunks, lambda term: term[-1] != 'O'):
                    if in_chunk:
                        yield " ".join(word for word, pos, chunk in group).lower()

    def fit(self, documents, y=None):
        """No fitting is required; returns self."""
        return self

    def transform(self, documents):
        """Yields, for every document, the list of its keyphrases."""
        for document in documents:
            yield list(self.extract_keyphrases(document))


class EntityExtractor(BaseEstimator, TransformerMixin):
    """
    Turns each document into the list of its named entities whose label is
    in ``labels``.
    """
    def __init__(self, labels=GOODLABELS, **kwargs):
        self.labels = labels

    def get_entities(self, document):
        """
        Run ``ne_chunk`` over every sentence of every paragraph and return
        the lowercased, space-joined words of each accepted entity.
        """
        found = []
        for paragraph in document:
            for sentence in paragraph:
                for node in ne_chunk(sentence):
                    # Only nodes exposing a label are entity candidates.
                    if not hasattr(node, 'label'):
                        continue
                    if node.label() not in self.labels:
                        continue
                    # leaf[0] is the word of each item in the entity.
                    words = [leaf[0].lower() for leaf in node]
                    found.append(' '.join(words))
        return found

    def fit(self, documents, labels=None):
        """No fitting is required; returns self."""
        return self

    def transform(self, documents):
        """Yields the entity list of each document in turn."""
        for doc in documents:
            yield self.get_entities(doc)



    

In [None]:
# NLTK data packages fetched before running the extractors
# (presumably the resources ne_chunk relies on -- confirm).
nltk.download('maxent_ne_chunker')
nltk.download('words')
corpus = PickledCorpusReader('corpus')
docs = corpus.docs()  # generator of unpickled documents

# Keyphrase extraction demo (disabled).
# phrase_extractor = KeyphraseExtractor()
# keyphrases = list(phrase_extractor.fit_transform(docs))
# print(keyphrases[0])

# Entity extraction demo (disabled).
# entity_extractor = EntityExtractor()
# entities = list(entity_extractor.fit_transform(docs))
# print(entities[0])

# Model

In [9]:
import nltk

from math import log
from collections import Counter, defaultdict

from nltk.util import ngrams
from nltk.probability import ProbDistI, FreqDist, ConditionalFreqDist

In [43]:

class NgramCounter(object):  # e.g. NgramCounter(2, vocab)
    """
    The NgramCounter class counts ngrams given a vocabulary and ngram size.

    Count tables filled by ``train_counts``:
      ngrams   -- FreqDist of full n-gram tuples.
      unigrams -- FreqDist of single tokens.
      allgrams -- maps an order k (n down to 2) to a ConditionalFreqDist of
                  context (the k-1 preceding tokens) -> following word.
    """

    def __init__(self, n, vocabulary, unknown="<UNK>"):
        """
        Set up the empty count tables.

        n is the size of the ngram, vocabulary is a container tested with
        ``in``, and unknown is the token substituted for words that are not
        in the vocabulary.

        Raises ValueError if n is smaller than 1.
        """
        if n < 1:
            raise ValueError("ngram size must be greater than or equal to 1")

        self.n = n
        self.unknown = unknown
        self.padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<s>",
            "right_pad_symbol": "</s>"
        }

        self.vocabulary = vocabulary
        # Default value of this dict is an empty ConditionalFreqDist.
        self.allgrams = defaultdict(ConditionalFreqDist)
        self.ngrams = FreqDist()
        self.unigrams = FreqDist()

    def train_counts(self, training_text):
        """
        Update the count tables from ``training_text``, an iterable of
        tokenized sentences such as
        [['Hello', 'Word'], ['Life', 'is', 'short']].
        """
        for sent in training_text:
            # Keep words found in the vocabulary, replace the rest with the
            # unknown token.
            checked_sent = (self.check_against_vocab(word) for word in sent)

            # The context of the first n-gram is added to the unigram counts
            # once per sentence; afterwards only the last word of each n-gram
            # is added, so no token is counted more than once.
            sent_start = True

            for ngram in self.to_ngrams(checked_sent):
                self.ngrams[ngram] += 1
                # The last item is the word, everything before it the context.
                context, word = tuple(ngram[:-1]), ngram[-1]

                if sent_start:
                    for context_word in context:
                        self.unigrams[context_word] += 1
                    sent_start = False

                # Conditional counts for every order from n down to 2, e.g.
                # n=3 -> (window 0, order 3), (window 1, order 2). Each order
                # slices the full context, so order k always keeps exactly
                # the last k-1 context tokens.
                for window, ngram_order in enumerate(range(self.n, 1, -1)):
                    self.allgrams[ngram_order][context[window:]][word] += 1
                self.unigrams[word] += 1

    def check_against_vocab(self, word):
        """Return ``word`` if it is in the vocabulary, else the unknown token."""
        if word in self.vocabulary:
            return word
        return self.unknown

    def to_ngrams(self, sequence):
        """
        Wrapper for NLTK ngrams method
        """
        # The padding options are expanded into keyword arguments.
        return ngrams(sequence, self.n, **self.padding)


def count_ngrams(n, vocabulary, texts):
    """
    Build an NgramCounter of order ``n`` over ``vocabulary``, train it on
    ``texts`` and return it with its unigram / ngram / allgrams tables filled.
    """
    trained = NgramCounter(n, vocabulary)
    trained.train_counts(texts)
    return trained




# Count trigrams over the full corpus.
corpus = PickledCorpusReader('corpus')
# tokens = [''.join(word[0]) for word in corpus.words()]
tokens = list(corpus.words())
vocab = Counter(tokens)  # token -> occurrence count, used as the vocabulary
sents = list([word[0] for word in sent] for sent in corpus.sents())  # keep word[0] of each item in a sentence
trigram_counts = count_ngrams(3, vocab, sents) # unigram / ngram / allgrams (conditional) counts


In [46]:
print(trigram_counts.unigrams)  # FreqDist of single tokens
print(trigram_counts.ngrams)  # FreqDist of trigram tuples
# print(trigram_counts.allgrams[3]) 
# print(sorted(trigram_counts.allgrams[3].conditions())) CondFreqDist
# Words counted after the two-word context ('the', 'President').
print(list(trigram_counts.allgrams[3][('the', 'President')]))

<FreqDist with 58750 samples and 1924458 outcomes>
<FreqDist with 1151450 samples and 1774660 outcomes>
["'", 'Source', 'and', 'nominates', 'in', 'as', 'said', 'is', 'who', 'that', '.', 'are', 'called', 'announced', 'isn', '’', 'for', 'would', 'directly']


In [24]:
corpus = PickledCorpusReader('corpus_small')

class BaseNgramModel(object):
    """
    The BaseNgramModel creates an n-gram language model.
    This base model is equivalent to a Maximum Likelihood Estimation.
    """

    def __init__(self, ngram_counter):
        """
        BaseNgramModel is initialized with an NgramCounter.
        """
        self.n = ngram_counter.n
        self.ngram_counter = ngram_counter
        self.ngrams = ngram_counter.ngrams
        self._check_against_vocab = self.ngram_counter.check_against_vocab

    def check_context(self, context):
        """
        Ensures that the context is not longer than or equal to the model's
        n-gram order.
        Returns the context as a tuple.

        Raises ValueError when the context has n or more items.
        """
        if len(context) >= self.n:
            raise ValueError("Context too long for this n-gram")

        return tuple(context)

    def score(self, word, context):
        """
        For a given string representation of a word, and a string word context,
        returns the maximum likelihood score that the word will follow the
        context: count(context, word) / count(context).

        An empty context is scored against the counter's unigram counts; a
        context that was never counted scores 0.0.
        """
        context = self.check_context(context)
        counter = self.ngram_counter

        if not context:
            return counter.unigrams.freq(word)

        # ``self.ngrams`` maps whole n-gram tuples to plain integer counts, so
        # the per-context distribution has to come from the conditional
        # tables, which are keyed by n-gram order (context length + 1).
        # ``get`` / ``in`` keep the lookup from inserting empty entries.
        conditional = counter.allgrams.get(len(context) + 1)
        if conditional is None or context not in conditional:
            return 0.0
        return conditional[context].freq(word)

    def logscore(self, word, context):
        """
        For a given string representation of a word, and a word context,
        computes the base-2 log probability of this word in this context.
        Returns negative infinity when the score is zero.
        """
        score = self.score(word, context)
        if score == 0.0:
            return float("-inf")

        return log(score, 2)

    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given text represented as a sequence of word tokens.
        This is the negated average log probability of each word in the text.

        Raises ZeroDivisionError when the text produces no n-grams.
        """
        # Map out-of-vocabulary words to the counter's unknown token.
        normed_text = (self._check_against_vocab(word) for word in text)
        entropy = 0.0
        processed_ngrams = 0
        for ngram in self.ngram_counter.to_ngrams(normed_text):
            # Split each n-gram into its context and its final word.
            context, word = tuple(ngram[:-1]), ngram[-1]
            entropy += self.logscore(word, context)
            processed_ngrams += 1

        # Sum of the log scores divided by the number of n-grams, negated.
        return - (entropy / processed_ngrams)

    def perplexity(self, text):
        """
        Given a sequence of word tokens, calculates the perplexity
        of the text: 2 raised to its cross-entropy.
        """
        return pow(2.0, self.entropy(text))

    
# A (context, word) pair may never have occurred in the training corpus. Add-k smoothing handles that case.
# With add-k, every word is treated as if it had been seen k extra times after each context, so an unseen pair gets a count of k instead of 0.
# The special case k=1 is add-one (Laplace) smoothing.

class AddKNgramModel(BaseNgramModel):
    """
    Provides Add-k-smoothed scores.
    """

    def __init__(self, k, *args):
        """
        Expects an input value, k, a number by which
        to increment word counts during scoring.
        """
        super(AddKNgramModel, self).__init__(*args)

        self.k = k
        # Amount added to every context total: k for each vocabulary entry.
        self.k_norm = len(self.ngram_counter.vocabulary) * k

    # Overrides the base scoring with the add-k formula:
    # (count(context, word) + k) / (count(context) + k * |vocabulary|)
    def score(self, word, context):
        """
        With Add-k-smoothing, the score is normalized with
        a k value.
        """
        # The context must be shorter than the n-gram order.
        context = self.check_context(context)
        counter = self.ngram_counter

        # ``self.ngrams`` maps whole n-gram tuples to plain integer counts, so
        # the per-context distribution is read from the conditional tables
        # (keyed by context length + 1), or from the unigram counts for an
        # empty context. ``get`` / ``in`` avoid inserting empty entries.
        if context:
            conditional = counter.allgrams.get(len(context) + 1)
            seen = conditional is not None and context in conditional
            context_freqdist = conditional[context] if seen else None
        else:
            context_freqdist = counter.unigrams

        if context_freqdist is None:
            # Unseen context: both counts are zero, only the k terms remain.
            word_count = context_count = 0
        else:
            word_count = context_freqdist[word]  # count of context + word
            context_count = context_freqdist.N()  # total count of the context
        return (word_count + self.k) / (context_count + self.k_norm)


class LaplaceNgramModel(AddKNgramModel):
    """
    Laplace (add-one) smoothing: the Add-k model with k fixed to 1.
    """
    def __init__(self, *args):
        # Forward everything to the Add-k constructor with k = 1.
        super().__init__(1, *args)
        
        
# BaseNgramModel -> AddK -> Laplace : uses the score overridden in AddK
# BaseNgramModel -> KneserNeyModel : uses the score overridden in KneserNeyModel



class KneserNeyModel(BaseNgramModel):
    """
    Kneser-Ney smoothed model backed by ``nltk.KneserNeyProbDist``.
    """
    def __init__(self, *args):
        # The base class stores the counter and its n-gram FreqDist, which
        # is then handed to the NLTK distribution.
        super().__init__(*args)
        self.model = nltk.KneserNeyProbDist(self.ngrams)

    def score(self, word, context):
        """
        Probability of the trigram formed by the first two context items
        followed by ``word``, as given by the NLTK distribution.
        """
        first, second = context[0], context[1]
        return self.model.prob((first, second, word))

    def samples(self):
        """Samples of the underlying NLTK distribution."""
        return self.model.samples()

    def prob(self, sample):
        """Probability the underlying NLTK distribution assigns to ``sample``."""
        return self.model.prob(sample)

In [25]:
corpus = PickledCorpusReader('corpus_small')
tokens = [''.join(word) for word in corpus.words()] 
vocab = Counter(tokens) # Counter comes from the standard-library collections module
sents = list([word[0] for word in sent] for sent in corpus.sents()) # drop the tags; word[0] is the actual word

counter = count_ngrams(3, vocab, sents) # trained NgramCounter instance (trigrams)
knm = KneserNeyModel(counter)


def complete(input_text):
    """
    Extend ``input_text`` by one word using the module-level ``knm`` model.

    The last two tokens of the input are matched against the first two words
    of every trigram sample of the model; the third word of the most probable
    match is appended. Returns a fixed reply when the input has fewer than
    two tokens or when no trigram matches.
    """
    tokenized = nltk.word_tokenize(input_text)
    if len(tokenized) < 2:
        return "Say more."

    history = (tokenized[-2], tokenized[-1])

    # candidate next word -> probability of the matching trigram
    completions = {
        sample[2]: knm.prob(sample)
        for sample in knm.samples()
        if (sample[0], sample[1]) == history
    }
    if not completions:
        return "Can we talk about something else?"

    # Candidate with the highest probability.
    best = max(completions, key=completions.get)
    return " ".join(tokenized + [best])

print(complete("The President of the United"))
print(complete("This election year will"))

The President of the United States
This election year will mark


# Ngram Language Models 
## Backgrounds
-----
### Significant Collocations
#### nltk에서 significant collocations를 찾기 위해 사용하는 도구들
* `CollocationFinder` :  collect collocation candidate frequencies, filter and rank them
* `NgramAssocMeasures`: generic association measures. Each public method returns a score. (Available methods: chi square, jaccard, likelihood_ratio, mi_like, pmi, poisson_stirling, raw_freq, student_t...)

#### `rank_quadgrams`
* `ngrams = QuadgramCollocationFinder.from_words(corpus.words())`
	* `corpus.words()` -> corpus 내부의 Text가 word tokenization이 끝난 형태로 return
    * `from_words`: `QuadgramCollocationFinder` class 내부의 함수. Construct a QuadgramCollocationFinder for n-grams(n<4) in the given sequence. 
    * `scored = ngrams.score_ngrams(metric)` : `metric`은 `QuadgramAssocMeasures`에서 제공하는 방법 중 하나. `metric`을 기준으로 어떤 ngram이 더 중요한지 점수가 매겨져 `scored`에 저장된다. 
    
``` 
# Full code
def rank_quadgrams(corpus, metric, path=None):
    """
    Find and rank quadgrams from the supplied corpus using the given
    association metric. Write the quadgrams out to the given path if
    supplied otherwise return the list in memory.
    """

    # Create a collocation ranking utility from corpus words.
    ngrams = QuadgramCollocationFinder.from_words(corpus.words())

    # Rank collocations by an association metric
    scored = ngrams.score_ngrams(metric)

    if path:
        with open(path, 'w') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored
```
따라서 
```
rank_quadgrams(
        corpus, QuadgramAssocMeasures.likelihood_ratio, "quadgrams.txt"
    )
```
와 같이 함수를 실행시키면 corpus의 quadgram들이 likelihood_ratio를 기준으로 정렬된 후 "quadgrams.txt" 파일에 순위 순으로 저장된다. 


#### `SignificantCollocations`
* `BaseEstimator, TransformerMixin` : `sklearn` 라이브러리의 기본 클래스. preprocessing 과정에서 pipeline을 커스텀하기 위하여 사용한다. (참고: [BaseEstimator in sklearn.base](https://stackoverflow.com/questions/15233632/baseestimator-in-sklearn-base-python), [SCIKIT LEARN 전처리를 위한 변환기 만들기](https://databuzz-team.github.io/2018/11/11/make_pipeline/))
* `def fit(self, docs, target)` : 문서 형태의 `docs`를 input으로 받아 ngram들을 형성하고 `ngrams`에 저장한다. 그리고 `self._scored_`에 `self.metric`을 기준으로 정렬된 significant collocations(ngrams)를 `dict` 형태로 저장한다. 
* `def transformation(self, docs, target)` : raw_freq가 높은 상위 50개의 ngram에 대하여 ngram과 그 score(fit method 에서 구한 점수)를 dict 형태로 저장한다.

   
      

