##문제1. Tokenizer

In [1]:
import re
word_dict = {}

class Tokenizer():
  """Word-level tokenizer that maps normalized words to integer ids.

  Text is lower-cased, every character other than ``a-z``, ``0-9`` and the
  space is removed, and the remainder is split on whitespace. Id 0 is
  reserved under the key ``'oov'`` and is returned for any token that was
  not seen during ``fit``. Because ``'oov'`` is itself a vocabulary key, the
  literal input word "oov" also maps to id 0.
  """

  def __init__(self):
    # token -> integer id; ids are handed out in first-seen order.
    self.word_dict = {'oov': 0}
    # True once fit() has completed; transform() refuses to run before that.
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Normalize and tokenize each sentence.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of string tokens per input sentence. A sentence
      with no usable characters yields an empty list.
    """
    sequences_processed = []

    for item in sequences:
      item = item.strip().lower()
      item = re.sub("[^a-z0-9 ]", "", item)
      # split() without an argument collapses runs of spaces, so repeated
      # spaces (or punctuation stripped from between two spaces) do not
      # produce empty-string tokens that would pollute the vocabulary.
      sequences_processed.append(item.split())
    return sequences_processed

  def fit(self, sequences):
    """Add every new token found in ``sequences`` to the vocabulary.

    Existing entries are kept, so calling ``fit`` again extends the
    vocabulary rather than resetting it.

    Args:
      sequences: iterable of strings.
    """
    self.fit_checker = False

    for seq in self.preprocessing(sequences):
      for item in seq:
        if item not in self.word_dict:
          # The next free id equals the current vocabulary size.
          self.word_dict[item] = len(self.word_dict)
    self.fit_checker = True

  def transform(self, sequences):
    """Convert sentences to lists of token ids.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of integer ids per sentence; unknown tokens are
      mapped to the ``'oov'`` id.

    Raises:
      Exception: if the tokenizer has not been fitted yet.
    """
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    oov_id = self.word_dict['oov']
    return [
      [self.word_dict.get(item, oov_id) for item in seq]
      for seq in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their id representation."""
    self.fit(sequences)
    return self.transform(sequences)

##문제2. TfidfVectorizer

In [2]:
import math

class TfidfVectorizer:
  """TF-IDF vectorizer built on top of a tokenizer object.

  The tokenizer must provide ``fit_transform(sequences)`` and
  ``transform(sequences)`` returning lists of integer token ids, and a
  ``word_dict`` mapping whose values are those ids. The ids are used
  directly as positions in the idf vector.

  Term frequency is the raw count of a token in a document; inverse
  document frequency is ``log(N / (df + 1))`` where ``N`` is the number of
  fitted documents and ``df`` the number of documents containing the token.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    # True once fit() has completed; transform() refuses to run before that.
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer and compute the idf value of every vocabulary id.

    Args:
      sequences: sized collection of strings (``len()`` is used as N).

    Raises:
      ValueError: if ``sequences`` is empty, since idf is undefined then.
    """
    n_docs = len(sequences)
    if n_docs == 0:
      # log(0 / 1) would otherwise fail with an opaque "math domain error".
      raise ValueError("Cannot fit TfidfVectorizer on an empty corpus.")

    token_result = self.tokenizer.fit_transform(sequences)
    vocab_indices = list(self.tokenizer.word_dict.values())

    # Document frequency: number of documents containing each token id.
    doc_freq = [0] * len(vocab_indices)
    for item in token_result:
      # A set makes each membership test O(1) instead of scanning the list.
      present = set(item)
      for token_idx in vocab_indices:
        if token_idx in present:
          doc_freq[token_idx] += 1

    self.idf_matrix = [math.log(n_docs/(value+1)) for value in doc_freq]

    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF vector of every sentence.

    The intermediate term-frequency rows and the final result are also
    stored on the instance as ``tf_matrix`` and ``tfidf_matrix``.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one list of floats per sentence, one entry per
      vocabulary id in ``word_dict`` order.

    Raises:
      Exception: if the vectorizer has not been fitted yet.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    token_result = self.tokenizer.transform(sequences)
    vocab_indices = list(self.tokenizer.word_dict.values())

    self.tf_matrix = []
    for item in token_result:
      # Count every id in one pass rather than calling list.count per id.
      counts = {}
      for token_idx in item:
        counts[token_idx] = counts.get(token_idx, 0) + 1
      self.tf_matrix.append([counts.get(token_idx, 0) for token_idx in vocab_indices])

    # tf-idf: element-wise product of each tf row with the idf vector.
    self.tfidf_matrix = [
      [value_tf*value_idf for value_tf, value_idf in zip(list_tf, self.idf_matrix)]
      for list_tf in self.tf_matrix
    ]

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF vectors."""
    self.fit(sequences)
    return self.transform(sequences)

In [5]:
# Small demo corpus shared by the cells below.
# NOTE(review): 'coffe' looks like a typo for 'coffee'; it is kept as-is
# because it is a distinct vocabulary entry and changing it would alter the
# recorded outputs.
sentences = [
    'I love apple',
    'I love orange',
    'I love coffee',
    'coffe taste good',
    'I hate nothing',
]

# Fit a fresh tokenizer on the corpus and show the ids of the first sentence.
tokenizer = Tokenizer()
result = tokenizer.fit_transform(sentences)
print(result[0])

[1, 2, 3]


In [6]:
# Start from a fresh tokenizer; TfidfVectorizer.fit_transform fits it itself.
tokenizer = Tokenizer()
tfidf = TfidfVectorizer(tokenizer)
# Reuses the `sentences` corpus defined in the previous cell.
tfidf_result = tfidf.fit_transform(sentences)
# TF-IDF vector of the first sentence, one value per vocabulary id.
print(tfidf_result[0])

[0.0, 0.0, 0.22314355131420976, 0.9162907318741551, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
