<a href="https://colab.research.google.com/github/Shin-kyoto/www/blob/develop/HW01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 必要なパッケージのインストールとDriveのマウント

In [None]:
# Install the required packages (the leading "!" runs each line as a shell command).
# NOTE(review): gensim is installed here but never imported in the cells visible in this notebook - confirm it is still needed.
!pip install nltk
!pip install gensim

In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import wordnet as wn #lemmatize関数のためのimport
import re

In [None]:
# Download the NLTK data packages used by this notebook.
nltk.download("stopwords")  # English stop-word list, read via nltk.corpus.stopwords in a later cell
nltk.download("wordnet")  # WordNet data behind wn.morphy, used in the lemmatization step
# NOTE(review): no visible cell calls an NLTK tokenizer (tokenization uses str.split) - confirm "punkt" is needed.
nltk.download("punkt")

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change into the Drive folder used by this notebook (relative path, so it depends on the current working directory).
cd drive/My Drive/Colab Notebooks/M1

In [None]:
# List the files in the current working directory.
!ls

# DataFrameの作成


In [None]:
# Load the CSV (path is relative to the current working directory) and display it.
df = pd.read_csv("./nlp_techwords.csv")
df

In [None]:
# Select the rows at positions 3 and 8 and the first two columns, then display them.
# NOTE(review): df2 is not referenced by any later cell visible here.
df2=df.iloc[[3,8],[0,1]]
df2

# 前処理

In [None]:
def preprocessing_text(text, stop_words=None, lemmatizer=None):
  """Clean, tokenize, lemmatize and stop-word-filter one document.

  Args:
    text: Raw document string.
    stop_words: Optional iterable of lower-case words to drop. When omitted,
      the notebook-level ``en_stop`` list is looked up at call time.
    lemmatizer: Optional callable that maps a lower-cased word to its lemma,
      or to ``None`` when it has no lemma. When omitted, ``wn.morphy`` is used.

  Returns:
    List of lower-cased, lemmatized tokens with stop words removed.
  """
  if stop_words is None:
    stop_words = en_stop
  # A set gives O(1) membership tests; it is built once per call.
  stop_words = set(stop_words)
  if lemmatizer is None:
    lemmatizer = wn.morphy

  def cleaning_text(text):
    # Remove '@' and '%' characters.
    text = re.sub(r'@|%', '', text)
    # Remove bracketed numeric markers such as "[12]" or "[1 2]".
    text = re.sub(r'\[[0-9 ]*\]', '', text)
    # Remove parenthesized runs of lower-case letters/spaces, e.g. "(abc def)".
    text = re.sub(r'\([a-z ]*\)', '', text)
    # Remove every remaining digit.
    text = re.sub(r'[0-9]', '', text)
    # Remove any leftover '(' or ')'.
    text = re.sub(r'\(|\)', '', text)
    return text

  def tokenize_text(text):
    # Drop periods and commas, then split on whitespace.
    text = re.sub(r'[.,]', '', text)
    return text.split()

  def lemmatize_word(word):
    # Lower-case first, e.g. Python => python.
    word = word.lower()
    # Lemmatize, e.g. cooked => cook; keep the word itself when no lemma is found.
    lemma = lemmatizer(word)
    return word if lemma is None else lemma

  def remove_stopwords(word, stopwordset):
    # The original code called a remove_stopwords helper that no visible cell
    # defines; this local version follows the contract implied by the
    # "is not None" filter below: stop words map to None, others pass through.
    return None if word in stopwordset else word

  text = cleaning_text(text)
  tokens = tokenize_text(text)
  tokens = [lemmatize_word(word) for word in tokens]
  tokens = [remove_stopwords(word, stop_words) for word in tokens]
  tokens = [word for word in tokens if word is not None]
  return tokens


In [None]:
# NLTK's English stop-word list
en_stop = nltk.corpus.stopwords.words('english')
print(en_stop)

In [None]:
# Preprocess every entry of the "Abstract" column into a list of tokens.
docs = df["Abstract"].values
pp_docs = [preprocessing_text(text) for text in docs]

In [None]:
# Show the tokens of the first preprocessed document.
pp_docs[0]

# 集合ベースで文章を表現し，Jaccard係数で類似度計算

## 集合ベースで文章を表現

In [None]:
# One token set per document, for at most the first 10 documents.
doc_sets = [set(doc) for doc in pp_docs[:10]]
# Later cells refer to the sets by the names set_0 ... set_9, so those names
# are still bound - directly through globals() rather than by formatting each
# set's repr into source text and running it through exec.
for i, doc_set in enumerate(doc_sets):
  globals()['set_{}'.format(i)] = doc_set

## Jaccard係数・Dice係数・Simpson係数を定義

In [None]:
def jaccard_similarity(set_a,set_b):
  """Jaccard coefficient: size of the intersection over size of the union.

  Returns 1.0 when the union is empty, i.e. when both sets are empty.
  """
  shared_size = len(set.intersection(set_a, set_b))
  union_size = len(set.union(set_a, set_b))
  if union_size == 0:
    # Both sets are empty: report them as fully similar.
    return 1.0
  return float(shared_size) / union_size

def dice_similarity(set_a, set_b):
  """Dice coefficient: twice the intersection size over the sum of both sizes.

  Returns 1.0 when both sets are empty.
  """
  shared_size = len(set.intersection(set_a, set_b))
  total_size = len(set_a) + len(set_b)
  if total_size == 0:
    # Both sets are empty: report them as fully similar.
    return 1.0
  return 2 * shared_size / total_size

def simpson_similarity(list_a, list_b):
  """Simpson (overlap) coefficient: intersection size over the smaller set size.

  Args:
    list_a: Iterable of hashable items; duplicates are ignored.
    list_b: Iterable of hashable items; duplicates are ignored.

  Returns:
    The overlap coefficient as a float. When at least one input is empty the
    ratio is undefined; this returns 1.0 if both are empty and 0.0 if only one
    is, which matches what jaccard_similarity and dice_similarity return for
    the same inputs.
  """
  set_a, set_b = set(list_a), set(list_b)
  min_num = min(len(set_a), len(set_b))
  if min_num == 0:
    # Equal sizes here means both sets are empty.
    return 1.0 if len(set_a) == len(set_b) else 0.0
  return len(set_a & set_b) / min_num

## 類似度計算

In [None]:
print("jaccard(0, 1) = ", jaccard_similarity(set_0, set_1)) # Jaccard coefficient of documents 0 and 1

In [None]:
def sim_1dic(num, sim, n_docs=10):
  """Print the similarity between document ``num`` and documents 0..n_docs-1.

  Args:
    num: Index of the reference document; its token set is read from the
      notebook global ``set_<num>``.
    sim: Name prefix of the measure ('jaccard', 'dice' or 'simpson' in this
      notebook); the notebook global ``<sim>_similarity`` is called.
    n_docs: Number of documents (``set_0`` .. ``set_<n_docs-1>``) to compare
      against. Defaults to 10, the value that was previously hard-coded.

  Raises:
    KeyError: If ``<sim>_similarity`` or one of the ``set_<i>`` globals does
      not exist.
  """
  # Look the names up directly instead of building a print statement as text
  # and running it through exec, so ``sim`` is never executed as code.
  namespace = globals()
  similarity = namespace['{}_similarity'.format(sim)]
  base_set = namespace['set_{}'.format(num)]
  for j in range(n_docs):
    print("{}({}, {}) = ".format(sim, num, j), similarity(base_set, namespace['set_{}'.format(j)]))

In [None]:
# Print the Jaccard coefficient of document 0 against documents 0-9.
# The set_<n> variables are looked up through globals() rather than by
# building a print statement as text and running it through exec.
for i in range(1):
  for j in range(10):
    print("jaccard({}, {}) = ".format(i, j), jaccard_similarity(globals()['set_{}'.format(i)], globals()['set_{}'.format(j)]))

In [None]:
# Jaccard coefficient of document 0 against documents 0-9.
sim_1dic(0,'jaccard')

In [None]:
# Dice coefficient of document 0 against documents 0-9.
sim_1dic(0,'dice')

In [None]:
# Simpson coefficient of document 0 against documents 0-9.
sim_1dic(0,'simpson')

In [None]:
# Run all three set-based measures with document 1 as the reference.
num=1
sim_1dic(num,'jaccard')
sim_1dic(num,'dice')
sim_1dic(num,'simpson')

# ベクトルベースで文章を表現し，TF-IDFベクトルのコサイン類似度表現

## ベクトルベースで文章を表現

In [None]:
def bow_vectorizer(docs):
  """Turn tokenized documents into bag-of-words count vectors.

  Args:
    docs: Sequence of documents, each a sequence of tokens.

  Returns:
    A pair (vectors, vocabulary): ``vectors`` holds one list of counts per
    document, and ``vocabulary`` maps each token to its column index, assigned
    in order of first appearance.
  """
  vocabulary = {}
  for tokens in docs:
    for token in tokens:
      # setdefault only inserts unseen tokens, giving them the next free index.
      vocabulary.setdefault(token, len(vocabulary))

  def count_vector(tokens):
    counts = [0] * len(vocabulary)
    for token in tokens:
      counts[vocabulary[token]] += 1
    return counts

  return [count_vector(tokens) for tokens in docs], vocabulary

In [None]:
# Build bag-of-words count vectors for the preprocessed documents and print them.
bow_vec, word2id = bow_vectorizer(pp_docs)
print(bow_vec)

In [None]:
# Length of the first bag-of-words vector, i.e. the vocabulary size.
len(bow_vec[0])

## TF-IDF値を定義

In [None]:
def tfidf_vectorizer(docs):
  """Turn tokenized documents into TF-IDF vectors.

  TF is a term's count divided by the document's token count; IDF is
  log(number of documents / number of documents containing the term).

  Args:
    docs: Sequence (must support ``len``) of documents, each a sequence of
      tokens.

  Returns:
    A pair (vectors, word2id): ``vectors`` holds one list of floats per
    document, and ``word2id`` maps each token to its column index, assigned in
    order of first appearance. A document with no tokens gets an all-zero
    vector rather than NaNs from a 0/0 division.
  """
  word2id = {}
  for doc in docs:
    for w in doc:
      if w not in word2id:
        word2id[w] = len(word2id)

  def tf(word2id, doc):
    # Count in one pass over the document instead of calling doc.count()
    # once per vocabulary entry.
    term_counts = np.zeros(len(word2id))
    for w in doc:
      term_counts[word2id[w]] += 1
    total = term_counts.sum()
    if total == 0:
      # Empty document: keep the zero vector and avoid dividing 0 by 0.
      return term_counts
    return term_counts / total

  def idf(word2id, docs):
    # Number of documents containing each term; every vocabulary term occurs
    # in at least one document, so no entry stays at zero.
    doc_freq = np.zeros(len(word2id))
    for doc in docs:
      for w in set(doc):
        doc_freq[word2id[w]] += 1
    return np.log(len(docs) / doc_freq)

  # IDF depends only on the corpus, so compute it once, not once per document.
  idf_values = idf(word2id, docs)
  return [(tf(word2id, doc) * idf_values).tolist() for doc in docs], word2id

In [None]:
# Compute TF-IDF vectors for the preprocessed documents and print them with the vocabulary.
# Note: this rebinds word2id, which an earlier cell assigned from bow_vectorizer.
tfidf_vector, word2id = tfidf_vectorizer(pp_docs)
print(tfidf_vector)
print(word2id.items())

In [None]:
# Length of the first TF-IDF vector, i.e. the vocabulary size.
len(tfidf_vector[0])

## コサイン類似度計算

In [None]:
def cosine_similarity(list_a, list_b):
  """Cosine similarity of two equal-length numeric vectors.

  Args:
    list_a: Sequence of numbers.
    list_b: Sequence of numbers with the same length as ``list_a``.

  Returns:
    dot(a, b) / (|a| * |b|). When a vector has zero norm the ratio is
    undefined; this returns 1.0 if both norms are zero and 0.0 if only one
    is, which matches what the set-based measures in this notebook return for
    empty inputs.
  """
  # NOTE(review): the original carried a "delete later" comment here - confirm
  # whether this function is meant to stay.
  vec_a = np.asarray(list_a, dtype=float)
  vec_b = np.asarray(list_b, dtype=float)
  norm_a = np.linalg.norm(vec_a)
  norm_b = np.linalg.norm(vec_b)
  # NumPy scalar division by zero yields nan/inf with a RuntimeWarning instead
  # of raising ZeroDivisionError, so the zero-norm case is checked explicitly.
  if norm_a == 0 or norm_b == 0:
    return 1.0 if norm_a == norm_b else 0.0
  return vec_a.dot(vec_b) / (norm_a * norm_b)

In [None]:
# TF-IDF cosine similarity of documents 0 and 1.
print("cosine_similarity(docs[{}], docs[{}]) = ".format(0,1),cosine_similarity(tfidf_vector[0], tfidf_vector[1]))

In [None]:
# TF-IDF cosine similarity of document i against documents 0-9.
# The loop uses i for the reference document, so changing i changes the comparison.
i=0
for j in range(10):
  print("cosine_similarity(docs[{}], docs[{}]) = ".format(i,j),cosine_similarity(tfidf_vector[i], tfidf_vector[j]))


# 集合ベースとベクトルベースの比較

集合演算の方は一つ一つの文書が小さいデータに対して性能が高い  
文書がある程度大きくなるとベクトルベースの方が有用になる  
その代わり、語彙集合が大きくなり計算量が大きくなってしまう


In [None]:
#集合ベース
num=0
sim_1dic(num,'jaccard')
sim_1dic(num,'dice')
sim_1dic(num,'simpson')

In [None]:
#ベクトルベース
i=0
for j in range(10):
  print("cosine_similarity(docs[{}], docs[{}]) = ".format(0,j),cosine_similarity(tfidf_vector[0], tfidf_vector[j]))