<a href="https://colab.research.google.com/github/Soy-code/Code-Up/blob/master/LatentSemanticIndexingModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## 잠재의미분석(LSA/LSI)
- DTM의 잠재된(Latent) 의미를 이끌어내는 방법
- 기존의 DTM이나 DTM에 단어의 중요도에 따른 가중치를 주었던 TF-IDF 행렬은 단어의 의미를 전혀 고려하지 못한다는 단점이 있음
- LSA는 DTM이나 TF-IDF 행렬에 truncated SVD를 사용하여 차원을 축소시키고, 단어들의 잠재적인 의미를 끌어낸다는 아이디어


In [None]:
# Hands-on example: full SVD of a small 4x9 document-term matrix.
import numpy as np
A = np.array([[0,0,0,1,0,1,1,0,0],
              [0,0,0,1,1,0,1,0,0],
              [0,1,1,0,2,0,0,0,0],
              [1,0,0,0,0,0,0,1,1]])
print(A.shape)

# Full SVD.
# U  : orthogonal matrix of left singular vectors
# s  : 1-D array of singular values (NOT a matrix; it is expanded into S below)
# VT : transpose of V (right singular vectors as rows)
U, s, VT = np.linalg.svd(A, full_matrices=True)
print(U.shape, s.shape, VT.shape)
print(U.round(2))
print(s.round(2))
print(VT.round(2))

# Expand s into a rectangular "diagonal" matrix with the same shape as A so
# that U @ S @ VT reproduces A. The shape is derived from A and s instead of
# being hard-coded, so the cell keeps working if A is edited.
S = np.zeros(A.shape)
S[:s.size, :s.size] = np.diag(s)
print(S.round(2))

In [None]:
# Truncated SVD: keep only the first t singular values and the matching
# columns of U / rows of VT.
# NOTE(review): S, U and VT are overwritten in place, so the full
# decomposition is lost after this cell runs; re-run the full-SVD cell before
# trying a larger t.
t = 2
S = S[:t, :t]
print(S.round(2))

U = U[:, :t]
print(U.round(2))

VT = VT[:t, :]
print(VT.round(2))

In [None]:
# Multiply the truncated factors back together to get the low-rank
# approximation of A, then show the original and the approximation.
A_prime = U @ S @ VT
print(A)
print(A_prime.round(2))

In [None]:
# 뉴스그룹 예시
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load the 20 Newsgroups corpus, shuffled with a fixed seed, asking the loader
# to remove headers, footers and quotes from each post.
dataset = fetch_20newsgroups(shuffle= True, random_state = 1,
                             remove = ('headers', 'footers', 'quotes'))
documents = dataset.data

In [None]:
news_df = pd.DataFrame({'document': documents})
# Replace every non-letter character with a space. regex=True is required:
# since pandas 2.0 Series.str.replace treats the pattern as a literal string
# by default, which would silently leave the text unchanged.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z]", " ", regex=True)
# Drop short words (3 characters or fewer).
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lower-case everything.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [None]:
stop_words = stopwords.words('english')
# Membership is tested once per token of every document, so use a set for
# O(1) lookups; `stop_words` itself is left as the original list.
stop_word_set = set(stop_words)
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # tokenize
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

In [None]:
# Prepare the TF-IDF input: join each token list back into one
# space-separated string. Iterating the Series directly avoids label-based
# indexing and no longer overwrites the notebook-global `t` (the SVD
# truncation rank) with a temporary string.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

news_df['clean_doc'] = detokenized_doc

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the cleaned documents into a TF-IDF matrix X, with the vocabulary
# limited by max_features = 1000 and document frequency capped by max_df = 0.5.
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000,
                             max_df = 0.5, smooth_idf =True)
X = vectorizer.fit_transform(news_df['clean_doc'])
print(X.shape)

In [None]:
# Topic modelling
# Fit a 20-component truncated SVD on the TF-IDF matrix X; each component is
# printed as one topic further down.
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)

In [None]:
terms = vectorizer.get_feature_names_out() # Vocabulary of the fitted vectorizer (limited to 1,000 terms by max_features).

def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every topic.

    Parameters
    ----------
    components : iterable of 1-D arrays, one row of term weights per topic
    feature_names : sequence mapping a column position to its term
    n : number of top terms to show per topic (default 5)

    Returns
    -------
    None; one line per topic is written to stdout, numbered from 1, as a
    list of (term, weight rounded to 5 decimals) pairs in descending weight.
    """
    for topic_no, weights in enumerate(components, start=1):
        # Column positions ordered from largest to smallest weight.
        top_columns = weights.argsort()[::-1][:n]
        top_terms = []
        for col in top_columns:
            top_terms.append((feature_names[col], weights[col].round(5)))
        print("Topic %d:" % topic_no, top_terms)
# Print the top terms (default n=5) of each fitted SVD component.
get_topics(svd_model.components_,terms)

# Gensim 모델

In [None]:
!pip install gensim

In [None]:
import pandas as pd
import gensim
from gensim.parsing.preprocessing import preprocess_documents
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

https://medium.com/betacom/latent-semantic-indexing-in-python-85880414b4de

https://www.projectpro.io/recipes/create-lsi-topic-model-gensim

https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/

In [None]:
import csv

In [None]:
# newline='' is what the csv module requires of file objects passed to
# csv.reader, so that newlines embedded in quoted fields are interpreted
# correctly.
f = open('/content/sample_data/wiki_movie_plots_deduped.csv', encoding='utf8', newline='')

In [None]:
# Read every CSV row into a list. `with f:` closes the already-open handle
# even if parsing raises, instead of relying on reaching a trailing close().
with f:
    csv_list = list(csv.reader(f))

# Every cell is still a raw string here, and the header is the first data row.
df = pd.DataFrame(csv_list)

In [None]:
# The first row holds the CSV header: use it as the column labels, then
# remove that row and renumber the remaining rows from 0.
df.columns = df.iloc[0]
df = df.drop(index=0).reset_index(drop=True)

In [None]:
# Show the column names taken from the CSV header row.
df.columns

In [None]:
# The CSV was read as raw strings, so cast the year to int before it is
# compared numerically below.
df['Release Year'] = df['Release Year'].astype(int)

In [None]:
# Take the pre-2000 rows BEFORE narrowing `df`; filtering `df` first left
# nothing below 2000 to select, so `new_df` was always empty.
# NOTE(review): presumably new_df holds the films kept out of the index to
# serve as query documents; confirm the intended use.
new_df = df[df['Release Year'] < 2000]
df = df[df['Release Year'] >= 2000]
# Plots of the films released in 2000 or later form the indexed corpus.
text_corpus = df['Plot'].values

In [None]:
# Run gensim's preprocess_documents over every plot, build a token-to-id
# dictionary from the result, and convert each plot to a bag-of-words vector.
processed_corpus = preprocess_documents(text_corpus)
dictionary = gensim.corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
# NOTE(review): smartirs = 'npu' selects the weighting scheme in SMART
# notation; check the gensim TfidfModel docs for what each letter means.
tfidf = gensim.models.TfidfModel(bow_corpus, smartirs = 'npu')
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# Train a 200-topic LSI model on the TF-IDF corpus, then build a similarity
# index over the corpus projected into that LSI space.
lsi = gensim.models.LsiModel(corpus_tfidf, num_topics=200)
index = gensim.similarities.MatrixSimilarity(lsi[corpus_tfidf])

In [None]:
# Project a query text into the LSI space and print the 10 most similar films.
# NOTE(review): `new_doc` (the raw query text) is not defined in any visible
# cell; assign it before running this cell.
# The tokens get their own name so `new_doc` stays a string and the cell can
# be re-run.
new_tokens = gensim.parsing.preprocessing.preprocess_string(new_doc)
new_vec = dictionary.doc2bow(new_tokens)
vec_bow_tfidf = tfidf[new_vec]
vec_lsi = lsi[vec_bow_tfidf]
sims = index[vec_lsi]
# Sort (position, similarity) pairs by descending similarity. The loop names
# avoid `s`, which would clobber the singular-value array from the SVD cells.
for doc_pos, score in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:
    # Straight quotes: the original typographic quotes were a SyntaxError.
    print(f"{df['Title'].iloc[doc_pos]} : {score!s}")

In [None]:
# Minimal self-contained LSI example on three sentences.
# NOTE(review): this cell rebinds `documents` and `dictionary`, which earlier
# cells use for the newsgroup texts and the movie-plot vocabulary; re-run
# those cells before going back to the earlier steps.
documents = [
    "Gensim is a powerful library for natural language processing in Python.",
    "LSI model can be used for topic modeling in Gensim.",
    "Topic modeling is useful to discover hidden semantic patterns in text data."
]

# Lower-case and tokenize each sentence, map tokens to ids, and build the
# bag-of-words corpus.
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]
dictionary = gensim.corpora.Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_documents]
# Fit a single-topic LSI model directly on raw counts and print the topic.
lsi_model = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics= 1)
topics = lsi_model.print_topics()
for topic in topics:
    print(topic)

In [None]:
def Find_Optimal_Cutoff(target, predicted):
    """Find the ROC threshold at which tpr and (1 - fpr) are closest.

    The ROC curve is computed from the inputs and the threshold minimising
    ``|tpr - (1 - fpr)|`` is returned.

    Parameters
    ----------
    target : Matrix with dependent or target data, where rows are observations
    predicted : Matrix with predicted data, where rows are observations

    Returns
    -------
    list type, with optimal cutoff value (a single element)
    """
    # Imported locally: no visible cell of the notebook imports roc_curve.
    from sklearn.metrics import roc_curve

    fpr, tpr, threshold = roc_curve(target, predicted)
    i = np.arange(len(tpr))
    roc = pd.DataFrame({'tf': pd.Series(tpr - (1 - fpr), index=i),
                        'threshold': pd.Series(threshold, index=i)})
    # DataFrame.ix was removed in pandas 1.0. argsort yields row positions,
    # so select the best row positionally with iloc.
    order = roc['tf'].abs().to_numpy().argsort()
    roc_t = roc.iloc[order[:1]]

    return list(roc_t['threshold'])