# 01. 텍스트 분석 이해
### * 텍스트 분석 프로세스
1. 텍스트 전처리
2. 피처 벡터화/추출
3. ML 모델 수립 및 학습/예측/평가

### * 파이썬 기반의 NLP, 텍스트 분석 패키지
- NLTK
- Gensim
- SpaCy

# 02. 텍스트 사전 준비 작업 (텍스트 전처리) - 텍스트 정규화
### * 클렌징
### * 텍스트 토큰화 
#### 1) 문장 토큰화

In [1]:
from nltk import sent_tokenize
import nltk
nltk.download('punkt')

text_sample = 'The Matrix is everywhere its all around us, here even in this room. You can see it out your window or on your television. You feel it when you go to work, or go to church or pay your taxes.'
sentences = sent_tokenize(text=text_sample)
print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jangseojin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from nltk import word_tokenize

sentence = 'The Matrix is everywhere its all around us, here even in this room.'
words = word_tokenize(sentence)
print(type(words), len(words))
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [3]:
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    return word_tokens

word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


### * 스톱 워드 제거

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jangseojin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
print('English stop words count:', len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:20])

English stop words count: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [6]:
import nltk

stopwords = nltk.corpus.stopwords.words('english')
all_tokens=[]
for sentence in word_tokens:
    filtered_words=[]
    for word in sentence:
        word=word.lower()
        if word not in stopwords:
            filtered_words.append(word)
    all_tokens.append(filtered_words)
    
print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


### * Stemming & Lemmatization

In [7]:
from nltk.stem import LancasterStemmer
stemmer=LancasterStemmer()
print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked'))
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused'))
print(stemmer.stem('happier'), stemmer.stem('happiest'))
print(stemmer.stem('fancier'), stemmer.stem('fanciest'))

work work work
amus amus amus
happy happiest
fant fanciest


In [8]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemma=WordNetLemmatizer()
print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused', 'v'))
print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))
print(lemma.lemmatize('fancier', 'a'), lemma.lemmatize('fanciest', 'a'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jangseojin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


amuse amuse amuse
happy happy
fancy fancy


# 03. Bag of Words - BOW
### * BOW 피처 벡터화
- Count Vectorize : 카운트 값이 높을 수록 중요한 단어로 인식
- TF-IDF : 개별 문서에서 자주 나타나는 단어에 높은 가중치를 주되 언어의 특성 상 문장에 자주 사용되어 모든 문서에 전반적으로 자주 나타나는 단어에 대해서는 페널티 부과

### * 사이킷런의 Count 및 TF-IDF 벡터화 구현: CountVectorizer, TfidfVectorizer
### * BOW 벡터화를 위한 희소 행렬
### * 희소 행렬 - COO 형식

In [9]:
import numpy as np
dense = np.array([[3,0,1], [0,2,0]])

In [10]:
from scipy import sparse

data = np.array([3,1,2])

row_pos=np.array([0,0,1])
col_pos=np.array([0,2,1])

sparse_coo=sparse.coo_matrix((data, (row_pos, col_pos)))

In [11]:
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

### * 희소 행렬 - CSR 형식

In [12]:
[[0,0,1,0,0,5], [1,4,0,3,2,5], [0,6,0,3,0,0], [2,0,0,0,0,0], [0,0,0,7,0,8], [1,0,0,0,0,0]]

[[0, 0, 1, 0, 0, 5],
 [1, 4, 0, 3, 2, 5],
 [0, 6, 0, 3, 0, 0],
 [2, 0, 0, 0, 0, 0],
 [0, 0, 0, 7, 0, 8],
 [1, 0, 0, 0, 0, 0]]

In [13]:
from scipy import sparse

dense2=np.array([[0,0,1,0,0,5], [1,4,0,3,2,5], [0,6,0,3,0,0], [2,0,0,0,0,0], [0,0,0,7,0,8], [1,0,0,0,0,0]])
data2=np.array([1,5,1,4,3,2,5,6,3,2,7,8,1])
row_pos=np.array([0,0,1,1,1,1,1,2,2,3,4,4,5])
col_pos=np.array([2,5,0,1,2,4,5,1,3,0,3,5,0])

sparse_coo=sparse.coo_matrix((data2, (row_pos, col_pos)))

row_pos_ind=np.array([0,2,7,9,10,12,13])

sparse_csr=sparse.csr_matrix((data2, col_pos, row_pos_ind))

print('COO Dense')
print(sparse_coo.toarray())
print('CSR Dense')
print(sparse_csr.toarray())

COO Dense
[[0 0 1 0 0 5]
 [1 4 3 0 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
CSR Dense
[[0 0 1 0 0 5]
 [1 4 3 0 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


# 04. 텍스트 분류 실습 - 20 뉴스그룹 분류
### * 텍스트 정규화

In [14]:
from sklearn.datasets import fetch_20newsgroups
news_data=fetch_20newsgroups(subset='all', random_state=156)

In [15]:
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [16]:
import pandas as pd

print('Target class value & dist \n', pd.Series(news_data.target).value_counts().sort_index())
print('\nTarget class names \n', news_data.target_names)

Target class value & dist 
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

Target class names 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [17]:
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [18]:
from sklearn.datasets import fetch_20newsgroups

train_news=fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)
x_train=train_news.data
y_train=train_news.target

test_news=fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)
x_test=test_news.data
y_test=test_news.target

print('Train dataset size is {0}, Test dataset size is {1}'.format(len(train_news.data), len(test_news.data)))

Train dataset size is 11314, Test dataset size is 7532


### * 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

cnt_vect=CountVectorizer()
cnt_vect.fit(x_train)
x_train_cnt_vect=cnt_vect.transform(x_train)

x_test_cnt_vect=cnt_vect.transform(x_test)

print('CountVectorizer Shape of train dataset text:', x_train_cnt_vect.shape)

CountVectorizer Shape of train dataset text: (11314, 101631)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf=LogisticRegression(solver='liblinear')
lr_clf.fit(x_train_cnt_vect,y_train)
pred=lr_clf.predict(x_test_cnt_vect)
print('CountVectorized Logistic Regression accuracy score is {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression accuracy score is 0.617




In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect=TfidfVectorizer()
tfidf_vect.fit(x_train)
x_train_tfidf_vect=tfidf_vect.transform(x_train)
x_test_tfidf_vect=tfidf_vect.transform(x_test)

lr_clf=LogisticRegression(solver='liblinear')
lr_clf.fit(x_train_tfidf_vect,y_train)
pred=lr_clf.predict(x_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression accuracy score is {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Vectorized Logistic Regression accuracy score is 0.678


In [22]:
tfidf_vect=TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)
tfidf_vect.fit(x_train)
x_train_tfidf_vect=tfidf_vect.transform(x_train)
x_test_tfidf_vect=tfidf_vect.transform(x_test)

lr_clf=LogisticRegression(solver='liblinear')
lr_clf.fit(x_train_tfidf_vect,y_train)
pred=lr_clf.predict(x_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression accuracy score is {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Vectorized Logistic Regression accuracy score is 0.690


In [23]:
from sklearn.model_selection import GridSearchCV

params={'C':[0.01,0.1,1,5,10]}
grid_cv_lr=GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_lr.fit(x_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter :', grid_cv_lr.best_params_)

pred=grid_cv_lr.predict(x_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression accuracy score is {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.3min finished


Logistic Regression best C parameter : {'C': 10}
TF-IDF Vectorized Logistic Regression accuracy score is 0.704


### * Pipeline 사용 및 GridSearchCV와의 결합

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

pipeline=Pipeline([('tfidf_vect', TfidfVectorizer(stop_words='english')), 
                   ('lr_clf', LogisticRegression(random_state=156))])

In [25]:
pipeline=Pipeline([('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)), 
                   ('lr_clf', LogisticRegression(C=10, solver='liblinear'))])
pipeline.fit(x_train, y_train)
pred=pipeline.predict(x_test)
print('Pipeline Logistic Regression accuracy score is {0:.3f}'.format(accuracy_score(y_test, pred)))

Pipeline Logistic Regression accuracy score is 0.704


In [None]:
from sklearn.pipeline import Pipeline

pipeline=Pipeline([('tfidf_vect', TfidfVectorizer(stop_words='english')), 
                   ('lr_clf', LogisticRegression())])
params={'tfidf_vect__ngram_range':[(1,1),(1,2),(1,3)],
        'tfidf_vect__max_df':[100,300,700], 
        'lr_clf__C':[1,5,10]}
grid_cv_pipe=GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_pipe.fit(x_train, y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)

pred=grid_cv_pipe.predict(x_test)
print('Pipeline Logistic Regression accuracy score is {0:.3f}'.format(accuracy_score(y_test, pred)))

# 05. Sentiment Analysis
### * 감성 분석 소개
### * 지도학습 기반 감성 분석 실습 - IMDB 영화평

In [3]:
import pandas as pd

review_df=pd.read_csv('./word2vec-nlp-tutorial/labeledTrainData.tsv', header=0, sep='\t',quoting=3)
review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
print(review_df['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [5]:
import re

review_df['review']=review_df['review'].str.replace('<br />', ' ')
review_df['review']=review_df['review'].apply(lambda x : re.sub('[^a-zA-Z]', ' ', x))

In [6]:
from sklearn.model_selection import train_test_split

class_df=review_df['sentiment']
feature_df=review_df.drop(['id', 'sentiment'], axis=1, inplace=False)

x_train, x_test, y_train, y_test=train_test_split(feature_df, class_df, test_size=0.3, random_state=156)
x_train.shape, x_test.shape

((17500, 1), (7500, 1))

In [32]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

pipeline=Pipeline([('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))), 
                   ('lr_clf', LogisticRegression(C=10, solver='liblinear'))])
pipeline.fit(x_train['review'], y_train)
pred=pipeline.predict(x_test['review'])
pred_probs=pipeline.predict_proba(x_test['review'])[:,1]
print('Accuracy score is {0:.4f}, ROC-AUC is {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_probs)))

Accuracy score is 0.8861, ROC-AUC is 0.9503


In [33]:
pipeline=Pipeline([('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))), 
                   ('lr_clf', LogisticRegression(C=10, solver='liblinear'))])
pipeline.fit(x_train['review'], y_train)
pred=pipeline.predict(x_test['review'])
pred_probs=pipeline.predict_proba(x_test['review'])[:,1]
print('Accuracy score is {0:.4f}, ROC-AUC is {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_probs)))

Accuracy score is 0.8936, ROC-AUC is 0.9598


### * 비지도 학습 기반 감성 분석 소개
### * SentiWordNet을 이용한 감성 분석
### * VADER를 이용한 감성 분석

In [10]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer=SentimentIntensityAnalyzer()
senti_scores=senti_analyzer.polarity_scores(review_df['review'][0])
print(senti_scores)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jangseojin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import numpy as np

def vader_polarity(review, threshold=0.1):
    analyzer=SentimentIntensityAnalyzer()
    scores=analyzer.polarity_scores(review)
    
    agg_score=scores['compound']
    final_sentiment = 1 if agg_score>=threshold else 0
    return final_sentiment

review_df['vader_preds']=review_df['review'].apply(lambda x : vader_polarity(x, 0.1))
y_target=review_df['sentiment'].values
vader_preds=review_df['vader_preds'].values

print(confusion_matrix(y_target, vader_preds))
print('Accuracy:', np.round(accuracy_score(y_target, vader_preds),4))
print('Precision:', np.round(precision_score(y_target, vader_preds),4))
print('Recall:', np.round(recall_score(y_target, vader_preds),4))

[[ 6730  5770]
 [ 1857 10643]]
Accuracy: 0.6949
Precision: 0.6484
Recall: 0.8514


# 06. Topic Modeling

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cats=['rec.motorcycles','rec.sport.baseball','comp.graphics','comp.windows.x',
     'talk.politics.mideast','soc.religion.christian','sci.electronics','sci.med']
news_df=fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), categories=cats, random_state=0)
count_vect=CountVectorizer(max_df=0.95, max_features=1000, min_df=2, stop_words='english', ngram_range=(1,2))
feat_vect=count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect.shape)

CountVectorizer Shape: (7862, 1000)


In [2]:
lda=LatentDirichletAllocation(n_components=8, random_state=0)
lda.fit(feat_vect)
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [5]:
def display_topics(model, feature_names, no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index)
        
        topic_word_indexes=topic.argsort()[::-1]
        top_indexes=topic_word_indexes[:no_top_words]
        
        feature_concat=' '.join([feature_names[i] for i in top_indexes])
        print(feature_concat)
        
feature_names=count_vect.get_feature_names()
display_topics(lda, feature_names, 15)

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


# 07. Document Clustering
### * Opinion Review dataset을 이용한 Document Clustering

In [5]:
import pandas as pd
import glob, os

path = "./OpinosisDataset1.0/topics"
all_files = glob.glob(os.path.join(path,"*.data"))

filename_list = []
opinion_text = []

for file_ in all_files:
    df = pd.read_table(file_,index_col=None, header=0, encoding='latin1')
    filename_ = file_.split('/')[-1]
    filename = filename_.split('.')[0]
    
    filename_list.append(filename)
    opinion_text.append(str(df))

document_df=pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,battery-life_ipod_nano_8gb,short battery life I moved up from a...
1,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fa...
2,room_holiday_inn_london,"We arrived at 23,30 hours and they could n..."
3,location_holiday_inn_london,Great location for tube and we crammed in...
4,staff_bestwestern_hotel_sfo,Staff are friendly and hel...


In [54]:
def LemTokens(tokens):
    lemmer = nltk.stem.WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]
    # remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer(tokenizer=LemNormalize, stop_words='english', ngram_range=(1,2), min_df=0.05, max_df=0.85)
feature_vect=tfidf_vect.fit_transform(document_df['opinion_text'])

### * 군집별 핵심 단어 추출하기

# 08. 문서 유사도
### * 문서 유사도 측정 방법 - 코사인 유사도

In [96]:
import numpy as np

def cos_similarity(v1, v2):
    dot_product=np.dot(v1,v2)
    l2_norm=(np.sqrt(sum(np.square(v1)))*np.sqrt(sum(np.square(v2))))
    similarity=dot_product/l2_norm
    
    return similarity

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer

doc_list=['if you take the blue pill, the story ends',
         'if you take the red pill, you stay in Wonderland',
         'if you take the red pill, I show you how deep the rabbit hole goes']

tfidf_vect_simple=TfidfVectorizer()
feature_vect_simple=tfidf_vect_simple.fit_transform(doc_list)
print(feature_vect_simple.shape)

(3, 18)


In [99]:
feature_vect_dense=feature_vect_simple.todense()
vect1=np.array(feature_vect_dense[0]).reshape(-1,)
vect2=np.array(feature_vect_dense[1]).reshape(-1,)
similarity_simple=cos_similarity(vect1, vect2)
print('Sentence 1, Sentence 2 Cosine Similarity: {0:.3f}'.format(similarity_simple))

Sentence 1, Sentence 2 Cosine Similarity: 0.402


In [100]:
vect1=np.array(feature_vect_dense[0]).reshape(-1,)
vect3=np.array(feature_vect_dense[2]).reshape(-1,)
similarity_simple=cos_similarity(vect1, vect3)
print('Sentence 1, Sentence 3 Cosine Similarity: {0:.3f}'.format(similarity_simple))

vect2=np.array(feature_vect_dense[1]).reshape(-1,)
vect3=np.array(feature_vect_dense[2]).reshape(-1,)
similarity_simple=cos_similarity( vect2, vect3)
print('Sentence 2, Sentence 3 Cosine Similarity: {0:.3f}'.format(similarity_simple))

Sentence 1, Sentence 3 Cosine Similarity: 0.404
Sentence 2, Sentence 3 Cosine Similarity: 0.456


In [101]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_simple_pair=cosine_similarity(feature_vect_simple[0], feature_vect_simple)
print(similarity_simple_pair)

[[1.         0.40207758 0.40425045]]


In [102]:
similarity_simple_pair=cosine_similarity(feature_vect_simple[0], feature_vect_simple[1:])
print(similarity_simple_pair)

[[0.40207758 0.40425045]]


In [103]:
similarity_simple_pair=cosine_similarity(feature_vect_simple, feature_vect_simple)
print(similarity_simple_pair)
print('Shape:', similarity_simple_pair.shape)

[[1.         0.40207758 0.40425045]
 [0.40207758 1.         0.45647296]
 [0.40425045 0.45647296 1.        ]]
Shape: (3, 3)


### * Opinion Review dataset를 이용한 문서 유사도 측정

# 09. 한글 텍스트 처리
### * 한글 NLP 처리의 어려움
### * KoNLPy 
### * 데이터 로딩

In [1]:
import pandas as pd

train_df=pd.read_csv('ratings_train.txt', sep='\t')
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [2]:
train_df['label'].value_counts()

0    75173
1    74827
Name: label, dtype: int64

In [3]:
import re

train_df=train_df.fillna(' ')
train_df['document']=train_df['document'].apply(lambda x : re.sub(r'\d+','',x))

test_df=pd.read_csv('ratings_test.txt', sep='\t')
test_df=test_df.fillna(' ')
test_df['document']=test_df['document'].apply(lambda x : re.sub(r'\d+','',x))

train_df.drop('id', axis=1, inplace=True)
test_df.drop('id', axis=1, inplace=True)

In [5]:
from konlpy.tag import Okt

twitter=Okt()
def tw_tokenizer(text):
    tokens_ko=twitter.morphs(text)
    return tokens_ko
tw_tokenizer('아빠가방에들어가신다')

['아빠', '가방', '에', '들어가신다']

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

tfidf_vect=TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train_df['document'])
tfidf_matrix_train=tfidf_vect.transform(train_df['document'])

In [10]:
lg_clf=LogisticRegression(random_state=0)
params={'C':[1,3.5,4.5,5.5,10]}
grid_cv=GridSearchCV(lg_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv.fit(tfidf_matrix_train, train_df['label'])
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/pre

{'C': 3.5} 0.859


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.metrics import accuracy_score

tfidf_matrix_test=tfidf_vect.transform(test_df['document'])
best_estimator=grid_cv
preds= best_estimator.predict(tfidf_matrix_test)

print('Logistic Regression Accuracy:', accuracy_score(test_df['label'], preds))

Logistic Regression Accuracy: 0.86192


# 10. 텍스트 분석 실습 - 캐글 Mercari Price Suggestion Challenge