# 목표 : 머신러닝 돌리기

In [1]:
import pandas as pd

# 직접 분류한 리뷰로 머신러닝 돌리기 => 정확도 95%

In [10]:
sentiment_1 = pd.read_csv("./data/sentiment_1.csv")

In [11]:
sentiment_1 = sentiment_1.drop(["Unnamed: 0"], axis=1)

In [12]:
sentiment_1.head(3)

Unnamed: 0,review,P/N
0,We’d never had Korean before and I’d been want...,1
1,I really was unsure of how much of the menu wo...,1
2,Absolutely delicious authentic Korean food ser...,1


In [7]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [13]:
import re
import pandas as pd
from time import time

# 전처리 작업을 위해 호출될 함수
def preprocessor(text) :
    # 문자열의 내의 html 태그를 삭제한다.
    # 문자열에서 이모티콘을 찾아낸다.
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)|\^.?\^', str(text))
    # 문장에서 특수문자를 제거하고
    # 문자열을 소문자로 변하고
    # 추출한 이모티콘을 붙혀준다.
    text = re.sub('[\W]+', ' ', str(text).lower() + ' '.join(emoticons).replace('-', ''))
    # print(text)
    return text

In [14]:
sentiment_1["review"] = sentiment_1["review"].apply(preprocessor)

In [15]:
sentiment_1.to_csv("./data/refined_review.csv", index=False)

In [16]:
refined_review = pd.read_csv("./data/refined_review.csv")
refined_review.head()

Unnamed: 0,review,P/N
0,we d never had korean before and i d been want...,1
1,i really was unsure of how much of the menu wo...,1
2,absolutely delicious authentic korean food ser...,1
3,the banchan or side dishes that they serve are...,1
4,my eleven year old twins beg to get biminbop o...,1


In [74]:
# def step2_preprocessing() :
#     # csv 데이터를 읽어온다.
#     df = pd.read_csv('./data/trip_final.csv')

#     # 전처리 작업
#     stime = time()
#     print('전처리 시작')
#     df["review"] = df['review'].apply(preprocessor)
#     print('전처리 완료')
#     print('소요시간 : %d' % (time() - stime))

#     # 전처리된 데이터를 저장한다.
#     df.to_csv('./data/pre_review.csv', index=False)

In [82]:
from sklearn.model_selection import train_test_split

In [78]:
# # 평점 전처리
# def star_proprocessing(text) :
#     value = int(text)
#     if value <= 3.0 :
#         return '0'
#     else :
#         return '1'

In [83]:
# def step2_preprocessing():
#     # 수집한 데이터를 읽어온다.
#     df = pd.read_csv('./data/trip_final.csv')
#     # print(df)

#     # 전처리 과정
#     df['rating'] = df['rating'].apply(star_proprocessing)
#     # 학습 데이터와 테스트 데이터로 나눈다.
#     text_list = df['review'].tolist()
#     star_list = df['rating'].tolist()

#     text_train, text_test, star_train, star_test = train_test_split(text_list, star_list, test_size=0.3, random_state=0)
#     #print(len(text_train))
#     #print(len(text_test))
#     #print(len(star_train))
#     #print(len(star_test))

#     # 저장한다.
#     dic_train = {
#         'text' : text_train,
#         'star' : star_train
#     }
#     df_tran = pd.DataFrame(dic_train)

#     dic_test = {
#         'text' : text_test,
#         'star' : star_test
#     }
#     df_test = pd.DataFrame(dic_test)

#     df_tran.to_csv('./data/trip_train_data.csv', index=False)
#     df_test.to_csv('./data/trip_test_data.csv', index=False)

In [84]:
step2_preprocessing()

In [17]:
# step3_word_tokenizer.py
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk

# stopword 단어 사전을 다운로드 받는다.
nltk.download('stopwords')
# stopword 데이터를 가져온다.
stop = stopwords.words('english')
# 단어 줄기를 하기위한 객체
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# 공백으로 단어분리
def tokenizer(text) :
    return text.split()

In [19]:
# 단어줄기
def tokenizer_porter(text) :
    return [porter.stem(word) for word in text.split()]

In [88]:
def tokenizer_stopwordsr(text) :
    # 띄어쓰기를 기준으로 분리한다.
    word_list = text.split()
    #단어 줄기 처리
    word_list2 = \
        [porter.stem(word) for word in word_list]
    #불용어 처리
    result = []
    for w in word_list2: 
        if w not in stop: 
            result.append(w)
    return result

In [90]:
def step3_word_tokenizer() :
    text = 'runners like running and thus they run'

    a1 = tokenizer(text)
    a2 = tokenizer_porter(text)
    print('a1 :', a1)
    print('a2 :', a2)

In [40]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import pickle
from time import time
import pandas as pd
import os
from sklearn.metrics import accuracy_score

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
from sklearn.linear_model import LogisticRegression

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [41]:
def step4_learning() :
    # csv 파일에서 데이터를 읽어온다.
    df = pd.read_csv('./data/refined_review.csv')
    # 테스트, 학습데이터로 나눈다.
    X_train = df.loc[:700 - 1, 'review'].values
    y_train = df.loc[:700 - 1, 'P/N'].values

    X_test = df.loc[300:, 'review'].values
    y_test = df.loc[300:, 'P/N'].values

    # 단어장을 만들어주는 객체 생성
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)
    # tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer_stopwordsr)
    # 데이터를 학습하기 위한 객체
    logistic = LogisticRegression(C=10.0, penalty='l2', random_state=0)
    # 파이프 라인 설정
    pipeline = Pipeline([('vect', tfidf), ('clf', logistic)])

    # 학습한다.
    stime = time()
    print('학습 시작')
    pipeline.fit(X_train, y_train)
    print('학습 종료')
    print('총 학습시간 : %d' % (time() - stime))

    # 테스트
    y_pred = pipeline.predict(X_test)
    print("정확도 : %.3f" % accuracy_score(y_test, y_pred))

    # 성능 확인
    y_true = y_test
    y_hat = y_pred
    print("R2 score : ", r2_score(y_true, y_hat))
    print("mean_absolute_error : ", mean_absolute_error(y_true, y_hat))
    print("mean_squared_error : ", mean_squared_error(y_true, y_hat))

    # 학습이 완료된 객체를 저장한다.
    with open('./data/trip.dat', 'wb') as fp :
        pickle.dump(pipeline, fp)

    print('저장완료')

In [42]:
step4_learning()

학습 시작
학습 종료
총 학습시간 : 0
정확도 : 0.951
R2 score :  0.4861405405405407
mean_absolute_error :  0.04864091559370529
mean_squared_error :  0.04864091559370529
저장완료


# 별점으로 긍정부정 머신러닝 학습시키기

In [43]:
sentiment_2 = pd.read_csv("./data/sentiment_2.csv")

In [45]:
sentiment_2 = sentiment_2.drop(["Unnamed: 0"], axis=1)

In [46]:
sentiment_2["review"] = sentiment_2["review"].apply(preprocessor)

In [47]:
sentiment_2.to_csv("./data/refined_review_2.csv", index=False)

In [60]:
refined_review_2 = refined_review_2.dropna(axis=0)

In [61]:
refined_review_2.to_csv("./data/refined_review_2.csv", index=False)

In [62]:
refined_review_2 = pd.read_csv("./data/refined_review_2.csv")
refined_review_2.head()

Unnamed: 0,review,P/N
0,we d never had korean before and i d been want...,1
1,i really was unsure of how much of the menu wo...,1
2,absolutely delicious authentic korean food ser...,1
3,the banchan or side dishes that they serve are...,1
4,my eleven year old twins beg to get biminbop o...,1


In [63]:
refined_review_2.isnull().sum()

review    0
P/N       0
dtype: int64

In [None]:
refined_review_2

In [64]:
def step4_learning() :
    # csv 파일에서 데이터를 읽어온다.
    df = pd.read_csv('./data/refined_review_2.csv')
    # 테스트, 학습데이터로 나눈다.
    X_train = df.loc[:35000, 'review'].values
    y_train = df.loc[:35000, 'P/N'].values

    X_test = df.loc[15000:, 'review'].values
    y_test = df.loc[15000:, 'P/N'].values

    # 단어장을 만들어주는 객체 생성
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)
    # tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer_stopwordsr)
    # 데이터를 학습하기 위한 객체
    logistic = LogisticRegression(C=10.0, penalty='l2', random_state=0)
    # 파이프 라인 설정
    pipeline = Pipeline([('vect', tfidf), ('clf', logistic)])

    # 학습한다.
    stime = time()
    print('학습 시작')
    pipeline.fit(X_train, y_train)
    print('학습 종료')
    print('총 학습시간 : %d' % (time() - stime))

    # 테스트
    y_pred = pipeline.predict(X_test)
    print("정확도 : %.3f" % accuracy_score(y_test, y_pred))

    # 성능 확인
    y_true = y_test
    y_hat = y_pred
    print("R2 score : ", r2_score(y_true, y_hat))
    print("mean_absolute_error : ", mean_absolute_error(y_true, y_hat))
    print("mean_squared_error : ", mean_squared_error(y_true, y_hat))

    # 학습이 완료된 객체를 저장한다.
    with open('./data/trip_2.dat', 'wb') as fp :
        pickle.dump(pipeline, fp)

    print('저장완료')

In [65]:
step4_learning()

학습 시작


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


학습 종료
총 학습시간 : 2
정확도 : 0.898
R2 score :  0.38259795409770103
mean_absolute_error :  0.10150583986252294
mean_squared_error :  0.10150583986252294
저장완료
