# LogisticRegression_TF-IDF

In [1]:
import numpy as np
import pandas as pd
import re
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split # 데이터셋 분류
from sklearn.linear_model import LogisticRegression # 로지스틱 모델 선언
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings(action='ignore')

RANDOM_SEED = 32
TEST_SPLIT = 0.2

DATA_IN_PATH='./data_in/'

TRAIN_CLEAN_DATA='train_clean.csv'
TEST_CLEAN_DATA='test_clean.csv'

In [2]:
train_data = pd.read_csv( DATA_IN_PATH + TRAIN_CLEAN_DATA, names=['sentence','label'] )
sentences = list(train_data['sentence'])
labels = list(train_data['label'])

In [12]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(1,3), max_features=5000) 

X = vectorizer.fit_transform(sentences)
y = np.array(labels)

[' ',
 " '",
 " '!",
 ' \'"',
 " '(",
 " ')",
 " ',",
 " '-",
 " '.",
 " '1",
 " '2",
 " '3",
 " '4",
 " '5",
 " '6",
 " '7",
 " '8",
 " '9",
 " ';",
 " '=",
 " '?",
 " '^",
 " '`",
 " 'a",
 " 'b",
 " 's",
 " '~",
 " '가",
 " '각",
 " '간",
 " '갈",
 " '감",
 " '갑",
 " '강",
 " '갖",
 " '같",
 " '개",
 " '걍",
 " '거",
 " '걱",
 " '건",
 " '걸",
 " '검",
 " '게",
 " '겠",
 " '결",
 " '경",
 " '계",
 " '고",
 " '곧",
 " '곳",
 " '공",
 " '과",
 " '관",
 " '광",
 " '괜",
 " '교",
 " '구",
 " '국",
 " '군",
 " '궁",
 " '권",
 " '귀",
 " '규",
 " '그",
 " '극",
 " '근",
 " '글",
 " '금",
 " '급",
 " '기",
 " '길",
 " '김",
 " '까",
 " '깔",
 " '깜",
 " '깨",
 " '꺼",
 " '께",
 " '꼬",
 " '꼭",
 " '꼴",
 " '꾸",
 " '끄",
 " '끝",
 " '끼",
 " '나",
 " '난",
 " '날",
 " '남",
 " '낫",
 " '낮",
 " '내",
 " '냐",
 " '너",
 " '넌",
 " '넘",
 " '넣",
 " '네",
 " '녀",
 " '년",
 " '노",
 " '논",
 " '놀",
 " '놈",
 " '높",
 " '놓",
 " '뇌",
 " '누",
 " '눈",
 " '뉴",
 " '느",
 " '는",
 " '늘",
 " '능",
 " '늦",
 " '니",
 " '님",
 " '다",
 " '단",
 " '달",
 " '담",
 " '답",
 " '당",
 " '대",
 "

In [4]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [5]:
lgs = LogisticRegression(class_weight='balanced') 
lgs.fit(X_train, y_train) 

LogisticRegression(class_weight='balanced')

In [6]:
predicted = lgs.predict(X_eval)
print(predicted)
print("Accuracy: %f" % lgs.score(X_eval, y_eval))

['1' '0' '1' ... '0' '1' '0']
Accuracy: 0.977576


In [14]:
test_data = pd.read_csv(DATA_IN_PATH+TEST_CLEAN_DATA, header = 0)
testDataVecs = vectorizer.transform(test_data['sentence'])
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)
#accuracy_score(testDataVecs,test_predicted)
print(lgs.predict_proba(testDataVecs))
print("Accuracy: %f" % lgs.score(testDataVecs, test_predicted))

['1' '1' '1' ... '1' '1' '0']
[[0.13376833 0.86411527 0.0021164 ]
 [0.18380363 0.81363985 0.00255653]
 [0.08750535 0.91061343 0.00188122]
 ...
 [0.46264607 0.53450261 0.00285133]
 [0.33393633 0.66296025 0.00310342]
 [0.92377137 0.07478596 0.00144268]]
Accuracy: 0.000000


In [25]:
# test_data = pd.read_csv(DATA_IN_PATH+TEST_CLEAN_DATA, header = 0)
# sentence = list(test_data['sentence'])
# label = list(test_data['label'])

# X = vectorizer.fit_transform(sentence)
# y = np.array(label)

# X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

# lgs.fit(X_train, y_train) 

# predicted2 = lgs.predict(X_eval)
# #print(test_predicted)
# print("Accuracy: %f" % lgs.score(X_eval, y_eval))

Accuracy: 0.976349


In [None]:
# DATA_OUT_PATH = './data_out/'

# if not os.path.exists(DATA_OUT_PATH):
#     os.makedirs(DATA_OUT_PATH)
    
# ids = list(test_data['id'])
# answer_dataset = pd.DataFrame({'id' : ids, "label" : test_predicted})
# answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)