In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
import pickle
import json

# Progress-reporting helper.
def report_progress(message):
    """Print ``message`` to standard output, prefixed with ``[INFO]``.

    Args:
        message: Text (or any object) to interpolate into the status line.
    """
    status_line = f"[INFO] {message}"
    print(status_line)

# Fetch the NLTK resources used below: the stop-word corpus and the
# 'punkt' tokenizer models.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Path to the reviews CSV file.
file_path = '/Users/eunma/Documents/GitHub/kakaobootcamp/personal mission/NLP pipeline/data/Reviews.csv'

# Load the raw data.
report_progress("Loading data...")
data = pd.read_csv(file_path)
report_progress("Data loaded successfully.")

# Explore the data.
report_progress("Exploring data...")
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
data.info()

# Keep only the columns needed for modelling.
data = data[['Text', 'Score']]
report_progress("Selected necessary columns.")

# Drop rows with missing values. Build a new, independent frame instead of
# mutating the column subset in place: in-place edits (and the later column
# assignment) on a subset can trigger pandas' SettingWithCopyWarning.
data = data.dropna().copy()
report_progress("Removed missing values.")

# Review-text preprocessing.

# Cache for the English stop-word set; filled on first use so the corpus is
# read once instead of once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a review string for vectorization.

    Steps: lowercase, strip digit runs, strip every character that is
    neither a word character nor whitespace, tokenize with NLTK's
    ``word_tokenize``, and drop English stop words.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    # Set membership is O(1); the original rebuilt and scanned the full
    # stop-word list for every single token.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Run the preprocessing function over every review and store the result
# in a new 'processed_text' column.
report_progress("Preprocessing text...")
raw_reviews = data['Text']
cleaned_reviews = raw_reviews.apply(preprocess_text)
data['processed_text'] = cleaned_reviews
print(data.head())
report_progress("Text preprocessing completed.")

# Binarize the labels (positive: 4, 5; negative: 1, 2, 3).
y = data['Score']
y = y.apply(lambda x: 1 if x > 3 else 0)

# Split the *text* first. Fitting TF-IDF on the full corpus before splitting
# would let the test reviews influence the vocabulary and IDF weights
# (data leakage), inflating the reported test metrics.
report_progress("Splitting data into train and test sets...")
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], y, test_size=0.2, random_state=42
)
report_progress("Data splitting completed.")

# TF-IDF vectorization: learn the vocabulary/IDF on the training text only,
# then apply the fitted transform to the held-out text.
report_progress("Vectorizing text using TF-IDF...")
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
report_progress("Text vectorization completed.")

# Hyperparameter grid searched below.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],  # values tried for the C parameter
    penalty=['l1', 'l2'],       # penalty types
    solver=['liblinear'],       # optimization algorithm
)

# Base logistic-regression estimator.
log_reg = LogisticRegression(max_iter=1000)

# Configure a 5-fold, accuracy-scored grid search using all CPU cores.
report_progress("Starting grid search for hyperparameter tuning...")
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)

# Fit every parameter combination on the training split.
grid_search.fit(X_train, y_train)
report_progress("Grid search completed.")

# Show the winning hyperparameter combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict on the held-out split with the best estimator found.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix.
report_progress("Evaluating model performance...")
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Save the trained model.
model_filename = 'best_logistic_regression_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)
report_progress(f"Trained model saved as {model_filename}.")

# Save the fitted TF-IDF vectorizer as well: the classifier was trained on
# its feature space, so new text cannot be scored without the same fitted
# vocabulary and IDF weights.
vectorizer_filename = 'tfidf_vectorizer.pkl'
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)
report_progress(f"Fitted vectorizer saved as {vectorizer_filename}.")

# Save the processed data (one JSON record per line).
data_filename = 'processed_data.json'
data.to_json(data_filename, orient='records', lines=True)
report_progress(f"Processed data saved as {data_filename}.")

[nltk_data] Downloading package stopwords to /Users/eunma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eunma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[INFO] Loading data...
[INFO] Data loaded successfully.
[INFO] Exploring data...
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  