# In [None]:

import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
from collections import Counter
import numpy as np
import os
from nltk.corpus import stopwords
import nltk
from konlpy.tag import Okt
import scipy.stats as stats
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, f1_score, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, csr_matrix

## 2. Data loading and preprocessing

# JSON data loading function
def read_json(folder_path):
    """Load every ``.json`` file directly inside *folder_path* into a DataFrame.

    Each file must contain one JSON object with a ``sourceDataInfo`` mapping
    that provides the keys ``newsTitle``, ``newsContent`` and ``useType``.

    Args:
        folder_path: Directory to scan. Sub-directories are not visited.

    Returns:
        A DataFrame with one row per file and the columns ``newsTitle``,
        ``newsContent`` and ``clickbaitClass`` (the file's ``useType`` value).
        If the folder has no ``.json`` files the frame has zero rows but still
        carries those three columns.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        json.JSONDecodeError: If a file is not valid JSON.
        KeyError: If a file lacks ``sourceDataInfo`` or one of the three keys.
    """
    columns = ["newsTitle", "newsContent", "clickbaitClass"]
    records = []
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore any seeded split done later) reproducible across machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Binary mode lets the json module detect UTF-8 (with or without BOM),
        # UTF-16 or UTF-32 itself instead of depending on the platform's
        # default text encoding, which is not UTF-8 on every system.
        with open(file_path, 'rb') as file:
            source_info = json.load(file)["sourceDataInfo"]
        records.append({
            "newsTitle": source_info["newsTitle"],
            "newsContent": source_info["newsContent"],
            "clickbaitClass": source_info["useType"],
        })
    # Passing the column list keeps the schema stable even when no file matched.
    return pd.DataFrame(records, columns=columns)

# Labeled-data folders to load, given relative to the current working
# directory. NOTE(review): the backslash separators only resolve on Windows.
folder_paths = [
    r'.\Training\02.라벨링데이터\TL_Part1_Clickbait_Auto_SO',
    r'.\Training\02.라벨링데이터\TL_Part1_Clickbait_Direct_SO',
    r'.\Training\02.라벨링데이터\TL_Part1_NonClickbait_Auto_SO'
]

# Merge the folders into one DataFrame.
# ignore_index=True renumbers the rows 0..n-1. Without it every folder's frame
# keeps its own 0-based index, so the combined frame would contain duplicate
# row labels and label-based lookups would match several rows at once.
frames = [read_json(folder) for folder in folder_paths]
df_news = pd.concat(frames, ignore_index=True)

# Korean stop-word loader
def load_stopwords(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        file_path: Path of the stop-word file, one entry per line.

    Returns:
        The file's lines in order, without line terminators. Blank lines are
        returned as empty strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

# Stop-word file is looked up relative to the current working directory.
stopwords_path = "stopwords-ko.txt"
korean_stopwords = load_stopwords(file_path=stopwords_path)

# Text preprocessing function
def preprocess_text(text, stop_words):
    """Keep only Hangul syllables from *text* and drop stop words.

    Every character that is neither in the range '가'-'힣' nor whitespace is
    deleted (not replaced by a space, so "가,나" becomes the single token
    "가나"). The remainder is split on whitespace, tokens found in
    *stop_words* are discarded, and the rest are joined with single spaces.

    Args:
        text: The string to clean.
        stop_words: Collection of tokens to remove.

    Returns:
        The cleaned text; an empty string if no token survives.
    """
    # Membership in a list/tuple is a linear scan per token. Building a hash
    # set once per call makes each lookup constant-time with the same result.
    if isinstance(stop_words, (list, tuple)):
        try:
            stop_words = frozenset(stop_words)
        except TypeError:
            # Unhashable entries: keep the original sequence and its
            # equality-based lookup.
            pass
    hangul_only = re.sub(r'[^가-힣\s]', '', text)
    return ' '.join(word for word in hangul_only.split() if word not in stop_words)

# Clean titles and article bodies.
# The stop words are converted to a set once here so that the per-token
# membership test inside preprocess_text is a hash lookup rather than a scan
# of the whole list repeated for every row.
_stopword_set = set(korean_stopwords)
df_news['cleanedTitle'] = df_news['newsTitle'].apply(preprocess_text, args=(_stopword_set,))
df_news['cleanedContent'] = df_news['newsContent'].apply(preprocess_text, args=(_stopword_set,))

# Add text-length features (character counts of the cleaned strings)
df_news['titleLength'] = df_news['cleanedTitle'].apply(len)
df_news['contentLength'] = df_news['cleanedContent'].apply(len)


## 3. Text analysis and statistics


# TF-IDF vectorization of the cleaned article bodies
# (no similarity computation is performed in the lines shown here).
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so the test rows contribute to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_news['cleanedContent'])
## 4. Model training and evaluation

# Combine the TF-IDF columns with the two length features
numeric_features = df_news[['titleLength', 'contentLength']].to_numpy()
tfidf_numeric_features = hstack([tfidf_matrix, numeric_features])


# Split the data: 20% held out for testing, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_numeric_features,
    df_news['clickbaitClass'],
    test_size=0.2,
    random_state=42,
)