### <기사 본문> 
#### 각 기사 본문의 TF-IDF weight top20 단어 중 몇 %가 bait 기사의 top50 단어에 포함되는가
* top_10_words -> 상관관계 : 21
* top_20_words / top_20_words -> 상관관계: 29

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: compute per-article word weights with TF-IDF.
# The vocabulary is capped via max_features.
TFIDF_MAX_FEATURES = 1000
article_texts = data['int_Content']

vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
tfidf_matrix = vectorizer.fit_transform(article_texts)
feature_names = vectorizer.get_feature_names_out()

# Step 2: extract the highest-weighted words of each article (top 20 by default)
def get_top_words(row, feature_names, top_n=20):
    """Return the words with the largest weights in a single TF-IDF row.

    Args:
        row: A one-row sparse matrix (anything exposing ``toarray()``) holding
            the weights of one article.
        feature_names: Sequence mapping a column index to its word.
        top_n: Maximum number of words to return.

    Returns:
        A list of at most ``top_n`` words ordered from highest to lowest
        weight. Words whose weight is not positive are left out, so an article
        with fewer than ``top_n`` weighted terms yields a shorter list rather
        than being padded with words it does not contain.
    """
    # A slice of [-0:] would select the whole vocabulary, so handle it up front.
    if top_n <= 0:
        return []

    weights = row.toarray().flatten()
    ranked = weights.argsort()[::-1][:top_n]
    # Drop zero-weight columns: they carry no ranking information and their
    # order within the argsort result is arbitrary.
    return [feature_names[i] for i in ranked if weights[i] > 0]

# Top-weighted words of every article, one list per row of the TF-IDF matrix.
data['top_20_words'] = [
    get_top_words(tfidf_matrix[row_idx], feature_names)
    for row_idx in range(tfidf_matrix.shape[0])
]

# Step 3: the 50 vocabulary words with the largest summed TF-IDF weight over
# clickbait articles (rows where clickbaitClass == 0).
is_clickbait = (data['clickbaitClass'] == 0).to_numpy()
clickbait_articles = data[is_clickbait]
# tfidf_matrix was built from data['int_Content'] in row order, so the rows of
# the clickbait articles can be selected with the mask instead of running the
# texts through the vectorizer a second time.
clickbait_matrix = tfidf_matrix[is_clickbait]
clickbait_word_sums = clickbait_matrix.sum(axis=0).A1
top_50_clickbait_words = [feature_names[i] for i in clickbait_word_sums.argsort()[-50:][::-1]]

# Step 4: for each article, measure how many of its top words also appear
# among the top clickbait words.
def calculate_overlap_percentage(top_words, clickbait_words):
    """Return the percentage of ``top_words`` that also occur in ``clickbait_words``.

    The number of distinct shared words is divided by ``len(top_words)`` and
    scaled to 0-100. An empty ``top_words`` yields 0.
    """
    if not top_words:
        return 0
    shared_words = set(top_words).intersection(clickbait_words)
    return (len(shared_words) / len(top_words)) * 100

# Score every article: share (in %) of its top words found in the clickbait top-50 list.
data['bait_top50_percentage'] = data['top_20_words'].apply(
    calculate_overlap_percentage, args=(top_50_clickbait_words,)
)

# Preview the result
preview_columns = ['int_Content', 'top_20_words', 'bait_top50_percentage']
data[preview_columns].head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling
sns.set(style="whitegrid")

# Human-readable class names for the legend. The meaning of each value follows
# the filter used when building top_50_clickbait_words, where clickbaitClass == 0
# is treated as clickbait.
# NOTE(review): confirm the 0/1 meaning against the dataset schema.
class_labels = {0: 'Clickbait (0)', 1: 'Non-Clickbait (1)'}
plot_data = data[['bait_top50_percentage']].assign(
    **{'Clickbait Class': data['clickbaitClass'].map(class_labels)}
)

# Compare the distributions of the two classes with overlaid histograms.
# The legend is left to seaborn: overriding it with plt.legend(labels=[...])
# pairs the texts with artists in draw order rather than hue order, which can
# attach a label to the wrong color.
plt.figure(figsize=(12, 6))
sns.histplot(
    plot_data,
    x='bait_top50_percentage',
    hue='Clickbait Class',
    hue_order=list(class_labels.values()),
    kde=True,
    bins=30,
    palette='Set2',
    alpha=0.7,
)
plt.title('Distribution of bait_top50_percentage by Clickbait Class')
plt.xlabel('Percentage of Top 50 Words Overlap')
plt.ylabel('Frequency')
plt.show()