## Import

In [1]:
import re
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


## Random Seed

In [2]:
# One seed value reused by every seeded step below (also passed as
# random_state to the clustering models).
SEED = 0

# Seed both global generators: the standard library's and NumPy's.
random.seed(SEED)
np.random.seed(SEED)

## Load Data

In [3]:
# Read the news articles from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='news.csv')

# Show the first five rows as a sanity check.
df.head(5)

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [4]:
# Title + contents: join both fields into one string per article,
# separated by ' : '.
df['text'] = df['title'].add(' : ').add(df['contents'])
df.head(5)

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row : MADR...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city : In Bosnia,..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores : Macrom...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


## Pre-processing

In [5]:
def preprocess_text(text):
    """Clean a raw text string.

    Steps, in order: remove URLs, hashtags and @mentions, drop every
    non-ASCII character, delete digit runs, collapse whitespace and
    lower-case the result.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Lower-cased ASCII text in which words are separated by single
        spaces, with no leading or trailing whitespace.
    """
    # Remove URLs. `http\S+` already matches `https...` links, so a separate
    # https branch is unnecessary; no anchors are used, so no MULTILINE flag.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove hashtags ('#' followed by word characters).
    text = re.sub(r'#\w+', '', text)

    # Remove mentions ('@' followed by word characters).
    text = re.sub(r'@\w+', '', text)

    # Remove emoji. Note this drops *every* non-ASCII character, so accented
    # letters are removed as well.
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace LAST. The removals above can leave double spaces
    # or a dangling leading/trailing space, so normalizing before them (as
    # the previous version did for digits) let those gaps leak into the output.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

In [6]:
# Run the cleaning function over every combined title/contents string.
df['processed_text'] = df['text'].map(preprocess_text)

## Feature Extraction

In [7]:
# Load the Sentence-BERT model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Extract text features.
# NOTE(review): the raw `text` column is encoded here, not `processed_text`.
sentence_embeddings = model.encode(df['text'].to_list())

# Keep the extracted features in a DataFrame as well.
df_embeddings = pd.DataFrame(data=sentence_embeddings)

## Clustering

In [8]:
from sklearn.mixture import GaussianMixture

In [9]:
# Fit a 6-component Gaussian mixture on the embeddings and store each row's
# assigned component in the frame.
gaussian_mix = GaussianMixture(
    n_components=6,
    random_state=SEED,
)
df['gaussian_mix'] = gaussian_mix.fit_predict(X=sentence_embeddings)

In [8]:
# Cluster the Sentence-BERT embeddings with KMeans.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, and a different number of restarts can yield a
# different cluster numbering, which would silently invalidate the
# hand-written cluster -> category mapping used later in this notebook.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=SEED)

df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)



## Post-processing

In [14]:
# First 10 texts in cluster 0, noted as Business (category 0).
df.loc[df['kmeans_cluster'] == 0, 'text'].head(10)

7     Bump Stock Maker Resumes Sales One Month After...
19    Congress Spikes Handout For Private Equity aut...
20    Deere's Color Is Green : With big tractors, bi...
27    Kmart-Sears merger about price, quality : Aver...
51    Oil Falls Below \$49 on Nigeria Cease-Fire : L...
70    ABN Amro Profit Rises, Buoyed by Sale of Asia ...
85    Stocks to Open Higher on Growth Outlook : NEW ...
93    Oracle wins PeopleSoft takeover case : The lon...
98    Producer Prices Up 0.1 Pct, Energy Drops (Reut...
99    Rigel, Merck Form Development Partnership : NE...
Name: text, dtype: object

In [15]:
# First 10 texts in cluster 1.
df.loc[df['kmeans_cluster'] == 1, 'text'].head(10)

2     Only Lovers Left Alive's Tilda Swinton Talks A...
10    Harry #39;s argy-bargy : PRINCE Charles has as...
16    Fischer's Fiancee: Marriage Plans Genuine (AP)...
21    Blake Leeper Wants to Be the First American Pa...
24    Kentucky Fan Gets National Champs Tattoo. Let'...
25    Be on TOP : //www.huffingtonpost.com/entry/be-...
28    Cate Blanchett Set To Star As Lucille Ball In ...
45    The Trouble with Broadcasting in a Social Worl...
62    John Waters' Women at the Film Society of Linc...
64    Jon Voight Is 'Concerned' About Daughter Angel...
Name: text, dtype: object

In [16]:
# First 10 texts in cluster 2, noted as Tech (category 4).
df.loc[df['kmeans_cluster'] == 2, 'text'].head(10)

3     Macromedia contributes to eBay Stores : Macrom...
4     Qualcomm plans to phone it in on cellular repa...
5     Thomson to Back Both Blu-ray and HD-DVD : Comp...
23    FTC Files First Lawsuit Against Spyware Concer...
31    Sony PSP Draws Crowds and Lines on First Day (...
35    Is E-Voting Secure? : (CBS) Nearly one third o...
40    Out for V-I-C-T-O-R-Y, but Missing Tiles : Mis...
41    Photos from MacExpo 2004 : With over 100 exhib...
50    UN Predicts Boom In Robot Labor : The use of r...
52    Amazon's Next Kindle May Have Better Battery L...
Name: text, dtype: object

In [18]:
# First 10 texts in cluster 3, noted as World (category 5).
df.loc[df['kmeans_cluster'] == 3, 'text'].head(10)

1     Bruce Lee statue for divided city : In Bosnia,...
29    Israel Kills 3 Palestinians in Big Gaza Incurs...
34    The Folly of the Sole Superpower Writ Small au...
37    Deep Impact Space Probe Aims to Slam Into Come...
56    Sadr #39;s aide denies entering of Iraqi polic...
57    Former Nazi Guard Loses Canadian Court Ruling ...
59    Afghanistan Death Toll in 2004 Up to 957 : KAN...
60    Portugal PM, Cabinet Submit Resignations : LIS...
61    Typhoon-Like Gusts Hit Japan; 13 Injured : TOK...
63    Family appeals for release of UK hostage : The...
Name: text, dtype: object

In [19]:
# First 10 texts in cluster 4, noted as Politics (category 2).
df.loc[df['kmeans_cluster'] == 4, 'text'].head(10)

8     Obama Marks Anniversary Of 9/11 Attacks With M...
9     Republican Congressman Says Trump Should Apolo...
11    Kerry rolls out tax-cut plan for middle class ...
12    Read Live Updates From The South Carolina Demo...
14    Obama Administration Helps Wall Street Crimina...
15    It's Not As Easy As You Think To Spot A Gerrym...
17    Parents Of School Shooting Victims Decry 'Moro...
18    A Fair Way to Choose Candidates for Republican...
32    Sunday Show Hosts Hit Back On Trump Administra...
33    Memo To EPA Chief Pruitt : //www.huffingtonpos...
Name: text, dtype: object

### Sports: 5 -> 3

In [20]:
# First 10 texts in cluster 5, noted as Sports (category 3).
df.loc[df['kmeans_cluster'] == 5, 'text'].head(10)

0     Spanish coach facing action in race row : MADR...
6     Time to Talk Baseball : It's time to talk abou...
13    GAME DAY PREVIEW Game time: 6:00 PM : CHARLOTT...
22    College Basketball: Georgia Tech, UConn Win : ...
26    Doping case was flawed, report finds : MONTREA...
30    Montgomerie Beats Woods in S. Korean Skins (AP...
38    Longhorns Rip Cowboys : Cedric Benson scores f...
39    Ricky Williams ordered to repay Dolphins : For...
42    Thomas out of Six Nations : Wales captain Gare...
43    Tavarez Breaks Hand After Leaving Game 4 (AP) ...
Name: text, dtype: object

### Mapping

In [21]:
# KMeans cluster id -> submission category id.
# Topic labels are taken from the inline notes on the cluster-inspection
# cells; cluster 1 carries no note there.
mapping_dict = dict([
    (0, 0),  # Business
    (1, 1),
    (2, 4),  # Tech
    (3, 5),  # World
    (4, 2),  # Politics
    (5, 3),  # Sports
])

In [22]:
# Translate each cluster id to its category id; an id missing from
# mapping_dict raises KeyError.
df['mapping'] = df['kmeans_cluster'].map(mapping_dict.__getitem__)

## Submission

In [23]:
# Load the submission template.
sample = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [24]:
# Copy the mapped categories into the template by position: the raw array
# is assigned, so no index alignment takes place.
sample['category'] = df['mapping'].to_numpy()
sample['category'].head(5)

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [34]:
# Write the submission file without the DataFrame index column.
sample.to_csv(path_or_buf='submit_231129_1.csv', index=False)