## Import

In [1]:
import re
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


## Random Seed

In [2]:
# One seed value reused for every source of randomness configured below.
SEED = 0

# Seed Python's built-in RNG and NumPy's legacy global RNG.
random.seed(SEED)
np.random.seed(SEED)

## Load Data

In [3]:
# Read the news articles from 'news.csv' (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('news.csv')
df.head()

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [4]:
# Title + body: join both fields into a single 'text' column, separated by ' : '.
df = df.assign(text=df['title'] + ' : ' + df['contents'])
df.head()

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row : MADR...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city : In Bosnia,..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores : Macrom...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


## Pre-processing

In [5]:
def preprocess_text(text):
    """Return a lower-cased, ASCII-only, noise-stripped copy of ``text``.

    Steps, in order: drop URLs, hashtags and @mentions; drop every non-ASCII
    character (this is what removes emoji); drop digit runs; collapse all
    whitespace runs to a single space and trim the ends; lower-case.

    Args:
        text: The string to clean. Must be a ``str``; ``re.sub`` raises
            ``TypeError`` for other types (e.g. a missing value).

    Returns:
        The cleaned string. It never contains consecutive, leading or
        trailing spaces.
    """
    # Remove URLs. 'http\S+' already covers 'https...', so no separate branch.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove hashtags.
    # NOTE(review): this also strips the '#39' of '#39;' apostrophe residue
    # seen in the data, leaving a stray ';' - confirm that is acceptable.
    text = re.sub(r'#\w+', '', text)

    # Remove @mentions.
    text = re.sub(r'@\w+', '', text)

    # Remove emoji (and any other non-ASCII character).
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Normalize whitespace LAST: the removals above leave gaps behind
    # ("a 12 b" -> "a  b"), so collapsing earlier would let double or
    # trailing spaces survive into the result.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

In [6]:
# Clean every combined title/body string, one element at a time.
df['processed_text'] = df['text'].map(preprocess_text)

## Feature Extraction

In [7]:
# Load the Sentence-BERT model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Extract text features
# NOTE(review): the raw 'text' column is encoded here, not 'processed_text';
# the cleaned column is not consumed by any cell shown - confirm this is
# intentional before switching inputs (the hand-made cluster-to-category
# mapping further down depends on the current clustering result).
sentence_embeddings = model.encode(df['text'].tolist())

# Store the extracted features in a DataFrame
# (the clustering cell shown below reads `sentence_embeddings` directly)
df_embeddings = pd.DataFrame(sentence_embeddings)

Downloading .gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 588kB/s]
Downloading 1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 63.3kB/s]
Downloading README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 3.54MB/s]
Downloading config.json: 100%|██████████| 573/573 [00:00<00:00, 191kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 58.4kB/s]
Downloading data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 19.6MB/s]
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:01<00:00, 111MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 56.0kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.37MB/s]
Downloading tokenizer_config.json: 100%|██████████| 352/352 [00:00<00:00, 117kB/s]
Downloading train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 4.39MB/s]
Downloading vocab.

## Clustering

In [8]:
from sklearn.mixture import GaussianMixture

In [9]:
# Fit a 6-component Gaussian mixture on the sentence embeddings (seeded for
# repeatability) and store the predicted component label of each row.
gaussian_mix = GaussianMixture(random_state=SEED, n_components=6)
cluster_labels = gaussian_mix.fit_predict(sentence_embeddings)
df['gaussian_mix'] = cluster_labels

## Post-processing

In [10]:
# Display the frame, which now also carries the 'processed_text' and
# 'gaussian_mix' (cluster id) columns.
df

Unnamed: 0,id,title,contents,text,processed_text,gaussian_mix
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row : MADR...,spanish coach facing action in race row : madr...,1
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city : In Bosnia,...","bruce lee statue for divided city : in bosnia,...",2
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive's tilda swinton talks a...,2
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores : Macrom...,macromedia contributes to ebay stores : macrom...,0
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...,0
...,...,...,...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...,"Dolphins Break Through, Rip Rams For First Win...","dolphins break through, rip rams for first win...",1
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...,"After Steep Drop, Price of Oil Rises : The fre...","after steep drop, price of oil rises : the fre...",3
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...,Pro football: Culpepper puts on a show : To sa...,pro football: culpepper puts on a show : to sa...,1
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...,Albertsons on the Rebound : The No. 2 grocer r...,albertsons on the rebound : the no. grocer re...,3


In [22]:
# Cluster 0 samples -> Tech (category 4)
df.loc[df['gaussian_mix'] == 0, 'text'].head(20)

3      Macromedia contributes to eBay Stores : Macrom...
4      Qualcomm plans to phone it in on cellular repa...
5      Thomson to Back Both Blu-ray and HD-DVD : Comp...
23     FTC Files First Lawsuit Against Spyware Concer...
31     Sony PSP Draws Crowds and Lines on First Day (...
41     Photos from MacExpo 2004 : With over 100 exhib...
50     UN Predicts Boom In Robot Labor : The use of r...
52     Amazon's Next Kindle May Have Better Battery L...
73     Delphi, XM Unveil Handheld Satellite Radio Rec...
74     2 Russian Officials Charged In Massive Yahoo H...
79     Suit by Cities Says Microsoft Overcharged : ic...
83     IT Product Guide goes beta : InfoWorld and Ope...
97     Battle of the big games : "Halo 2" for the Xbo...
110    Lan switch sales looking good for 2004 : Despi...
119    Microsoft, Cisco: Seeing Eye-to-Eye on Network...
125    What Apple's U2 Stunt Really Says About the Fu...
136    IBM to use AMD's dual-core Opteron : Upgrade t...
143    E-Card Holiday Virus Pac

In [21]:
# Cluster 1 samples -> Sports (category 3)
df.loc[df['gaussian_mix'] == 1, 'text'].head(15)

0     Spanish coach facing action in race row : MADR...
13    GAME DAY PREVIEW Game time: 6:00 PM : CHARLOTT...
22    College Basketball: Georgia Tech, UConn Win : ...
26    Doping case was flawed, report finds : MONTREA...
30    Montgomerie Beats Woods in S. Korean Skins (AP...
38    Longhorns Rip Cowboys : Cedric Benson scores f...
39    Ricky Williams ordered to repay Dolphins : For...
42    Thomas out of Six Nations : Wales captain Gare...
43    Tavarez Breaks Hand After Leaving Game 4 (AP) ...
44    Golf: Cink in command in winning NEC Invitatio...
46    Official: Cuper Signs With Mallorca : Hector C...
48    Stove burns hot in Seattle : The Mariners, who...
72    Up to the Challenge : The Redskins contend tha...
88    Expos Skipper Robinson OKs One-Year Deal (AP) ...
94    Jordan: Ford decision was knife through the he...
Name: text, dtype: object

In [23]:
# A further positional slice (rows 28-34) of cluster 1, to look beyond the
# first few samples.
df.loc[df['gaussian_mix'] == 1, 'text'].iloc[28:35]

165    F1: Move put on hold : Jenson Button is still ...
175    Finalists Named for O'Brien Award : FORT WORTH...
179    QB controversy? Give Bowden few minutes : Flor...
180    TENNIS NOTEBOOK US team hopes to be more than ...
184    Huskies face history against Cal, Tedford : Th...
189    Consider Spain a Davis Cup certainty : Spain c...
192    James pounds Pistons : If LeBron James was sen...
Name: text, dtype: object

In [24]:
# Cluster 2 samples -> Entertainment (category 1)
df.loc[df['gaussian_mix'] == 2, 'text'].head(15)

1     Bruce Lee statue for divided city : In Bosnia,...
2     Only Lovers Left Alive's Tilda Swinton Talks A...
6     Time to Talk Baseball : It's time to talk abou...
10    Harry #39;s argy-bargy : PRINCE Charles has as...
16    Fischer's Fiancee: Marriage Plans Genuine (AP)...
20    Deere's Color Is Green : With big tractors, bi...
21    Blake Leeper Wants to Be the First American Pa...
24    Kentucky Fan Gets National Champs Tattoo. Let'...
25    Be on TOP : //www.huffingtonpost.com/entry/be-...
28    Cate Blanchett Set To Star As Lucille Ball In ...
37    Deep Impact Space Probe Aims to Slam Into Come...
40    Out for V-I-C-T-O-R-Y, but Missing Tiles : Mis...
45    The Trouble with Broadcasting in a Social Worl...
62    John Waters' Women at the Film Society of Linc...
64    Jon Voight Is 'Concerned' About Daughter Angel...
Name: text, dtype: object

In [26]:
# Cluster 3 samples -> Business (category 0)
df.loc[df['gaussian_mix'] == 3, 'text'].head(15)

7      Bump Stock Maker Resumes Sales One Month After...
27     Kmart-Sears merger about price, quality : Aver...
49     Bribery Considered, Halliburton Notes Suggest ...
51     Oil Falls Below \$49 on Nigeria Cease-Fire : L...
70     ABN Amro Profit Rises, Buoyed by Sale of Asia ...
85     Stocks to Open Higher on Growth Outlook : NEW ...
93     Oracle wins PeopleSoft takeover case : The lon...
98     Producer Prices Up 0.1 Pct, Energy Drops (Reut...
99     Rigel, Merck Form Development Partnership : NE...
100    GM, DaimlerChrysler to develop hybrid engines ...
102    A Lot Of Managers Want to Raise The Minimum Wa...
103    Nortel attempts to calm fears of possible stoc...
105    Prices climb after bombing spree : LONDON: Wor...
111    Oil rallies to new record high : Crude oil fut...
120    BA cancels 1,000 Heathrow flights : British Ai...
Name: text, dtype: object

In [27]:
# Cluster 4 samples -> World (remap to category 5)
df.loc[df['gaussian_mix'] == 4, 'text'].head(15)

29    Israel Kills 3 Palestinians in Big Gaza Incurs...
56    Sadr #39;s aide denies entering of Iraqi polic...
57    Former Nazi Guard Loses Canadian Court Ruling ...
59    Afghanistan Death Toll in 2004 Up to 957 : KAN...
60    Portugal PM, Cabinet Submit Resignations : LIS...
61    Typhoon-Like Gusts Hit Japan; 13 Injured : TOK...
63    Family appeals for release of UK hostage : The...
69    Fallujah situation  #39;disastrous #39;, chari...
71    Germany Extends Afghan Mission : The German Pa...
75    Report Expected to Blame 24 in Iraq Abuse : WA...
77    Poland to Cut One-Third of Its Troops in Iraq ...
78    Burundi Agrees to Move Tutsi Refugees to Safet...
82    Australia investigating if Iraq body could be ...
86    Colombian militia leader dead : Bogota, Colomb...
87    New Iraq mission for Black Watch : Troops from...
Name: text, dtype: object

In [28]:
# Cluster 5 samples -> Politics (category 2)
df.loc[df['gaussian_mix'] == 5, 'text'].head(15)

8     Obama Marks Anniversary Of 9/11 Attacks With M...
9     Republican Congressman Says Trump Should Apolo...
11    Kerry rolls out tax-cut plan for middle class ...
12    Read Live Updates From The South Carolina Demo...
14    Obama Administration Helps Wall Street Crimina...
15    It's Not As Easy As You Think To Spot A Gerrym...
17    Parents Of School Shooting Victims Decry 'Moro...
18    A Fair Way to Choose Candidates for Republican...
19    Congress Spikes Handout For Private Equity aut...
32    Sunday Show Hosts Hit Back On Trump Administra...
33    Memo To EPA Chief Pruitt : //www.huffingtonpos...
34    The Folly of the Sole Superpower Writ Small au...
35    Is E-Voting Secure? : (CBS) Nearly one third o...
36    Agencies Postpone Issuing New Rules Until Afte...
47    Bush author says White House applying pressure...
Name: text, dtype: object

In [30]:
# Gaussian-mixture cluster id -> submission category id, chosen by reading
# sample articles from each cluster.
mapping_dict_gm = {
    0: 4,  # Tech
    1: 3,  # Sports
    2: 1,  # Entertainment
    3: 0,  # Business
    4: 5,  # World
    5: 2,  # Politics
}

In [31]:
# Translate each cluster id into its category id; an unmapped id raises KeyError.
df['mapping'] = df['gaussian_mix'].map(lambda cluster_id: mapping_dict_gm[cluster_id])

## Submission

In [32]:
# Read the submission template from 'sample_submission.csv'; its 'category'
# column is filled in by the next cell.
sample = pd.read_csv('sample_submission.csv')

In [33]:
# Copy the mapped category ids over as a plain array, so they are assigned by
# row position rather than aligned on index labels.
sample['category'] = df['mapping'].to_numpy()
sample['category'].head()

0    3
1    1
2    1
3    4
4    4
Name: category, dtype: int64

In [34]:
# Write the submission file; index=False keeps the row index out of the CSV.
sample.to_csv('submit_231129_2.csv', index=False)