In [None]:
!pip install sentence_transformers

In [2]:
import re
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

import os

In [3]:
# Capture the kernel's working directory in `current_path` and print it.
print(current_path := os.getcwd())

/content


In [4]:
# One seed value reused wherever the notebook needs a random state.
SEED = 0

# Seed Python's and NumPy's global random generators with that value.
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# Folder holding the input CSVs and receiving the submission file.
# The pieces below are joined by implicit string concatenation; the double
# space inside the last component is part of the folder name.
drive_path = (
    "/content/drive/MyDrive"
    "/데이터 경진대회"
    "/뉴스기사  레이블 복구"
)

In [6]:
# Input and output file locations, all directly under `drive_path`.
data_file_path = f"{drive_path}/news.csv"
sample_submission_file_path = f"{drive_path}/sample_submission.csv"
result_submission_file_path = f"{drive_path}/baseline_submit.csv"

In [7]:
# Read the news table into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data_file_path)
df.head(n=5)

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [8]:
# Join title and contents into a single "<title>:<contents>" string per row,
# then preview the frame with the new column.
df["text"] = df["title"].add(":").add(df["contents"])
df.head(n=5)

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


In [9]:
def _decode_char_ref(match):
    """Return the character encoded by a numeric character reference match (e.g. ``#39;``)."""
    code = int(match.group(1))
    # Control codes and out-of-range values are left as matched so the later
    # cleaning steps handle them exactly as they would any other text.
    return chr(code) if 32 <= code <= 0x10FFFF else match.group(0)


def preprocess_text(text):
    """Clean one raw text string and return it lower-cased.

    Steps, in order: remove URLs, decode numeric character references such as
    ``#39;``, remove hashtags and @mentions, drop non-ASCII characters, remove
    digits, collapse whitespace, lower-case.

    Args:
        text: Raw input string. Any non-string value (e.g. NaN/None coming from
            a missing field) is treated as empty text.

    Returns:
        The cleaned string, with single spaces between tokens and no leading
        or trailing whitespace; ``""`` for non-string input.
    """
    # Missing values are not str; return empty text instead of failing in re.sub.
    if not isinstance(text, str):
        return ""

    # Remove URLs ("https..." is already matched by the "http" alternative).
    text = re.sub(r'http\S+|www\S+', '', text)

    # Decode numeric character references, with or without the leading "&"
    # (the data contains e.g. "#39;" for an apostrophe). This must run before
    # hashtag removal, which would otherwise strip "#39" and leave a stray ";".
    text = re.sub(r'&?#([0-9]+);', _decode_char_ref, text)

    # Remove hashtags.
    text = re.sub(r'#\w+', '', text)

    # Remove mentions.
    text = re.sub(r'@\w+', '', text)

    # Drop every non-ASCII character (removes emoji, but also accented letters).
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace last so the gaps left by the removals above are
    # normalized as well (no double or trailing spaces).
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

In [10]:
# Show the first five combined texts in full, as a plain Python list.
df["text"].head(5).tolist()

['Spanish coach facing action in race row:MADRID (AFP) - Spanish national team coach Luis Aragones faces a formal investigation after Spain #39;s Football Federation decided to open disciplinary proceedings over racist comments about Thierry Henry of France and Arsenal.',
 'Bruce Lee statue for divided city:In Bosnia, where one man #39;s hero is often another man #39;s villain, some citizens have decided to honour one whom Serbs, Croats and Muslims can all look up to - the kung fu great Bruce Lee.',
 "Only Lovers Left Alive's Tilda Swinton Talks About Almost Quitting Acting and Yasmine Hamdan Performs 'Hal' Live In NYC   (HuffPo Exclusive Videos) authors:Yasmine Hamdan performs 'Hal' which she also sings in the film during a scene when two world-weary vampires begin to heal and find a way to continue living as they remember the power and mystery of creation itself.",
 'Macromedia contributes to eBay Stores:Macromedia has announced a special version of its Contribute website editing app

In [11]:
# Run preprocess_text over every combined text and keep the result in its own column.
df["processed_text"] = df["text"].map(preprocess_text)

In [None]:
# Load the pretrained sentence-embedding model identified by this name.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# Encode every row's `text` value in a single call.
# NOTE(review): this encodes the raw `text` column, not `processed_text` —
# confirm that skipping the preprocessing is intentional.
sentence_embeddings = model.encode(df['text'].tolist())
# The same embeddings wrapped in a DataFrame (presumably one row per text — TODO confirm).
# NOTE(review): `df_embeddings` is not referenced by the other visible cells.
df_embeddings = pd.DataFrame(sentence_embeddings)

In [None]:
# Cluster the embeddings into six groups; random_state=SEED fixes the estimator's seed.
# NOTE(review): n_init is left at the library default, which may differ between
# scikit-learn versions — confirm the cluster ids are stable in your environment.
kmeans = KMeans(n_clusters=6, random_state=SEED)
# Fit on the embeddings and store each row's cluster id in `kmeans_cluster`.
df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)

In [14]:
# Peek at the first three texts assigned to cluster 0.
df.loc[df['kmeans_cluster'] == 0, 'text'].head(3)

1     Bruce Lee statue for divided city:In Bosnia, w...
10    Harry #39;s argy-bargy:PRINCE Charles has aske...
14    Obama Administration Helps Wall Street Crimina...
Name: text, dtype: object

In [15]:
# Peek at the first three texts assigned to cluster 1.
df.loc[df['kmeans_cluster'] == 1, 'text'].head(3)

0     Spanish coach facing action in race row:MADRID...
13    GAME DAY PREVIEW Game time: 6:00 PM:CHARLOTTE,...
22    College Basketball: Georgia Tech, UConn Win:AT...
Name: text, dtype: object

In [16]:
# Peek at the first three texts assigned to cluster 2.
df.loc[df['kmeans_cluster'] == 2, 'text'].head(3)

11    Kerry rolls out tax-cut plan for middle class:...
20    Deere's Color Is Green:With big tractors, big ...
39    Ricky Williams ordered to repay Dolphins:Forme...
Name: text, dtype: object

In [17]:
# Peek at the first three texts assigned to cluster 3.
df.loc[df['kmeans_cluster'] == 3, 'text'].head(3)

2    Only Lovers Left Alive's Tilda Swinton Talks A...
6    Time to Talk Baseball:It's time to talk about ...
7    Bump Stock Maker Resumes Sales One Month After...
Name: text, dtype: object

In [18]:
# Peek at the first three texts assigned to cluster 4.
df.loc[df['kmeans_cluster'] == 4, 'text'].head(3)

18    A Fair Way to Choose Candidates for Republican...
25    Be on TOP://www.huffingtonpost.com/entry/be-on...
33    Memo To EPA Chief Pruitt://www.huffingtonpost....
Name: text, dtype: object

In [19]:
# Peek at the first three texts assigned to cluster 5.
df.loc[df['kmeans_cluster'] == 5, 'text'].head(3)

3    Macromedia contributes to eBay Stores:Macromed...
4    Qualcomm plans to phone it in on cellular repa...
5    Thomson to Back Both Blu-ray and HD-DVD:Compan...
Name: text, dtype: object

In [None]:
# Cluster id -> label lookup: the position in the tuple is the KMeans cluster
# id (0..5) and the value at that position is the label assigned to it.
mapping_dict = dict(enumerate((1, 3, 0, 2, 5, 4)))

In [None]:
# Translate every cluster id through mapping_dict; an id missing from the
# dict raises KeyError rather than producing a missing value.
df['mapping'] = df["kmeans_cluster"].map(mapping_dict.__getitem__)

In [None]:
# Load the sample submission file that will be filled in and saved.
sample = pd.read_csv(filepath_or_buffer=sample_submission_file_path)

In [None]:
# Copy the mapped labels over as a plain array, i.e. by row position, not by index.
# NOTE(review): assumes `sample` rows are in the same order as `df` — confirm.
sample['category'] = df['mapping'].to_numpy()

In [None]:
# Save the filled-in submission without the DataFrame index column.
sample.to_csv(path_or_buf=result_submission_file_path, index=False)