In [32]:
from google.colab import drive
drive.mount("/content/drive")

import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# Project root on the mounted drive; every data path below is derived from it.
base_path = "/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/"

# Notebook-wide settings, grouped in a single namespace object.
args = Namespace(
    # Input CSV holding the raw news rows.
    raw_dataset_csv=f"{base_path}data/ag_news/news.csv",
    # Fractions of each category assigned to the train / val / test splits.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Output CSV: the same rows plus a "split" column.
    output_munged_csv=f"{base_path}data/ag_news/news_with_splits.csv",
    # Seed passed to NumPy's global RNG before shuffling.
    seed=1337,
)

In [5]:
# If you are running on Colab, run the code below to download the preprocessed light version of the data.
!wget https://git.io/Jt1NH -O /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/download.py
!wget https://git.io/Jt1NS -O /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/get-all-data.sh
!chmod 755 /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/get-all-data.sh
%cd /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data
!./get-all-data.sh
%cd ..

--2024-03-16 03:38:45--  https://git.io/Jt1NH
Resolving git.io (git.io)... 140.82.113.22
Connecting to git.io (git.io)|140.82.113.22|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_5/5_3_doc_classification/data/download.py [following]
--2024-03-16 03:38:45--  https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_5/5_3_doc_classification/data/download.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1572 (1.5K) [text/plain]
Saving to: ‘/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/download.py’


2024-03-16 03:38:45 (3.86 MB/s) - ‘/content/drive/MyDrive/Github_NLP/NLP_with_PyTorc

In [34]:
# Load the raw dataset; header=0 takes the column names from the first row of the file.
news = pd.read_csv(args.raw_dataset_csv, header=0)

In [35]:
# Preview the first rows of the raw frame.
print(news.head())

   category                                              title
0  Business  Wall St. Bears Claw Back Into the Black (Reuters)
1  Business  Carlyle Looks Toward Commercial Aerospace (Reu...
2  Business    Oil and Economy Cloud Stocks' Outlook (Reuters)
3  Business  Iraq Halts Oil Exports from Main Southern Pipe...
4  Business  Oil prices soar to all-time record, posing new...


In [36]:
# Show the distinct values of the category column.
print(set(news.category))

{'World', 'Business', 'Sports', 'Sci/Tech'}


In [37]:
# Group the rows by category: each key maps to a list of row dicts
# (one key per DataFrame column) in the original row order.
# to_dict(orient="records") converts the whole frame in one call instead of
# materialising a Series per row the way iterrows() does.
by_category = collections.defaultdict(list)
for record in news.to_dict(orient="records"):
  by_category[record["category"]].append(record)

In [38]:
# Number of distinct category keys collected above.
print(len(by_category))

4


In [39]:
# Print each category together with its first five row dicts.
for key, value in by_category.items():
  print(key, value[:5])

# NOTE(review): the triple-quoted string below is not a comment. As the last
# expression of the cell it is evaluated and echoed back as the cell's output.
"""
Dictionary Structure

"Business" : [{"category": "Business", "title": "~~~"}, {}, {}]
...
"World" : [{}, {}, {}]

"""

Business [{'category': 'Business', 'title': 'Wall St. Bears Claw Back Into the Black (Reuters)'}, {'category': 'Business', 'title': 'Carlyle Looks Toward Commercial Aerospace (Reuters)'}, {'category': 'Business', 'title': "Oil and Economy Cloud Stocks' Outlook (Reuters)"}, {'category': 'Business', 'title': 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters)'}, {'category': 'Business', 'title': 'Oil prices soar to all-time record, posing new menace to US economy (AFP)'}]
Sci/Tech [{'category': 'Sci/Tech', 'title': "'Madden,' 'ESPN' Football Score in Different Ways (Reuters)"}, {'category': 'Sci/Tech', 'title': 'Group to Propose New High-Speed Wireless Format (Reuters)'}, {'category': 'Sci/Tech', 'title': 'AOL to Sell Cheap PCs to Minorities and Seniors (Reuters)'}, {'category': 'Sci/Tech', 'title': 'Companies Approve New High-Capacity Disc Format (Reuters)'}, {'category': 'Sci/Tech', 'title': 'Missing June Deals Slow to Return for Software Cos. (Reuters)'}]
Sports [{'category'

'\nDictionary Structure\n\n"Business" : [{"category": "Business", "title": "~~~"}, {}, {}]\n...\n"World" : [{}, {}, {}]\n\n'

In [40]:
# Build the flat list of rows, each tagged with its "split" label.
final_list = []
np.random.seed(args.seed)

# sorted(by_category.items()) yields (category, rows) tuples ordered by
# category name, so the seeded RNG is consumed in the same order on every run.
for _, item_list in sorted(by_category.items()):
  # Shuffle a copy: shuffling by_category's own lists in place would make a
  # re-run of this cell start from already-shuffled data and produce a
  # different split even though the seed is reset above.
  shuffled = list(item_list)
  np.random.shuffle(shuffled)

  n = len(shuffled)
  n_train = int(args.train_proportion * n)
  n_val = int(args.val_proportion * n)

  # Train and val sizes are rounded down; everything left over becomes test,
  # so no row is dropped.
  partitions = (
      ("train", shuffled[:n_train]),
      ("val", shuffled[n_train:n_train + n_val]),
      ("test", shuffled[n_train + n_val:]),
  )
  for split_name, rows in partitions:
    # extend() appends the individual items of the iterable rather than the
    # iterable itself; each row is copied so by_category's dicts stay unchanged.
    final_list.extend({**row, "split": split_name} for row in rows)

In [41]:
# Show the first five labelled rows.
print(final_list[:5])

# Structure of final_list.
# NOTE(review): the triple-quoted string below is an expression, not a comment;
# as the last expression of the cell it is echoed back as the cell's output.
"""
[
  {"category": "~~", "title": "~~", "split": "~~"},
  {},
  {},
  ...
  {}
]
"""

[{'category': 'Business', 'title': 'Jobs, tax cuts key issues for Bush', 'split': 'train'}, {'category': 'Business', 'title': 'Jarden Buying Mr. Coffee #39;s Maker', 'split': 'train'}, {'category': 'Business', 'title': 'Retail sales show festive fervour', 'split': 'train'}, {'category': 'Business', 'title': "Intervoice's Customers Come Calling", 'split': 'train'}, {'category': 'Business', 'title': 'Boeing Expects Air Force Contract', 'split': 'train'}]


'\n[\n  {"category": "~~", "title": "~~", "split": "~~"},\n  {},\n  {},\n  ...\n  {}\n]\n'

In [42]:
# Turn the list of row dicts into a DataFrame (one row per dict).
final_news = pd.DataFrame(final_list)

In [44]:
# Preview both ends of the frame.
print(final_news.head())
print(final_news.tail())

   category                                 title  split
0  Business    Jobs, tax cuts key issues for Bush  train
1  Business  Jarden Buying Mr. Coffee #39;s Maker  train
2  Business     Retail sales show festive fervour  train
3  Business   Intervoice's Customers Come Calling  train
4  Business     Boeing Expects Air Force Contract  train
       category                                              title split
119995    World          Genesis Space Capsule Crashes Into Desert  test
119996    World           U.S.: Too Early to Tell Iraq Unit's Fate  test
119997    World                 AFGHAN OPIUM GROWING UP TWO THIRDS  test
119998    World  At least one Saudi policeman killed in clashes...  test
119999    World                 U.S. Forces Claim Most of Fallujah  test


In [45]:
# Count how many rows fall into each value of the "split" column.
final_news.split.value_counts()

train    84000
val      18000
test     18000
Name: split, dtype: int64

In [47]:
# Preprocess the news titles
def preprocess_text(text):
    """Normalize a raw title into lowercase, space-separated tokens.

    Steps:
      1. Lowercase the text.
      2. Surround each of ``.``, ``,``, ``!`` and ``?`` with spaces so it
         becomes a separate token.
      3. Replace every run of characters other than ASCII letters and those
         four punctuation marks (digits, symbols, whitespace) with one space.
      4. Strip leading and trailing whitespace, so a later split on " " does
         not produce empty-string tokens.

    Args:
        text (str): the raw title.

    Returns:
        str: the cleaned title.
    """
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    # Also collapses runs of spaces, since a space is not in the allowed set.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text.strip()

# Clean every title; this overwrites the raw titles held in final_news.
final_news.title = final_news.title.apply(preprocess_text)

In [61]:
# Print the first 30 titles, one per line, to eyeball the cleaning result.
for position in range(30):
  print(final_news.title.iloc[position])

jobs , tax cuts key issues for bush
jarden buying mr . coffee s maker
retail sales show festive fervour
intervoice s customers come calling
boeing expects air force contract
stocks open near unchanged boeing up
stocks up , data shows inflation in check
jetstar boosts qantas turnover
md . school to hand out blackberry devices
kpmg settles lernout hauspie lawsuit
inmates in charge of travelzoo
wiseman dairies in talks to sell stake
yukos unit may be sold for bn
reasons to like four seasons
dot orders fedex to repay million
dollar is calm before payrolls storm
fed seen raising key rate . 
crude oil futures rise for third straight day
fcc expected to keep states off voip s back
ceo eisner to step down in sept wsj reuters 
snow tells europe to boost growth
euro s flight turns up pressure on ecb to act afp 
crude oil prices top us a barrel
opec warns oil supply cuts needed for 
renton online business sold for million
dollar holds fire ahead of fed
us internet ad revenues up . percent to reco

In [63]:
# Write the labelled, cleaned rows to disk; index=False leaves the row index out of the file.
final_news.to_csv(args.output_munged_csv, index=False)

In [66]:
# Read the saved file back as a sanity check. Use the configured output path
# instead of rebuilding it by hand, so the write and the read cannot drift apart.
df = pd.read_csv(args.output_munged_csv)
print(df.head())

   category                                title  split
0  Business  jobs , tax cuts key issues for bush  train
1  Business    jarden buying mr . coffee s maker  train
2  Business    retail sales show festive fervour  train
3  Business  intervoice s customers come calling  train
4  Business    boeing expects air force contract  train
