In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm import tqdm_notebook

In [3]:
base_path = "/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/"
args = Namespace(
    raw_dataset_txt=base_path + "/books/frankenstein.txt",
    window_size=5,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv=base_path + "/books/frankenstein_with_splits.csv",
    seed=1337
)

In [4]:
# 만약 코랩에서 실행하는 경우 아래 코드를 실행하여 전처리된 라이트 버전의 데이터를 다운로드하세요.
!mkdir data
!wget https://git.io/JtX5A -O /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/download.py
!wget https://git.io/JtX5F -O /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/get-all-data.sh
!chmod 755 /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/get-all-data.sh


--2024-03-15 07:58:29--  https://git.io/JtX5A
Resolving git.io (git.io)... 140.82.113.22
Connecting to git.io (git.io)|140.82.113.22|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_5/5_2_CBOW/data/download.py [following]
--2024-03-15 07:58:29--  https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_5/5_2_CBOW/data/download.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1572 (1.5K) [text/plain]
Saving to: ‘/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data/download.py’


2024-03-15 07:58:29 (6.55 MB/s) - ‘/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmb

In [5]:
%cd /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data
!./get-all-data.sh
%cd ..

/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data
/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding


In [6]:
!pip install nltk
import nltk
nltk.download('punkt')
# Split the raw text book into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
print (len(sentences), "sentences")
print ("Sample:", sentences[100])

3427 sentences
Sample: No incidents have hitherto befallen us that would make a figure in a
letter.


In [8]:
# Clean sentences
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [9]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

In [10]:
for i, tokens in enumerate(cleaned_sentences):
  if i <= 5:
    print(tokens)
  else:
    break

frankenstein , or the modern prometheus by mary wollstonecraft godwin shelley letter st . petersburgh , dec . th , to mrs . saville , england you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings . 
i arrived here yesterday , and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking . 
i am already far north of london , and as i walk in the streets of petersburgh , i feel a cold northern breeze play upon my cheeks , which braces my nerves and fills me with delight . 
do you understand this feeling ? 
this breeze , which has travelled from the regions towards which i am advancing , gives me a foretaste of those icy climes . 
inspirited by this wind of promise , my daydreams become more fervent and vivid . 


In [11]:
# Global vars
MASK_TOKEN = "<MASK>"

In [12]:
x = [list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm_notebook(cleaned_sentences)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(cleaned_sentences)]


  0%|          | 0/3427 [00:00<?, ?it/s]

In [13]:
print(x[0])

[('<MASK>', '<MASK>', '<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus'), ('<MASK>', '<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus', 'by'), ('<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus', 'by', 'mary'), ('<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus', 'by', 'mary', 'wollstonecraft'), ('<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus', 'by', 'mary', 'wollstonecraft', 'godwin'), ('frankenstein', ',', 'or', 'the', 'modern', 'prometheus', 'by', 'mary', 'wollstonecraft', 'godwin', 'shelley'), (',', 'or', 'the', 'modern', 'prometheus', 'by', 'mary', 'wollstonecraft', 'godwin', 'shelley', 'letter'), ('or', 'the', 'modern', 'prometheus', 'by', 'mary', 'wollstonecraft', 'godwin', 'shelley', 'letter', 'st'), ('the', 'modern', 'prometheus', 'by', 'mary', 'wollstonecraft', 'godwin', 'shelley', 'letter', 'st', '.'), ('modern', 'promethe

In [14]:
# Create windows
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm_notebook(cleaned_sentences)])

print(windows[0])
print(windows[1])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(cleaned_sentences)])


  0%|          | 0/3427 [00:00<?, ?it/s]

('<MASK>', '<MASK>', '<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus')
('<MASK>', '<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the', 'modern', 'prometheus', 'by')


In [15]:
# Create cbow data
data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])


# Convert to dataframe
cbow_data = pd.DataFrame(data, columns=["context", "target"])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for window in tqdm_notebook(windows):


  0%|          | 0/90698 [00:00<?, ?it/s]

In [17]:
"""data형태
[
  ["문맥1", "target 문자열1"],
  ["문맥2", "target 문자열2"],
  ["문맥3", "target 문자열3"],
  ["문맥4", "target 문자열4"],
  ...
  ["문맥_N", "target 문자열_N"]
]
"""

[[', or the modern prometheus', 'frankenstein'], ['frankenstein or the modern prometheus by', ','], ['frankenstein , the modern prometheus by mary', 'or'], ['frankenstein , or modern prometheus by mary wollstonecraft', 'the'], ['frankenstein , or the prometheus by mary wollstonecraft godwin', 'modern']]


In [None]:
# Create split data
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [None]:
cbow_data.head()

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train


In [None]:
cbow_data.tail()

Unnamed: 0,context,target,split
90693,our email newsletter to hear new ebooks .,about,test
90694,email newsletter to hear about ebooks .,new,test
90695,newsletter to hear about new .,ebooks,test
90696,to hear about new ebooks,.,test
90697,hear about new ebooks .,,test


In [None]:
# Write split data to file
cbow_data.to_csv(args.output_munged_csv, index=False)

In [None]:
print(args.output_munged_csv)

/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter5_Words_And_TypeEmbedding/data//books/frankenstein_with_splits.csv
