In [1]:
import pandas as pd
import numpy as np 
import re 
import collections
from argparse import Namespace  

In [2]:
# load data
args = Namespace(
    raw_train_dataset_csv="../NLP(old)/data/raw_train.csv",
    raw_test_dataset_csv="../NLP(old)/data/raw_test.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv="../NLP(old)/data/reviews_with_splits_full.csv",
    seed=1337
)

# # load data
# args = Namespace(
#     raw_train_dataset_csv="./data/raw_train.csv",
#     raw_test_dataset_csv="./data/raw_test.csv",
#     train_proportion=0.7,
#     val_proportion=0.3,
#     output_munged_csv="./data/reviews_with_splits_full.csv",
#     seed=1337
# )


train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])
train_reviews = train_reviews[~pd.isnull(train_reviews.review)]
test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=['rating', 'review'])
test_reviews = test_reviews[~pd.isnull(test_reviews.review)]

In [4]:
train_reviews.head()
# 1: negative, 2: positive

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [8]:
train_reviews.review.values[0]

"Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."

In [9]:
train_reviews.review.values[1]

"Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life."

In [5]:
test_reviews.head()

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [6]:
# 별점 순으로 dictionary화
from tqdm import tqdm
by_rating = collections.defaultdict(list)
for _, row in tqdm(train_reviews.iterrows(), total = len(train_reviews)) :
    by_rating[row.rating].append(row.to_dict())

100%|██████████| 560000/560000 [00:57<00:00, 9689.51it/s] 


In [13]:
# 분할 데이터를 만듭니다.
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    
    # 데이터 포인터에 분할 속성을 추가합니다
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    # 최종 리스트에 추가합니다
    final_list.extend(item_list)


# test 데이터셋도 최종 리스트에 추가
for _, row in test_reviews.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

In [16]:
# train + val + test까지 포함된 데이터셋
len(final_list)

598000

In [19]:
# dataframe으로 만들기 
final_reviews = pd.DataFrame(final_list)
final_reviews.split.value_counts()

split
train    392000
val      168000
test      38000
Name: count, dtype: int64

In [21]:
# 
final_reviews[pd.isnull(final_reviews.review)]

Unnamed: 0,rating,review,split


In [24]:
text = final_reviews.review.values[0]
print(text)

The entrance was the #1 impressive thing about this place, as it is completely a surprise and almost shocks you.  I won't give it up here but it's worth at least getting a drink to experience that part.\n\nGreeter was great.\n\nLounge singer was very 70s and campy, but liked him none the less.\n\n\nFood is pretty below average in pretty much every way possible.\n\nBread they bring out is pretty bad and dries out within minutes of being at the table to the point it's inedible.\n\nSalads were small, boring and WAY drowned in dressing.\n\nCalamari is $15 or so, and is half the size of what they'd give you at Capitol Grille, and is pretty bland and mushy.\n\nthe pasta tasted as though it were cooked in rancid water.  the chef was nice enough to come out and bring my wife a personalized dish that he eats, and it tasted the same.  we smiled and pretended to enjoy it just to keep from looking like big complainers.  We literally don't complain usually, but her first dish was inedible.\n\nMy la

In [25]:
re.sub(r"([.,!?])", r" \1 ", text)

"The entrance was the #1 impressive thing about this place ,  as it is completely a surprise and almost shocks you .   I won't give it up here but it's worth at least getting a drink to experience that part . \\n\\nGreeter was great . \\n\\nLounge singer was very 70s and campy ,  but liked him none the less . \\n\\n\\nFood is pretty below average in pretty much every way possible . \\n\\nBread they bring out is pretty bad and dries out within minutes of being at the table to the point it's inedible . \\n\\nSalads were small ,  boring and WAY drowned in dressing . \\n\\nCalamari is $15 or so ,  and is half the size of what they'd give you at Capitol Grille ,  and is pretty bland and mushy . \\n\\nthe pasta tasted as though it were cooked in rancid water .   the chef was nice enough to come out and bring my wife a personalized dish that he eats ,  and it tasted the same .   we smiled and pretended to enjoy it just to keep from looking like big complainers .   We literally don't complain 

In [27]:
# 텍스트(review) 전처리
def preprocessing_text(text) :
    if type(text) == float :
        print(text)  # NaN 확인
    text = text.lower() 
    text = re.sub(r"([.,!?])", r" \1 ", text)  # \1 : 그룹 내 동일한 단어
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)  # 
    return text


final_reviews.review = final_reviews.review.apply(preprocessing_text)


In [31]:
final_reviews

Unnamed: 0,rating,review,split
0,1,the entrance was the impressive thing about th...,train
1,1,"i m a mclover , and i had no problem nwith the...",train
2,1,"less than good here , not terrible , but i see...",train
3,1,i don t know if i can ever bring myself to go ...,train
4,1,food was ok good but the service was terrible ...,train
...,...,...,...
597995,1,after spending bucks per person for express la...,test
597996,2,stellar ! one of my favorite places to eat in ...,test
597997,1,we stopped by here for a dessert after fuddruc...,test
597998,1,wait staff was attentive but the food was very...,test


In [35]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)

In [36]:
final_reviews.to_csv(args.output_munged_csv, index=False)

In [37]:
args.output_munged_csv

'../NLP(old)/data/reviews_with_splits_full.csv'