# Data Preparation for Sentiment Classification of ChatGPT Tweets

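Build a class-balanced subset of the full dataset (10,000 tweets per sentiment class), split it into train/val/test, clean the text, and save the result to CSV.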

In [220]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [221]:
# Notebook-wide settings, read later as attributes (e.g., args.seed).
args = Namespace(**{
    # --- input files ---
    "raw_train_dataset_csv": "data/data.csv",
    "raw_test_dataset_csv": "data/yelp/raw_test.csv",      # NOTE(review): appears unused below - confirm before removing
    # --- subsetting and splitting ---
    "proportion_subset_of_train": 0.1,                      # NOTE(review): appears unused below - confirm before removing
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    # --- output file ---
    "output_munged_csv": "data/tweets_with_splits_lite.csv",
    # --- random seed used when shuffling ---
    "seed": 1337,
})

In [222]:
# Load the raw data from the CSV path configured in `args`.
train_reviews = pd.read_csv(filepath_or_buffer=args.raw_train_dataset_csv)

In [223]:
# Peek at the first five rows of the raw data.
train_reviews.head(n=5)

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [224]:
# Rows per label in the full data.
train_reviews["labels"].value_counts()

bad        107796
good        56011
neutral     55487
Name: labels, dtype: int64

In [225]:
# Keep only the text that precedes the first 'https:' in each tweet.
# `.str[0]` passes a missing tweet through as NaN, whereas indexing every element
# in a Python list comprehension raises a TypeError on NaN. `assign` returns a new
# frame, so no temporary 'tweet_list' column is written into the raw data.
train_reviews = train_reviews.assign(
    tweet=train_reviews['tweets'].str.split('https:', n=1).str[0]
)[['labels', 'tweet']]
# Binary task: discard the 'neutral' class.
train_reviews = train_reviews[train_reviews['labels'] != 'neutral']
print(train_reviews.labels.value_counts())

def conv_rating(row):
    """Return the binary target for one row.

    `row` is anything that can be indexed with the key 'labels'
    (e.g., a DataFrame row). The label 'bad' maps to 0; every other
    label maps to 1.
    """
    return 0 if row['labels'] == 'bad' else 1
# Add the 0/1 target with `assign`, which returns a new frame. Writing a column
# straight into a boolean-filtered frame can trigger pandas' SettingWithCopyWarning.
train_reviews = train_reviews.assign(rating=train_reviews.apply(conv_rating, axis=1))
train_reviews = train_reviews[['rating', 'tweet']]
# Keep only tweets whose remaining text is longer than 8 characters
# (rows with missing text fail the comparison and are dropped as well).
train_reviews = train_reviews[train_reviews['tweet'].str.len() > 8]
print(train_reviews.rating.value_counts())
train_reviews.head()

bad     107796
good     56011
Name: labels, dtype: int64
0    106602
1     55805
Name: rating, dtype: int64


Unnamed: 0,rating,tweet
1,1,"Try talking with ChatGPT, our new AI system wh..."
3,1,"THRILLED to share that ChatGPT, our new model ..."
4,0,"As of 2 minutes ago, @OpenAI released their ne..."
5,1,"Just launched ChatGPT, our new AI system which..."
6,0,"As of 2 minutes ago, @OpenAI released their ne..."


In [226]:
# making the subset equal across the review classes:
# first bucket every row (as a plain dict) under its rating.
# `to_dict('records')` yields the same per-row dicts as iterrows() + row.to_dict()
# without building a Series for every row.
by_rating = collections.defaultdict(list)
for record in train_reviews.to_dict('records'):
    by_rating[record['rating']].append(record)

# by_rating will be like below:
#  {
#   0 : [{'rating': 0, 'tweet': 'As ...'}, {'rating': 0, 'tweet': 'As ...'}, ...... ],
#   1 : [{'rating': 1, 'tweet': 'Try ...'}, {'rating': 1, 'tweet': 'THRILLED ...'}, ...... ]
#  }

In [227]:
# Take the first `n_subset` rows of every class, in their existing order, so each
# class contributes the same number of rows (provided it has at least that many).
# NOTE(review): this is a head-slice, not a random sample, and
# args.proportion_subset_of_train is not consulted here - confirm that is intended.
n_total = 10000
n_subset = n_total
review_subset = pd.DataFrame(
    [
        record
        for _, item_list in sorted(by_rating.items())
        for record in item_list[:n_subset]
    ]
)

In [228]:
# Peek at the first five rows of the balanced subset.
review_subset.head(n=5)

Unnamed: 0,rating,tweet
0,0,"As of 2 minutes ago, @OpenAI released their ne..."
1,0,"As of 2 minutes ago, @OpenAI released their ne..."
2,0,GOD DAMN IT @OpenAI STOP ANNOUNCING THINGS I A...
3,0,#ai Models are set to become the search engine...
4,0,Google is done.\n\nCompare the quality of thes...


In [229]:
# Rows per class in the subset data.
review_subset["rating"].value_counts()

1    10000
0    10000
Name: rating, dtype: int64

In [230]:
# Distinct class values present in the subset
set(review_subset["rating"])

{0, 1}

In [231]:
# Splitting the subset by rating to create our new train, val, and test splits.
# Rows are bucketed per class first so every class is cut with the same proportions.
by_rating = collections.defaultdict(list)
for record in review_subset.to_dict('records'):
    by_rating[record['rating']].append(record)

final_list = []
np.random.seed(args.seed)  # seed once so the shuffles below are repeatable

for rating, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)  # shuffles the list in place
    print(rating)

    n_total = len(item_list)
    # Sizes are derived from rounded *cumulative* cut points. Truncating each
    # share separately with int() can leave the last few rows of a class outside
    # every slice (no 'split' key -> NaN in the final frame) whenever the class
    # size does not divide evenly or a product lands just below a whole number.
    n_train = round(args.train_proportion * n_total)
    n_val = round((args.train_proportion + args.val_proportion) * n_total) - n_train
    n_test = min(
        round((args.train_proportion + args.val_proportion + args.test_proportion) * n_total),
        n_total,
    ) - n_train - n_val

    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list) # use extend when adding multiple elements to a list; use append for adding one element.

0
1


In [232]:
# Collect the split-tagged records into one DataFrame (it is written to file in the last cell)
final_reviews = pd.DataFrame(data=final_list)

In [233]:
# Peek at the first five rows, now carrying a 'split' column.
final_reviews.head(n=5)

Unnamed: 0,rating,tweet,split
0,0,"As far as you know, my tweets are from ChatGPT",train
1,0,Asked ChatGPT to write a conversation between ...,train
2,0,I had 15 minutes before daycare pickup tonight...,train
3,0,Legendary rapper ChatGPT opens a brokerage acc...,train
4,0,ChatGPT just sitting there seething while some...,train


In [234]:
# Rows per split.
final_reviews["split"].value_counts()

train    14000
test      3000
val       3000
Name: split, dtype: int64

In [235]:
# Preprocess the reviews
# Substrings treated as separators: each occurrence is swapped for a space
# (runs of spaces are collapsed by the last substitution in preprocess_text).
rep_list = ['\U0001fae1', '\\n', '@', '#', '\xa0', '***']
def preprocess_text(text):
    """Lower-case `text` and reduce it to letters plus spaced-out . , ! ? marks.

    Steps: lower-case; turn newlines into spaces; blank out every entry of
    `rep_list`; pad . , ! ? with spaces; collapse each run of any other
    non-letter characters into a single space.

    Returns the cleaned string (leading/trailing spaces are not stripped).
    """
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    for token in rep_list:
        # str.replace returns a new string, so the result must be re-assigned;
        # calling it without assignment leaves `text` untouched.
        # A space (not '') is substituted so that words separated only by one of
        # these tokens (e.g., a literal backslash-n) are not glued together.
        text = text.replace(token, ' ')
    text = re.sub(r"([.,!?])", r" \1 ", text)     # E.g., convert "end." to "end . "; \1 indicates a matched character.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)  # replace each run of other characters with a single space
    return text
    
# Run preprocess_text over every tweet and store the result back in the 'tweet' column.
final_reviews["tweet"] = final_reviews["tweet"].apply(preprocess_text)

In [236]:
# Swap the numeric targets for readable class names.
# NOTE: not idempotent - on a second run the values are already strings, so the
# lookup misses and every rating becomes None.
rating_names = {0: 'negative', 1: 'positive'}
final_reviews['rating'] = final_reviews['rating'].map(rating_names.get)

In [237]:
# Peek at the first five rows after text cleaning and label renaming.
final_reviews.head(n=5)

Unnamed: 0,rating,tweet,split
0,negative,"as far as you know , my tweets are from chatgpt",train
1,negative,asked chatgpt to write a conversation between ...,train
2,negative,"i had minutes before daycare pickup tonight , ...",train
3,negative,legendary rapper chatgpt opens a brokerage acc...,train
4,negative,chatgpt just sitting there seething while some...,train


In [238]:
# Count missing values per column as a sanity check before saving.
print(final_reviews.isna().sum(axis=0))

rating    0
tweet     0
split     0
dtype: int64


In [240]:
# Save the prepared data; index=False keeps the row index (row names) out of the file.
final_reviews.to_csv(path_or_buf=args.output_munged_csv, index=False)