In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
from dateutil.parser import parse
import re

### Review Dataset

In [111]:
# Load every review CSV found directly under the dataset folder and stack
# them into a single frame.
path = "dataset"
# The combined CSV this notebook saves later sits in the same folder; leave it
# out so re-running this cell does not ingest already-processed rows.
reviews_output_name = "raw_data.csv"
data_folder = glob.glob(path)
dfs = []
for folder in data_folder:
    # glob returns matches in arbitrary order; sorting keeps row order stable.
    filenames = sorted(glob.glob(folder + "/*.csv"))
    for filename in filenames:
        # Compare on the bare file name, tolerating either path separator.
        if filename.replace("\\", "/").rsplit("/", 1)[-1] == reviews_output_name:
            continue
        dfs.append(pd.read_csv(filename))
# 'Unnamed: 0' is presumably an index column saved into the source CSVs;
# errors='ignore' lets the load succeed when the inputs do not carry it.
reviews = pd.concat(dfs, ignore_index=True).drop(columns='Unnamed: 0', errors='ignore')

In [113]:
#Join the items of a list into one space-separated string
def list_to_string(lst):
    """Return the strings in ``lst`` joined by single spaces."""
    return " ".join(lst)

#Extract the helpful-vote count from a review's "helpful" text
def extract_helpful_num(series):
    """Return the number of helpful votes stated in ``series`` as an ``int``.

    Args:
        series: Text of a single "helpful" cell (one value, not a pandas
            Series despite the name). Presumably shaped like
            "One person found this helpful" or "966 people found this
            helpful" -- confirm against the scraped data.

    Returns:
        ``1`` when the text contains ``"One"``; the leading number (with
        thousands separators removed) when it contains ``"people"``; ``0``
        otherwise, including for non-string input such as a missing value.

    Raises:
        ValueError: If the text contains ``"people"`` but its first token is
            not a number.
    """
    if not isinstance(series, str):
        # Missing cells (None/NaN) carry no vote text.
        return 0
    if "One" in series:
        return 1
    if "people" in series:
        # The count is the first whitespace-separated token, e.g. "1,234".
        leading_token = series.split()[0]
        return int(leading_token.replace(",", ""))
    return 0
    
#Preprocess data
def pre_process_data(df):
    """Clean the ``date``, ``ratings`` and ``helpful`` columns of ``df``.

    The three columns are overwritten on ``df`` itself, and the same frame is
    returned.

    * ``date``: commas are removed, the last three space-separated tokens are
      kept and re-joined with spaces, and the result is handed to ``parse``.
    * ``ratings``: the first three characters are converted to ``float``.
    * ``helpful``: commas are removed, ``extract_helpful_num`` is applied and
      the result is converted to ``int``.
    """
    def _last_three_tokens(raw_date):
        return raw_date.replace(",", "").split(" ")[-3:]

    def _leading_rating(raw_rating):
        return float(raw_rating[:3])

    def _helpful_votes(raw_helpful):
        without_commas = raw_helpful.replace(",", "")
        return int(extract_helpful_num(without_commas))

    date_tokens = df['date'].apply(_last_three_tokens)
    date_text = date_tokens.apply(list_to_string)
    df['date'] = date_text.apply(parse)

    df['ratings'] = df['ratings'].apply(_leading_rating)
    df['helpful'] = df['helpful'].apply(_helpful_votes)

    return df

In [114]:
# Clean the date, ratings and helpful columns, then display the result.
# NOTE(review): pre_process_data calls string methods on these columns, so this
# cell presumably cannot be re-run on the already-processed frame -- reload first.
reviews = pre_process_data(reviews)
reviews

Unnamed: 0,comments,date,ratings,helpful,cs_score,category,product
0,"I would rate the 65"" CX as a 5/5 TV, but the n...",2020-04-10,2.0,966,87,Electronics > TVs,LG OLED65CXPUA TV
1,"For $1800, you would expect a TV to have a gre...",2020-05-13,4.0,359,87,Electronics > TVs,LG OLED65CXPUA TV
2,Updated review (10/13/2020)\n\nMounting was a ...,2020-05-27,5.0,254,87,Electronics > TVs,LG OLED65CXPUA TV
3,"I just got this yesterday. So this review, at ...",2020-04-23,5.0,146,87,Electronics > TVs,LG OLED65CXPUA TV
4,"Yes, this is an amazing visual product and wel...",2020-08-11,1.0,131,87,Electronics > TVs,LG OLED65CXPUA TV
...,...,...,...,...,...,...,...
84969,"Great product, decent price and super easy to ...",2021-04-06,5.0,1,80,Blood Pressure Monitors: Arm Models (-0.17),Omron 3 Series BP7100
84970,Works great for my mom,2021-03-09,5.0,0,80,Blood Pressure Monitors: Arm Models (-0.17),Omron 3 Series BP7100
84971,This is the second unit I bought. The first wa...,2021-04-19,5.0,1,80,Blood Pressure Monitors: Arm Models (-0.17),Omron 3 Series BP7100
84972,Works great,2021-04-06,5.0,0,80,Blood Pressure Monitors: Arm Models (-0.17),Omron 3 Series BP7100


In [119]:
# Persist the cleaned reviews as a single UTF-8 CSV, without the index column.
# This file is written into the same "dataset" folder the review CSVs are read from.
reviews.to_csv("dataset/raw_data.csv", encoding='utf-8', index=False)

In [120]:
# Read the combined reviews file back from disk into `df`.
# NOTE(review): no parse_dates is passed, so 'date' will presumably load as
# plain strings rather than datetimes -- confirm that is intended.
df = pd.read_csv('dataset/raw_data.csv')

### Price Dataset

In [38]:
# Load every price CSV found directly under the price-dataset folder and
# stack them into a single frame.
path = "price-dataset"
# The combined CSV this notebook saves later sits in the same folder; leave it
# out so re-running this cell does not duplicate already-processed rows.
prices_output_name = "raw_price_data.csv"
data_folder = glob.glob(path)
dfs = []
for folder in data_folder:
    # glob returns matches in arbitrary order; sorting keeps row order stable.
    filenames = sorted(glob.glob(folder + "/*.csv"))
    for filename in filenames:
        # Compare on the bare file name, tolerating either path separator.
        if filename.replace("\\", "/").rsplit("/", 1)[-1] == prices_output_name:
            continue
        dfs.append(pd.read_csv(filename))
# 'Unnamed: 0' is presumably an index column saved into the source CSVs;
# errors='ignore' lets the load succeed when the inputs do not carry it.
prices = pd.concat(dfs, ignore_index=True).drop(columns='Unnamed: 0', errors='ignore')

In [44]:
def preprocess_price_data(price):
    """Collapse a tokenised price into a single number.

    Args:
        price: Sequence of string tokens. Either a single amount
            (``['19.99']``) or a range whose first and third tokens are the
            bounds and which contains a ``'-'`` token
            (``['10.00', '-', '20.00']``).

    Returns:
        The amount as a ``float`` for a single price, the midpoint of the two
        bounds for a range, or ``None`` when the tokens cannot be read as a
        price (empty sequence, missing bound, non-numeric text). A ``'nan'``
        token still yields ``float('nan')``.
    """
    try:
        if '-' in price:
            # Range: average the lower and upper bounds.
            return (float(price[0]) + float(price[2])) / 2
        return float(price[0])
    except (IndexError, ValueError, TypeError):
        # Nothing usable to parse; signal "no price" instead of raising so a
        # single malformed row does not abort a whole-column apply.
        return None

In [39]:
# Turn each raw price into a list of tokens: stringify the value, drop '$' and
# ',' characters, then split on whitespace.
prices['price'] = prices['price'].apply(
    lambda raw_price: str(raw_price).replace('$', '').replace(',', '').split()
)

In [46]:
# Collapse each token list in the price column to a single value.
prices['price'] = prices['price'].apply(preprocess_price_data)

In [48]:
# Persist the processed prices as a single UTF-8 CSV, without the index column.
# This file is written into the same "price-dataset" folder the price CSVs are read from.
prices.to_csv("price-dataset/raw_price_data.csv", encoding='utf-8', index=False)