In [None]:
import re
import os
import json
import datetime as dt
import pandas as pd

In [None]:
def _parse_day_month_year(day, month, year):
  """Build a ``datetime`` from day / month-name / year text fragments.

  The full month name (``%B``) is tried first, then the abbreviated
  name (``%b``), so both "january" and "jan" are accepted.

  Raises:
    ValueError: if the fragments do not form a valid date.
  """
  text = f"{day.strip()}-{month.strip()}-{year.strip()}"
  try:
    return dt.datetime.strptime(text, '%d-%B-%Y')
  except ValueError:
    return dt.datetime.strptime(text, '%d-%b-%Y')


def _shift_year(date, delta):
  """Return ``date`` moved by ``delta`` years, or ``date`` unchanged if that day does not exist (Feb 29)."""
  try:
    return date.replace(year=date.year + delta)
  except ValueError:
    return date


def process_date(start_date, end_date):
  """Parse the starting and ending date strings of a group buy.

  Handled cases:
    Case1: the string has month, day and year.
    Case2: the string has only month and day -> the year is borrowed from
           the other date. If that borrowed year puts the end before the
           start (a range crossing New Year, e.g. "december 20" to
           "january 10, 2021"), the date with the borrowed year is moved
           by one year so the range is ordered.
    Case3: the end is the literal string 'sold out'.

  Args:
    start_date: text such as "january 5" or "january 5th, 2021".
    end_date: text in the same format, or 'sold out'.

  Returns:
    (out_start_date, out_end_date). out_start_date is a datetime.
    out_end_date is a datetime, or the string 'sold out' when end_date is
    'sold out' or has no usable day/month.

  Raises:
    ValueError: if start_date has no day number, if no year is available
      from either date, or if a month name cannot be parsed.
  """

  # Raw string: the backslashes must reach the regex engine untouched.
  ymd_pattern = r'(\D*)\s*(\d{1,2})\s*(?:rd|th|st|nd)*\,*\s*(\d{4})*'

  start_match = re.search(ymd_pattern, start_date)
  if start_match is None:
    raise ValueError(f"Cannot find a day number in start date: {start_date!r}")
  start_m, start_d, start_y = start_match.groups()

  sold_out = end_date == 'sold out'
  end_m = end_d = end_y = None
  if not sold_out:
    end_match = re.search(ymd_pattern, end_date)
    if end_match is not None:
      end_m, end_d, end_y = end_match.groups()

  # Borrow a missing year from the other date, remembering which side was
  # inferred so a New Year crossing can be corrected below.
  start_year_inferred = False
  end_year_inferred = False
  if start_y is None and end_y is not None:
    start_y = end_y
    start_year_inferred = True
  elif end_y is None and start_y is not None:
    end_y = start_y
    end_year_inferred = True

  if start_y is None:
    raise ValueError(
      f"No year found in start date {start_date!r} or end date {end_date!r}")

  out_start_date = _parse_day_month_year(start_d, start_m, start_y)

  # An end date without a day or a month is reported as 'sold out'.
  if sold_out or not end_d or not end_m:
    return out_start_date, 'sold out'

  out_end_date = _parse_day_month_year(end_d, end_m, end_y)

  # A borrowed year that puts the end before the start means the range
  # crosses New Year; move the inferred side by one year.
  if out_end_date < out_start_date:
    if start_year_inferred:
      out_start_date = _shift_year(out_start_date, -1)
    elif end_year_inferred:
      out_end_date = _shift_year(out_end_date, 1)

  return out_start_date, out_end_date

In [None]:
def process_price(price, mode='mid'):
  """Convert a price entry to a float, collapsing a range such as '30-55'.

  Args:
    price: a number, a numeric string ('45'), or a 'low-high' range ('30-55').
    mode: how to collapse a range; ignored for a single price.
      'mid': the midpoint of the range
      'min': the lower bound
      'max': the upper bound

  Returns:
    The price as a float.

  Raises:
    ValueError: if price is neither a number nor a 'low-high' range, or if
      mode is not one of 'mid', 'min', 'max' for a range.
  """

  # Plain prices need no range handling.
  try:
    return float(price)
  except (TypeError, ValueError):
    pass

  # Split on the last dash so the separator itself is not parsed as a minus
  # sign on the upper bound.
  low_text, dash, high_text = str(price).rpartition('-')
  if not dash:
    raise ValueError(f"Cannot parse price: {price!r}")

  try:
    low_price = float(low_text)
    high_price = float(high_text)
  except ValueError:
    raise ValueError(f"Cannot parse price range: {price!r}") from None

  if mode == 'mid':
    return (low_price + high_price) * 0.5
  if mode == 'min':
    return low_price
  if mode == 'max':
    return high_price
  raise ValueError(f"Unknown mode {mode!r}; expected 'mid', 'min' or 'max'")

In [None]:
# Load every *.json file in the 'data' directory into scraped_data, one parsed
# JSON document per file.
scraped_data = []
# Sorted so the load order (and the resulting row order) is the same on every
# run; os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir('data')):
  if file.endswith('.json'):
    # The context manager closes the file handle once it is parsed.
    with open(os.path.join('data', file), 'r') as json_file:
      scrp = json.load(json_file)
    scraped_data.append(scrp)

### Construct the DataFrame

In [None]:
# Regexes applied to each post's title and sticky comment. Raw strings keep the
# backslashes literal for the regex engine instead of relying on Python passing
# unknown string escapes through.
price_pattern = r'(\w*):\s*\*\*\s*\$(\d*\-*\d*)\*\*'        # "<type>: **$<price or low-high>**"
title_pattern = r'\[(?:GB|Pre-order)\]\s*(.*)\s*//'         # not lower-cased, to preserve titles that use uppercase
date_pattern = '//(.*)-(.*)'                                # "// <start> - <end>"
vendor_pattern = r'(\w*:\s*)\[(\w*)\]'                      # "<country>: [<vendor>]"
ship_date_pattern = r'(?:.*?):\s*\*\*(.*?)\*\*---'          # first "<label>: **<value>**---"

# Parallel column lists; one entry per output row.
type_list = []
price_list = []
title_list = []
start_list = []
end_list = []
country_list = []
vendor_list = []
ship_date_list = []

for scrp in scraped_data:
  # Newlines are removed so the patterns can match across line breaks.
  raw_title = scrp['title'].replace('\n', '')
  comment = scrp['sticky_comment'].replace('\n', '')

  price_tup = re.findall(price_pattern, comment)
  vendor_tup = re.findall(vendor_pattern, comment)

  # Title: text between the [GB]/[Pre-order] tag and the '//' date separator.
  # Falls back to the whole title minus the '[GB]' tag. Both paths are
  # stripped so the same title always compares equal.
  title_match = re.search(title_pattern, raw_title)
  if title_match is not None:
    title = title_match.group(1).strip()
  else:
    title = scrp['title'].replace('[GB]', '').strip()

  # Shipping date: first match in the lower-cased comment, or None.
  ship_match = re.search(ship_date_pattern, comment.lower())
  ship_date = ship_match.group(1) if ship_match is not None else None

  # Start/end dates: both stay None when the title has no '// a - b' part or
  # when process_date cannot parse it.
  start_date = None
  end_date = None
  date_match = re.search(date_pattern, raw_title)
  if date_match is not None:
    raw_start = date_match.group(1).strip().lower()
    raw_end = date_match.group(2).strip().lower()
    try:
      start_date, end_date = process_date(raw_start, raw_end)
    except Exception:
      # Best effort: an unparsable date must not abort the whole run.
      start_date, end_date = None, None

  # One row per (price entry, vendor entry) pair; a post with no price match
  # or no vendor match contributes no rows.
  for p_tup in price_tup:
    for v_tup in vendor_tup:

      type_list.append(p_tup[0])
      price_list.append(p_tup[1])

      # NOTE(review): v_tup[0] still contains the ':' and any whitespace
      # captured by vendor_pattern - confirm whether it should be cleaned.
      country_list.append(v_tup[0])
      vendor_list.append(v_tup[1])

      # These 4 need to be duplicated across price/country
      title_list.append(title)
      start_list.append(start_date)
      end_list.append(end_date)
      ship_date_list.append(ship_date)

df = pd.DataFrame({'title': title_list, 'type': type_list, 'price': price_list, 'start_date': start_list, 'end_date':end_list,
                   'country': country_list, 'vendor': vendor_list, 'ship_date': ship_date_list
                  })
# Collapse price ranges such as '30-55' to a single number.
df['price'] = df['price'].apply(lambda x: process_price(x, mode='mid'))
df

### Try to Extract the type (Keyboard vs Keycap)

In [None]:
# Need a heuristic to separate the product types (unless we use some model to classify pictures)
# Keycap titles may contain GMK, ePBT, MW, Domikey, DCS, SA
# Keycap types may include Alphas, Numpad, Spacebars, keycap

# Let the rest fall through to keyboard; the hard part would be differentiating keyboards from switches
# Switches might be identified by looking for linear/tactile in the description or something similar

#df[df['price']<15]
#df[(df['type'].str.lower().str.contains('switch')) | (df['title'].str.lower().str.contains('switch'))]

### Distribution of price

### Distribution of Keyboard Type 

### Distribution of the Studio 

### Visualization of the number of active group buys in a year (maybe a stacked chart of available group buys / price needed to buy everything, etc.)

### Time from Group Buy to Release (as per the initial announcement data)