<a href="https://colab.research.google.com/github/AdriaDelhom/CGM/blob/main/GI_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
# Colab-only setup: mount Google Drive and make the project's data folder the
# working directory, so the relative './...' paths used by the loaders resolve
# against it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/CGM/data')

Mounted at /content/drive


In [None]:
# Constants for interpreting the raw accelerometer (ACC) readings.
# presumably the accelerometer sampling rate in Hz (not used in the cells shown) - TODO confirm
ACC_HZ = 32
# Accelerometer range in g: raw readings are int8 values recorded in 2G mode.
ACC_G_MODE = 2
# Gravitational acceleration in m/s^2, used to convert readings from g to m/s^2.
SMALL_G = 9.81

In [None]:
class Patient:
  """One study participant with lazily loaded sensor, glucose and food-log data.

  Every data stream is exposed as a property. The matching CSV file is read
  (relative to the current working directory) on first access and the fully
  processed DataFrame is cached on the instance; a load that fails leaves the
  cache empty so that the next access retries.
  """

  # Maps the short metric key used in code to the file-name prefix on disk.
  metric_map = {
      'acc': 'ACC',
      'bvp': 'BVP',
      'glu': 'Dexcom',
      'eda': 'EDA',

      'food': 'Food_Log',
      'hr': 'HR',
      'ibi': 'IBI',
      'temp': 'TEMP'
  }

  # Column names assigned to patient 3's food log, whose file has no header row.
  _FOOD_COLUMNS_NO_HEADER = ['date', 'time', 'time_begin', 'logged_food', 'amount', 'unit',
                             'searched_food', 'calorie', 'total_carb', 'sugar', 'protein']

  # Patients whose food log names the clock-time column `time_of_day` instead of `time`.
  _TIME_OF_DAY_PATIENTS = (7, 13, 15, 16)

  def __init__(self, patient_id, gender, hba1c) -> None:
    """Store the demographics; no file is read until a data property is accessed."""
    self.patient_id: int = patient_id
    self.gender: str = gender
    self.hba1c: float = hba1c

    # Per-metric caches, filled on first access of the matching property.
    self._acc: pd.DataFrame = None
    self._bvp: pd.DataFrame = None
    self._glu: pd.DataFrame = None
    self._eda: pd.DataFrame = None
    self._food: pd.DataFrame = None
    self._hr: pd.DataFrame = None
    self._ibi: pd.DataFrame = None
    self._temp: pd.DataFrame = None


  def get_file_path(self, metric):
    """Return the relative CSV path for `metric` (a key of `metric_map`).

    The patient ID is zero-padded to three digits, e.g. './007/HR_007.csv'.
    Raises KeyError for an unknown metric key.
    """
    metric_name = Patient.metric_map[metric]
    return f'./{self.patient_id:03d}/{metric_name}_{self.patient_id:03d}.csv'

  def _read_sensor_csv(self, metric):
    """Read a sensor CSV indexed by its parsed `datetime` column, with stripped column names."""
    frame = pd.read_csv(self.get_file_path(metric), index_col=['datetime'],
                        parse_dates=['datetime'], engine='pyarrow')
    frame.columns = [c.strip() for c in frame.columns]
    return frame

  @property
  def acc(self):
    """Accelerometer readings indexed by datetime, converted to m/s^2."""
    if self._acc is None:
      acc = self._read_sensor_csv('acc')
      # Convert all columns to m/s^2 from int8 in 2G mode
      # TODO It is possible that values <0 should be divided by 128 and > 0 by 127
      for axis in ('acc_x', 'acc_y', 'acc_z'):
        acc[axis] = (acc[axis] * ACC_G_MODE * SMALL_G) / 127
      self._acc = acc
    return self._acc

  @property
  def bvp(self):
    """BVP sensor data indexed by datetime."""
    if self._bvp is None:
      self._bvp = self._read_sensor_csv('bvp')
    return self._bvp


  @property
  def glu(self):
    """Glucose readings: a single `glucose` column indexed by `datetime`."""
    if self._glu is None:
      timestamp_column = 'Timestamp (YYYY-MM-DDThh:mm:ss)'
      # Rows 1-12 after the header are skipped (presumably device metadata
      # rather than readings - TODO confirm against the raw export).
      glu = pd.read_csv(self.get_file_path('glu'), header=0,
                        skiprows=range(1, 13),
                        index_col=[timestamp_column],
                        parse_dates=[timestamp_column])
      glu = glu.rename(columns={'Glucose Value (mg/dL)': 'glucose'})
      glu = glu.rename_axis('datetime')
      self._glu = glu[['glucose']]
    return self._glu

  @property
  def eda(self):
    """EDA sensor data indexed by datetime."""
    if self._eda is None:
      self._eda = self._read_sensor_csv('eda')
    return self._eda

  @property
  def food(self):
    """Food log indexed by `time_begin`.

    `time_end` is combined with `date` into a full timestamp; entries without
    one inherit the earliest `time_end` logged for the same `time_begin`.
    Missing `searched_food` values become '' and the `date` and clock-time
    columns are dropped.
    """
    if self._food is None:
      path = self.get_file_path('food')
      if self.patient_id == 3:
        # Patient 3 has no header: header=None keeps the first record as data
        # instead of letting read_csv consume it as column names.
        food = pd.read_csv(path, header=None, names=Patient._FOOD_COLUMNS_NO_HEADER,
                           skipinitialspace=True)
        food['time_begin'] = pd.to_datetime(food['time_begin'])
        food['time_end'] = None
        food = food.set_index('time_begin')
      else:
        food = pd.read_csv(path, index_col=['time_begin'], parse_dates=['time_begin'],
                           skipinitialspace=True)

      # `time_end` is prefixed with `date` to build a full timestamp;
      # rows without an end time become NaT.
      food['time_end'] = pd.to_datetime(food['date'] + ' ' + food['time_end'])

      # Fill missing end times with the earliest end time logged for the same `time_begin`.
      end_times = food.groupby('time_begin')['time_end'].min()
      food = food.merge(end_times, how='left', on='time_begin', suffixes=('', '_x'),
                        validate='many_to_one')
      food['time_end'] = food['time_end'].fillna(food['time_end_x'])
      food = food.drop('time_end_x', axis=1)

      # Fill in NaN in searched food with empty string
      food['searched_food'] = food['searched_food'].fillna('')

      # Some food logs have `time_of_day` instead of `time`
      if self.patient_id in Patient._TIME_OF_DAY_PATIENTS:
        clock_column = 'time_of_day'
      else:
        clock_column = 'time'
      self._food = food.drop(['date', clock_column], axis=1)
    return self._food

  @property
  def hr(self):
    """Heart-rate data indexed by datetime."""
    if self._hr is None:
      # Patient 1 has no seconds recorded
      if self.patient_id == 1:
        hr = pd.read_csv(self.get_file_path('hr'),
                         parse_dates=['datetime'],
                         date_format='%m/%d/%y %H:%M',
                         engine='pyarrow')
        # Keep the timestamp only on the first row of each run of identical
        # timestamps, then linearly interpolate the blanked ones in between.
        hr.loc[hr.groupby('datetime').cumcount() != 0, 'datetime'] = pd.NaT
        hr = hr.interpolate(method='linear')
        hr = hr.set_index('datetime')
        hr.columns = [c.strip() for c in hr.columns]
      else:
        hr = self._read_sensor_csv('hr')
      self._hr = hr

    return self._hr

  @property
  def ibi(self):
    """IBI sensor data indexed by datetime."""
    if self._ibi is None:
      self._ibi = self._read_sensor_csv('ibi')
    return self._ibi

  @property
  def temp(self):
    """TEMP sensor data indexed by datetime."""
    if self._temp is None:
      self._temp = self._read_sensor_csv('temp')
    return self._temp

class CGMData:
  """Entry point to the dataset: demographics plus lazily created patients.

  Index an instance with a patient ID (1-16) to get the matching `Patient`;
  patients are created on first access and cached per instance.
  """

  # Inclusive range of valid patient IDs.
  MIN_PATIENT_ID = 1
  MAX_PATIENT_ID = 16

  # Cache of Patient objects keyed by patient ID. It is created in __init__:
  # a dict literal here would be shared by every CGMData instance.
  patients: dict
  demographics: pd.DataFrame = None

  def __init__(self) -> None:
    """Read './Demographics.csv' (indexed by its 'ID' column) and start with an empty cache."""
    self.patients = {}
    self.demographics = pd.read_csv('./Demographics.csv', index_col='ID')

  def __getitem__(self, key):
    """Return the `Patient` with ID `key`, creating and caching it on first access.

    Raises IndexError when `key` is outside the valid patient ID range.
    """
    if not (self.MIN_PATIENT_ID <= key <= self.MAX_PATIENT_ID):
      raise IndexError(f'Patient ID {key} out of range')
    if key not in self.patients:
      patient_demo = self.demographics.loc[key]
      self.patients[key] = Patient(key, patient_demo['Gender'], patient_demo['HbA1c'])
    return self.patients[key]

In [None]:
# Smoke test: load patient 1 and print the demographics followed by the first
# five glucose readings and food-log entries.
data = CGMData()
print(data[1].gender, data[1].hba1c, sep='\n')
print(data[1].glu.head())
print(data[1].food.head(5))

FEMALE
5.5
                     glucose
datetime                    
2020-02-13 17:23:32     61.0
2020-02-13 17:28:32     59.0
2020-02-13 17:33:32     58.0
2020-02-13 17:38:32     59.0
2020-02-13 17:43:31     63.0
                    time_end                    logged_food  amount  \
time_begin                                                            
2020-02-13 18:00:00      NaT                 Berry Smoothie   20.00   
2020-02-13 20:30:00      NaT                    Chicken Leg    1.00   
2020-02-13 20:30:00      NaT                      Asparagus    4.00   
2020-02-14 07:10:00      NaT  Natrel Lactose Free 2 Percent    8.00   
2020-02-14 07:10:00      NaT             Standard Breakfast    0.75   

                            unit  \
time_begin                         
2020-02-13 18:00:00  fluid ounce   
2020-02-13 20:30:00          NaN   
2020-02-13 20:30:00          NaN   
2020-02-14 07:10:00  fluid ounce   
2020-02-14 07:10:00          cup   

                                   

In [None]:
# Distinct database names matched to patient 2's logged foods, in order of first appearance.
data[2].food['searched_food'].unique()

array(['Mello Yello',
       '(Jimmy Dean) Sandwiches, Biscuit Southern Style Chicken',
       'Large Beef Jerky', 'Gatorade Fierce Grape',
       'Banquet Chicken Pot Pie',
       'Red Baron, Brick Oven Crust Pizza, Pepperoni',
       "M&M's Milk Chocolate Candies (formerly M&M's Plain Chocolate Candies)",
       '(Natrel) Lactose Free 2% Partly Skimmed Milk',
       "(Kellogg's) Frosted Flakes, Cereal", 'Powerade Sports Drink',
       "Stouffer's, Salisbury Steak", 'Mashed Potato', 'Chocolate Milk',
       'Frozen novelties, ice type, pop', 'Bacon And Cheese Omelette',
       "(Arby's) Classic Roast Beef", 'Tortilla Chips', 'Salsa',
       'Mountain Dew', 'Cheeseburger With Chili', 'Tater Tots',
       'Fast foods, onion rings, breaded and fried', 'Sweet Tea',
       'NABISCO, NEWTONS, FRUIT CHEWY COOKIES, FIG', 'Water',
       'Gatorade Fruit Punch', 'Mandarin Oranges', 'Vienna Sausage',
       '(Lance) Toast Chee Crackers, Cheddar Cheese', 'Tootsie Roll',
       'Steak', 'Fried Pot

# Web Scraping

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# GI search results page for the query "frosted".
URL = "https://glycemicindex.com/gi-search/?food_name=frosted"
# Without a timeout requests waits indefinitely on an unresponsive server.
page = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page in the cells below.
page.raise_for_status()

In [None]:
# Parse the downloaded HTML with the standard-library parser.
soup = BeautifulSoup(page.text, 'html.parser')
# prettify is a method: it has to be called to get the indented markup
# (printing the attribute only shows the bound-method repr).
print(soup.prettify())

In [None]:
# First table row of the page: the header row listing the column names.
soup.find_all('tr', limit=1)[0]

<tr class="row-1 odd">
<th class="column-1">Food Name</th><th class="column-2">GI</th><th class="column-3">Food Manufacturer</th><th class="column-4">Product Category</th><th class="column-5">Country of food production</th><th class="column-6">Serving Size (g)</th><th class="column-7">Average carbohydrate portion (g)</th><th class="column-8">GL (based on average carbohydrate potion)</th><th class="column-9">Reference:</th><th class="column-10">Subjects type</th><th class="column-11">Time</th><th class="column-12">Subjects Number</th><th class="column-13">Year of test</th>
</tr>

In [None]:
# Spot-check one row: food name (cell 0), serving size (cell 5) and carbohydrate portion (cell 6).
row_number = 55
row_cells = soup.find_all('tr')[row_number].find_all('td')
[row_cells[position].string for position in (0, 5, 6)]

['Pancakes, prepared from wheat flour', None, '30']

In [None]:
# Build the GI table from the scraped page.
# The row list is deliberately NOT called `data`: that name is bound to the
# CGMData instance created earlier, which the GI assembly cells further down
# still index by patient ID.

# Output column name -> position of the matching <td> cell within a table row.
GI_CELL_POSITIONS = {'Food Name': 0, 'GI': 1, 'Serving Size': 5, 'Carb Serving': 6, 'GL': 7}

scraped_rows = []
# The first <tr> is the header row (it holds <th> cells only), so it is skipped.
for row in soup.find_all('tr')[1:]:
  cells = row.find_all('td')
  # Skip rows that do not have every expected cell instead of raising IndexError.
  if len(cells) <= max(GI_CELL_POSITIONS.values()):
    continue
  scraped_rows.append([cells[position].string for position in GI_CELL_POSITIONS.values()])

scraped_data = pd.DataFrame(scraped_rows, columns=list(GI_CELL_POSITIONS))

In [None]:
# Preview the first five scraped rows.
scraped_data.head(n=5)

Unnamed: 0,Food Name,GI,Serving Size,Carb Serving,GL
0,"Cake, NS, decreased GI variant, sugar-to-flour...",20,,30,6
1,"Carrot cake, prepared with wheat flour and coc...",37,,30,11
2,Chocolate cake made from packet mix with choco...,38,,30,11
3,Chocolate mudcake,43,,30,13
4,Christmas fruit cake,53,,30,16


In [None]:
# Scalar lookup (second row, second column): the scraped GI values are strings, not numbers.
scraped_data.iat[1, 1]

'37'

In [None]:
# Persist the scraped table as comma-separated text; missing values are written as 'NaN'.
scraped_data.to_csv('GI_data.csv', na_rep='NaN')

Matching GI values to the logged foods was done locally, in a separate notebook (`LLM_GI.ipynb`).

# Assembling GI Dataframe

In [None]:
import pickle

# Load the two food -> GI lookup dictionaries from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.

# Foods with a matched database entry (each value exposes 'Food Name' and 'GI').
with open('GI_known_dict.pkl', 'rb') as f:
    GI_known = pickle.load(f)

# Foods without a match, mapped to a coarse category that is a key of GI_values.
with open('GI_guessed_dict.pkl', 'rb') as f:
    GI_guessed = pickle.load(f)

In [None]:
# Numeric GI assigned to each coarse category found in GI_guessed, used when a
# food has no entry in GI_known.
# NOTE(review): how these three numbers were chosen is not shown here - TODO document.
GI_values = {'High': 85, 'Med': 62, 'Low': 28}

In [None]:
# Spot-check one matched food: the entry is the database row chosen for it.
GI_known['Baked Beans']

Food Name       Baked Beans in Cheesy Tomato sauce
GI                                              44
Serving Size                                   NaN
Carb Serving                                  15.0
GL                                             7.0
Name: 1277, dtype: object

In [None]:
# One GI table per patient (list position 0 holds patient 1), with one row per food-log entry.
GI_dataframe = []
for patient_id in range(1, 17):
  patient_data = []
  for food_name in data[patient_id].food['searched_food']:
    if food_name in GI_known:
      # Exact database match: keep the matched food's name and its GI.
      known_entry = GI_known[food_name]
      gi_entry = [known_entry['Food Name'], known_entry['GI']]
    elif food_name in GI_guessed:
      # Only a coarse category is available: translate it to its numeric GI.
      gi_level = GI_guessed[food_name]
      gi_entry = [gi_level, GI_values[gi_level]]
    else:
      #for the very last entries, 'Boost' (which seems to be a high-sugar energy drink)
      gi_entry = ['High', GI_values['High']]
    patient_data.append(gi_entry)
  GI_dataframe.append(pd.DataFrame(patient_data, columns = ['GI_Food', 'GI']))

In [None]:
# Give each GI table the time_begin index of the matching patient's food log.
for patient_number in range(1, 17):
  GI_dataframe[patient_number - 1].index = data[patient_number].food.index

In [None]:
# Write one CSV per patient, named GI_1.csv through GI_16.csv.
for patient_number in range(1, 17):
  GI_dataframe[patient_number - 1].to_csv(f'GI_{patient_number}.csv', sep=',', na_rep='NaN')