# Standard Imports

In [1032]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

# Constants

In [1033]:
seed = 42

# Import data

In [1034]:
# Chemical and Instrumental Assessment of Green Tea Sensory Preference - Y.R. Liang, Q. Ye, J. Jin, H. Liang, J.L. Lu, Y.Y. Du & J.J. Dong
# https://doi.org/10.1080/10942910701299430
# Notes: downloaded CSVs

df_10942910701299430_chemical_composition = pd.read_csv('../data/10942910701299430/chemical_composition.csv', index_col=0, header=1)
df_10942910701299430_sensory_evaluation = pd.read_csv('../data/10942910701299430/sensory_evaluation.csv')

# Phytochemical Composition and Antioxidant Capacity of 30 Chinese Teas - by Guo-Yi Tang, Cai-Ning Zhao, Xiao-Yu Xu, Ren-You Gan, Shi-Yu Cao, Qing Liu, AoShang, Qian-Qian Mao & Hua-Bin Li
# https://doi.org/10.3390/antiox8060180
# Notes: Manually scraped data

df_antiox8060180_chemical_composition = pd.read_csv('../data/antiox8060180/chemical_composition.csv')

# Catechin and caffeine content of tea (Camellia sinensis L.) leaf significantly differ with seasonal variation... - Himangshu Deka, Tupu Barman, Jintu Dutta, Arundhuti Devi, Pradip Tamuly, Ranjit Kumar Paul & Tanmoy Karak 
# https://doi.org/10.1016/j.jfca.2020.103684
# https://krishi.icar.gov.in/jspui/bitstream/123456789/68751/2/S0889157520313892-main.pdf - Free access

# df_j.jfca.2020.103684_chemical_composition = pd.read_csv('dava/j.jfca.2020.103684/chemical_composition.csv')

# Comparative analysis of tea catechins and theaflavins by high-performance liquid chromatography and capillary electrophoresis - Bee-Lan Lee & Choon-Nam Ong
# https://doi.org/10.1016/S0021-9673(00)00215-6
# Notes: Manually scraped data

df_S0021967300002156 = pd.read_csv('../data/S0021967300002156/chemical_composition.csv')

# Survey of Catechins, Gallic Acid, and Methylxanthines in Green, Oolong, Pu-erh, and Black Teas - Jen-Kun Lin, Chih-Li Lin, Yu-Chih Liang, Shoei-Yn Lin-Shiau & I-Ming Juan
# https://doi.org/10.1021/jf980223x
# Notes: Manually scraped data

df_jf980223x = pd.read_csv('../data/jf980223x/chemical_composition.csv')


In [1035]:
# List of all imported dataframes
all_dataframes = [df_10942910701299430_chemical_composition,
                  df_10942910701299430_sensory_evaluation,
                  df_antiox8060180_chemical_composition,
                  df_S0021967300002156,
                  df_jf980223x]

# Combine data

To combine the data I am going to create a data pipeline that does the following:

1. Add all the pandas dataframes to a list.
2. Read that list into a function.
3. The function reads an individual dataframe from that list then resets the index, rename catechins to be like the catechin dictionary below, drop unused columns, add columns that are part of the standardized column format, and then sorts the columns to be part of the standardized column format.
4. The function will then add the dataframe to a list denoting it is processed.
5. Once the function has processed all the dataframes in the lists it will loop through the data frames in the finished list and concatenate them.
6. The function will return a combined dataframe.

## Data Frame Format

The data frames will be combined into the following format:

- `'Catechin'`
- `'Epicatechin'`
- `'Gallocatechin'`
- `'Epigallocatechin'`
- `'Catechin Gallate'`
- `'Epicatechin Gallate'`
- `'Gallocatechin Gallate'`
- `'Epigallocatechin Gallate'`
- `'Gallic Acid'`
- `'Chlorogenic Acid'`
- `'Caffeine'`
- `'Taste'`
- `'Appearance'`
- `'Aroma'`
- `'Liqour color'`


In [1036]:
catechin_dictionary = {
    'C': 'Catechin',
    'EC': 'Epicatechin',
    'GC': 'Gallocatechin',
    'EGC': 'Epigallocatechin',
    'CG': 'Catechin Gallate',
    'ECG': 'Epicatechin Gallate',
    'GCG': 'Gallocatechin Gallate',
    'EGCG': 'Epigallocatechin Gallate'
}

standardized_columns = [
    'Catechin',
    'Epicatechin',
    'Gallocatechin',
    'Epigallocatechin',
    'Catechin Gallate',
    'Epicatechin Gallate',
    'Gallocatechin Gallate',
    'Epigallocatechin Gallate',
    'Gallic Acid',
    'Chlorogenic Acid',
    'Caffeine',
    'Taste',
    'Appearance',
    'Aroma',
    'Liqour color'
]

In [1037]:
df_jf980223x['EC'].dtype == 'float64'

True

To faciliate scability I am create a sklearn pipeline and process all my data to be concatenated.
First I will define the functions that will go into my data preprocessing pipeline.

In [1038]:
def reset_index(dataframe: pd.DataFrame):
    return dataframe.reset_index(drop=1)

def rename_columns(dataframe: pd.DataFrame):
    return dataframe.rename(columns=catechin_dictionary)

def drop_nonstandardized_columns(dataframe: pd.DataFrame):
    return dataframe.drop(columns=[column for column in dataframe if column not in standardized_columns])

def reformat_columns(dataframe: pd.DataFrame):
    return dataframe.reindex(columns=standardized_columns)

def remove_standard_deviation_format(dataframe: pd.DataFrame):
    for column in range(len(dataframe)):
        if (dataframe.iloc[column].dtype == 'obect') or (dataframe.iloc[column].dtype == 'string'):
            dataframe.iloc[column] = dataframe.iloc[column].str.replace('± \d*.\d*', '', regex=True)
    return dataframe    

Next, the data pipeline applies all of those functions to the inputed dataset.

In [1039]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

ResetIndexTransformer = FunctionTransformer(reset_index)
RenameColumnsTransformer = FunctionTransformer(rename_columns)
DropNonstandardizedColumnsTransformer = FunctionTransformer(drop_nonstandardized_columns)
ReformatColumnsTransformer = FunctionTransformer(reformat_columns)
RemoveStandardDeviationFormat = FunctionTransformer(remove_standard_deviation_format)

data_combining_pipeline = make_pipeline(
    ResetIndexTransformer,
    RenameColumnsTransformer,
    DropNonstandardizedColumnsTransformer,
    ReformatColumnsTransformer,
    RemoveStandardDeviationFormat
)


Now I am creating a pipeline that transforms each dataframe then concatenates it and returns one dataframe to be used for the model.

In [1040]:
def process_dataframes(dataframes: list):
    processed_dataframes = []
    for dataframe in dataframes:
        dataframe = data_combining_pipeline.fit_transform(dataframe)
        processed_dataframes.append(dataframe)
    return processed_dataframes

def concatenate_dataframes(dataframes: list):
    formated_dataframe = pd.concat(dataframes, ignore_index=True)
    return formated_dataframe


In [1041]:
ProcessDataframes = FunctionTransformer(process_dataframes)
ConcatenateDataframes = FunctionTransformer(concatenate_dataframes)

data_combining_pipeline_processor = make_pipeline(
    ProcessDataframes,
    ConcatenateDataframes
)

In [1055]:
df = data_combining_pipeline_processor.transform(all_dataframes)
df

Unnamed: 0,Catechin,Epicatechin,Gallocatechin,Epigallocatechin,Catechin Gallate,Epicatechin Gallate,Gallocatechin Gallate,Epigallocatechin Gallate,Gallic Acid,Chlorogenic Acid,Caffeine,Taste,Appearance,Aroma,Liqour color
0,8.93 ± 0.09,7.04 ± 07,8.36 ± 0.08,18.00 ± 0.16,3.72 ± 0.04,4.66 ± 0.07,21.50 ± 0.38,21.11 ± 0.28,,,31.47 ± 0.22,,,,
1,9.87 ± 008,.4.99 ± 0.04,22.55 ± 0.21,10.92 ± 0.08,5.33 ± 0.04,5.29 ± 0.09,26.62 ± 0.34,23.42 ± 0.18,,,39.24 ± 0.18,,,,
2,6.37 ± 0.07,4.92 ± 0.06,21.87 ± 0.17,10.98 ± 0.08,5.29 ± 0.07,5.43 ± 0.07,24.84 ± 0.42,23.63 ± 0.21,,,41.08 ± 0.32,,,,
3,6.78 ± 0.03,5.91 ± 0.07,23.08 ± 0.18,11.47 ± 0.12,5.76 ± 0.06,6.43 ± 0.06,25.34 ± 0.27,22.14 ± 0.20,,,50.97 ± 0.29,,,,
4,5.82 ± 0.07,5.60 ± 0.07,24.12 ± 0.22,13.35 ± 0.11,5.28 ± 0.06,5.94 ± 0.04,27.92 ± 0.32,25.09 ± 0.22,,,53.70 ± 0.45,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.02,1.06,,1.17,,1.84,0.26,13.77,,,6.72,,,,
77,0.02,1.15,,1.19,,2.59,0.56,18.81,,,8.62,,,,
78,0.02,0.92,,0.95,,2.04,0.33,14.88,,,6.9,,,,
79,0.03,1.05,,0.93,,1.95,0.24,14.69,,,7.8,,,,


# Visualize data

# Data Wrangling

# Training Device

In [1043]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


# Generative Adversarial Network

In [1044]:
class Generator(nn.Module):
    def __init__(self, ngpu):
            super(Generator, self).__init__()
            self.ngpu = ngpu

class Discriminator(nn.Module):    
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu

# Split data

In [1045]:
from torch.utils.data import DataLoader, random_split

train_size = int(0.8 * len(df))
test_size = len(df) - train_size
train_df, test_df = random_split(df, [train_size, test_size])

train_loader = DataLoader(train_df, batch_size=64, shuffle=True)
test_loader = DataLoader(test_df, batch_size=64, shuffle=True)

# Feature Engineering

# DBSCAN

In [1046]:
# Model Creation

In [1047]:
# Hyperparameter Selection

In [1048]:
# Model Training

# Multilayer Perceptron

In [1049]:
# Model Creation

In [1050]:
# Hyperparameter Selection

In [1051]:
# Model Training

# Convolutiontional Neural Network

In [1052]:
# Model Creation

In [1053]:
# Hyperparameter Selection


In [1054]:
# Model Training

# Model Visualization and Comparison

# Export model