In [17]:
import torch
from torch import nn
import os
import glob as glob
from torch.utils.data import DataLoader, Dataset, Subset
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import time
from tqdm import tqdm
from multiprocessing import Process
import csv

In [4]:
class B330_Dataset(Dataset):
    def __init__(self, file_dir, transform=None, target_transform=None):
        self.file_dir = file_dir
        self.data_frame = pd.read_csv(self.file_dir)
        self.features = []
        self.df_labels = []
        for i in range(len(self.data_frame['soln'])):
            self.df_labels.append([self.data_frame['soln'].iloc[i][2:-1],self.data_frame['state'].iloc[i][2:-1],self.data_frame['binary'].iloc[i][2:-2]])
    
        self.label_use = {}
        self.label_data = {}
        self.one_hot_labels = {}
        self.labels = []
        self.test_flag = False
        for i in range(len(self.df_labels)):
            if self.df_labels[i][0] not in self.labels:
                self.labels.append(self.df_labels[i][0])
                    
        for i in range(len(self.df_labels)):
            if str(self.df_labels[i]) not in self.one_hot_labels:
                z = np.zeros(len(self.labels))
                for j in range(len(self.df_labels[i])):
                    idx = self.labels.index(self.df_labels[i][j])
                    z[idx] = 1
                self.one_hot_labels[str(self.df_labels[i])] = z
                
                
        for label in self.labels:
            if label in []:
                pass
            else:
                self.label_use[label] = 0
                self.label_data[label] = []
        
        for i in range(len(self.data_frame)):
            self.label_data[self.df_labels[i][0]].append(i)
            
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.data_frame)
    
    def label_reset(self):
        lu = 0
        for label in self.label_use:
            lu+= self.label_use[label]
        if lu == len(self.label_use):
            for label in self.label_use:
                self.label_use[label] = 0
            
    def __getitem__(self, idx):
        self.label_reset()
        if self.test_flag == True:
            data = self.data_frame[self.features].iloc[idx]
            one_hot_label = self.one_hot_labels[str(self.df_labels[idx])]
        else:
            for label in self.label_use:
                if self.label_use[label] == 0:
                    self.label_use[label] = 1
                    break
            idx = random.choice(self.label_data[label])
            data = self.data_frame[self.features].iloc[idx]
            one_hot_label = self.one_hot_labels[str(self.df_labels[idx])]
        
        if self.transform:
            data = self.transform(data)
        if self.target_transform:
            one_hot_label = self.target_transform(one_hot_label)

        return np.array(data), one_hot_label

In [5]:
dataset = B330_Dataset('Data/data_file.txt')

In [6]:
idx = np.random.randint(0, dataset.__len__())

dataset.__getitem__(idx)

(array([ 0.91449511,  0.90797562,  0.03400922, -0.02438878, -0.02438878,
         0.08550489,  0.09202438,  0.03400922,  0.02438878,  0.02438878,
         0.6231454 ,  0.61420835,  0.19442608, -0.08477744, -0.08477744,
         0.3768546 ,  0.38579165,  0.19442608,  0.08477744,  0.08477744,
         0.0934996 ,  0.0858046 ,  0.0627877 , -0.007413  , -0.007413  ,
         0.604762  ,  0.531762  ,  0.27588846,  0.076719  ,  0.076719  ,
         0.137215  ,  0.127044  ,  0.0537894 ,  0.006404  ,  0.006404  ]),
 array([1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]))

In [7]:
dataset.__len__()

86368

In [8]:
trainset = Subset(dataset, range(0,int(len(dataset)*0.7)))
testset = Subset(dataset, range(int(len(dataset)*0.7), len(dataset)))

In [9]:
trainset.__len__()

60457

In [10]:
train_loader = DataLoader(trainset, batch_size=128, shuffle=True)
test_loader = DataLoader(testset, batch_size=128, shuffle=False)

In [11]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(35, 1024),
            nn.ReLU(),
            nn.Linear(1024, 4096),
            nn.ReLU(),
            nn.Linear(4096, 1024),
            nn.ReLU(),
            nn.Linear(1024, 21), #41
            #nn.Softmax(dim=1))
            nn.Sigmoid())
        
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [12]:
model = NeuralNetwork()
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=35, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=4096, bias=True)
    (3): ReLU()
    (4): Linear(in_features=4096, out_features=1024, bias=True)
    (5): ReLU()
    (6): Linear(in_features=1024, out_features=21, bias=True)
    (7): Sigmoid()
  )
)


In [13]:
dataset.labels

['Stable',
 'Transitory',
 'Baseline',
 'Solution',
 'Background',
 'wet paprika',
 'dry turmeric',
 'Riboflavin',
 'Wet_BG',
 'YG',
 'wet B12',
 'Baby Powder',
 'dry niacin',
 'Red Star Yeast',
 'wet red star yeast',
 'Confidence Check',
 'Hot Shot Confidence Check',
 'Baking Powder',
 'Distilled Water',
 'wet niacin',
 'wet tofu miso soup']

In [16]:
dataset.one_hot_labels

{"['Background', 'Stable', 'Baseline']": array([1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['wet paprika', 'Stable', 'Solution']": array([1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['dry turmeric', 'Stable', 'Solution']": array([1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['Riboflavin', 'Stable', 'Solution']": array([1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['Wet_BG', 'Stable', 'Solution']": array([1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['YG', 'Stable', 'Solution']": array([1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['wet B12', 'Stable', 'Solution']": array([1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 "['Baby Powder', 

In [23]:
coder_path = './coder.txt'
with open(coder_path,'w') as f:
    for label in dataset.labels:
        f.write(label+'\n')
    

In [14]:
weight = []
for i in range(21):
    if i <2:
        weight.append(0.1)
    elif i<5 and i>=2:
        weight.append(0.5)
    else:
        weight.append(1)
weight = torch.from_numpy(np.array(weight))
loss_fn = nn.BCELoss(weight = weight) #nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=1e-3)

In [15]:
old_loss = 1000
path = './model_MkVI.pth'
for epoch in range(1):
    train_loader = DataLoader(trainset, batch_size=128, shuffle=True)
    with tqdm(train_loader, unit='batch') as tepoch:
        start = time.time()
        #train_loader = DataLoader(trainset, batch_size=128, shuffle=True)
        #for i, data in enumerate(train_loader, 0):
        correct = 0
        total = 0
        for data, target in tepoch:
            score = 0
            total+=1
            tepoch.set_description(f'Epoch {epoch+1}')
            #inputs, labels = data
            #print(labels.shape)
            opt.zero_grad()

            outputs = model(data.float())
            #print(outputs.shape, labels[0])
            loss = loss_fn(outputs, target.float())
            loss.backward()
            opt.step()
            
#             _, predicted = torch.max(outputs.data, 1)
#             _, actual = torch.max(target, 1)
#             total+=target.size(0)
#             correct+=(predicted==actual).sum().item()
            
            predicted = []
            actual = []
            for i in range(len(outputs[0])):
                if outputs[0][i] >= 0.5:
                    predicted.append(dataset.labels[i])
                if target[0][i] >= 0.5:
                    actual.append(dataset.labels[i])
            for i in range(3):
                try:
                    if predicted[i] == actual[i]:
                        score+=1
                except IndexError:
                    score+=0
            score/=3
            correct+=score
            
            tepoch.set_postfix(loss=round(loss.item(),4), accuracy=100*correct//total)
            
        if loss.item() < old_loss:
            torch.save(model.state_dict(), path)
            old_loss = loss.item()
        
    print(f'Accuracy of Model on training set: {100*correct//total}%')

print('Finished Training')

Epoch 1:   3%|▎         | 14/473 [00:06<03:31,  2.17batch/s, accuracy=64, loss=0.181]


KeyboardInterrupt: 

In [350]:
# Create Confusion Matrix
model = NeuralNetwork()
model.load_state_dict(torch.load(path))
model.eval()

soln_cm = {}

with torch.no_grad():
        with tqdm(total=dataset.__len__()) as pbar:
            for idx in range(dataset.__len__()):
                pbar.set_description(f'Progress')
                pbar.update(1)

                dataset.test_flag = True
                data, labels = dataset.__getitem__(idx)
#                 print(labels)
#                 print(dataset.df_labels[idx])
#                 print(idx)

                outputs = model(torch.from_numpy(np.array([data])).float())

                lo = list(outputs[0])
                ll = labels.tolist()
                pred_st = dataset.labels[lo.index(max(lo[0:2]))]
                act_st = dataset.labels[ll.index(max(ll[0:2]))]
                ll[0] = 0
                ll[1] = 0
                
                try:
                    soln_cm[act_st][pred_st] +=1
                except KeyError:
                    if pred_st not in soln_cm[act_st]:
                        soln_cm[act_st][pred_st] = 0
                        
                pred_bs = dataset.labels[lo.index(max(lo[2:4]))]
                act_bs = dataset.labels[ll.index(max(ll[2:4]))]
                ll[2] = 0
                ll[3] = 0
                
                try:
                    soln_cm[act_bs][pred_bs] += 1
                except KeyError:
                    if pred_bs not in soln_cm[act_bs]:
                        soln_cm[act_bs][pred_bs] = 0

                pred_soln = dataset.labels[lo.index(max(lo[4:]))]
                act_soln = dataset.labels[ll.index(max(ll[4:]))]   
#                     print(ll)
#                     print(lo)
#                     print(labels)
#                     print(j)
#                     print(pred_soln, act_soln)
                if act_soln not in soln_cm:
                    soln_cm[act_soln] = {}
                if pred_soln not in soln_cm[act_soln]:
                    soln_cm[act_soln][pred_soln] = 0

                soln_cm[act_soln][pred_soln] +=1

print(soln_cm[soln])

Progress: 100%|██████████| 191338/191338 [1:00:43<00:00, 52.52it/s]

{'Background': 3, 'wet paprika': 483, 'wet red star yeast': 1}





In [351]:
soln_cm

{'Stable': {'Stable': 191338, 'Transitory': 0},
 'Transitory': {'Stable': 0, 'Transitory': 0},
 'Baseline': {'Baseline': 52501, 'Solution': 82825, 'Stable': 14},
 'Solution': {'Baseline': 183, 'Solution': 55346, 'Stable': 467},
 'Background': {'wet fleischmann jar yeast': 869,
  'wet tofu miso soup': 1338,
  'wet baby powder': 7974,
  'wet baking soda': 2733,
  'Background': 74996,
  'wet great value yeast': 3116,
  'wet pineapple gelatin': 1086,
  'wet flour': 1834,
  'wet red star yeast': 1155,
  'wet baking powder': 723,
  'wet niacin': 1630,
  'Cal. Soln': 734,
  'Riboflavin': 3472,
  'Distilled Water': 2013,
  'YG': 1295,
  'Hot Shot Confidence Check': 187,
  'Wet_BG': 3615,
  'dry turmeric': 95,
  'Baking Powder': 88,
  'dry red star yeast': 253,
  'dry pineapple gelatin': 974,
  'dry niacin': 781,
  'Baby Powder': 433,
  'dry milk powder': 258,
  'dry paprika': 408,
  'dry tofu miso soup': 338,
  'dry fleischmann jar yeast': 136,
  'wet milk powder': 411,
  'EH': 10371,
  'Conf.

In [352]:
soln_cm_pcts = soln_cm
for key in soln_cm_pcts.keys():
    total = 0
    for k in soln_cm_pcts[key].keys():
        total += soln_cm_pcts[key][k]
    for k in soln_cm_pcts[key].keys():
        try:
            soln_cm_pcts[key][k]=round(soln_cm_pcts[key][k]*100/total,2)
        except ZeroDivisionError:
            pass

In [353]:
soln_cm_pcts

{'Stable': {'Stable': 100.0, 'Transitory': 0.0},
 'Transitory': {'Stable': 0, 'Transitory': 0},
 'Baseline': {'Baseline': 38.79, 'Solution': 61.2, 'Stable': 0.01},
 'Solution': {'Baseline': 0.33, 'Solution': 98.84, 'Stable': 0.83},
 'Background': {'wet fleischmann jar yeast': 0.64,
  'wet tofu miso soup': 0.99,
  'wet baby powder': 5.89,
  'wet baking soda': 2.02,
  'Background': 55.41,
  'wet great value yeast': 2.3,
  'wet pineapple gelatin': 0.8,
  'wet flour': 1.36,
  'wet red star yeast': 0.85,
  'wet baking powder': 0.53,
  'wet niacin': 1.2,
  'Cal. Soln': 0.54,
  'Riboflavin': 2.57,
  'Distilled Water': 1.49,
  'YG': 0.96,
  'Hot Shot Confidence Check': 0.14,
  'Wet_BG': 2.67,
  'dry turmeric': 0.07,
  'Baking Powder': 0.07,
  'dry red star yeast': 0.19,
  'dry pineapple gelatin': 0.72,
  'dry niacin': 0.58,
  'Baby Powder': 0.32,
  'dry milk powder': 0.19,
  'dry paprika': 0.3,
  'dry tofu miso soup': 0.25,
  'dry fleischmann jar yeast': 0.1,
  'wet milk powder': 0.3,
  'EH': 

In [255]:
# to eval one at a time
score = 0
for i in range(10):
    test_loader = DataLoader(testset, batch_size=1, shuffle=True)
    dataiter = iter(test_loader)
    data, labels = dataiter.next()

    model = NeuralNetwork()
    model.load_state_dict(torch.load(path))
    model.eval()


    with torch.no_grad():

        outputs = model(data.float())
        
        lo = list(outputs[0])
        predicted = [dataset.labels[lo.index(max(lo[0:2]))],dataset.labels[lo.index(max(lo[2:4]))],dataset.labels[lo.index(max(lo[4:]))]]
        actual = []
        for i in range(len(labels[0])):
            if labels[0][i] == 1:
                actual.append(dataset.labels[i])
                
        for i in range(3):
            try:
                if predicted[i] == actual[i]:
                    score+=1
            except IndexError:
                score+=0
        #print(score)
        print('Predicted: ', *predicted)
        print('Acutal:    ', *actual,'\n')
        #print(score)
    time.sleep(1)

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 

Predicted:  Stable Solution Wet_BG
Acutal:     Stable Solution wet niacin 

Predicted:  Stable Baseline Background
Acutal:     Stable Baseline Background 



In [194]:
# to eval entire dataset
test_loader = DataLoader(testset, batch_size=64, shuffle=True)
dataiter = iter(test_loader)
model = NeuralNetwork()
model.load_state_dict(torch.load(path))
model.eval()
tc = 0
t = 0
with torch.no_grad():
    for j in range(len(dataiter)):

        data, labels = dataiter.next()

        outputs = model(data.float())
        # print(outputs)
        # print(labels)

        _, predicted = torch.max(outputs.data, 1)
        _, actual = torch.max(labels, 1)
    #     print('Predicted', predicted)
    #     print('Acutal', actual)
        correct = (predicted==actual).sum().item()
        tc+=correct
        t+=len(actual)
    #print(correct*100//labels.size(0))
print(f'Accuracy: {tc*100//t} %')

KeyboardInterrupt: 

In [19]:
# eval each soln
model = NeuralNetwork()
model.load_state_dict(torch.load(path))
model.eval()

soln_idxs = dataset.label_data
for soln in dataset.labels:
    if soln == 'Background':
        pass
    else:
        preds = {}
        tc = 0
        t = 0
        with torch.no_grad():
            with tqdm(total=len(soln_idxs[soln]))
            print(soln)
            for j in soln_idxs[soln]:
                dataset.test_flag = True
                data, labels = dataset.__getitem__(j)

                outputs = model(torch.from_numpy(np.array([data])).float())
                # print(outputs)
                # print(labels)
                
                
                _, predicted = torch.max(outputs.data, 1)
                _, actual = torch.max(torch.from_numpy(np.array([labels])), 1)
            #     print('Predicted', predicted)
            #     print('Acutal', actual)
                
                correct = (predicted==actual).sum().item()
                tc+=correct
                t+=len(actual)
            #print(correct*100//labels.size(0))
        print(f'{soln} Accuracy: {tc*100//t} %')    

wet paprika
wet paprika Accuracy: 0 %
dry turmeric
dry turmeric Accuracy: 0 %
Riboflavin
Riboflavin Accuracy: 96 %
Wet_BG
Wet_BG Accuracy: 98 %
YG
YG Accuracy: 95 %
wet B12
wet B12 Accuracy: 0 %
Baby Powder
Baby Powder Accuracy: 100 %
dry niacin
dry niacin Accuracy: 55 %
Red Star Yeast
Red Star Yeast Accuracy: 93 %
wet red star yeast
wet red star yeast Accuracy: 95 %
Confidence Check
Confidence Check Accuracy: 99 %
Hot Shot Confidence Check
Hot Shot Confidence Check Accuracy: 98 %
Baking Powder
Baking Powder Accuracy: 98 %
Distilled Water
Distilled Water Accuracy: 96 %
wet niacin
wet niacin Accuracy: 88 %
wet tofu miso soup
wet tofu miso soup Accuracy: 94 %
