In [352]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tqdm.auto import tqdm
from timeit import default_timer as timer

In [121]:
def print_train_time(start, end):
    print(f"\ntrain time: {end - start}")

In [60]:
data = pd.read_csv("drug_consumption.csv")

In [61]:
df = data.copy()

In [62]:
df = df.drop(columns=['ID'])

In [63]:
df.columns

Index(['Age', 'Gender', 'Education', 'Country', 'Ethnicity', 'Nscore',
       'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS', 'Alcohol',
       'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke', 'Crack',
       'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms',
       'Nicotine', 'Semer', 'VSA'],
      dtype='object')

In [64]:
df.dtypes

Age          float64
Gender       float64
Education    float64
Country      float64
Ethnicity    float64
Nscore       float64
Escore       float64
Oscore       float64
Ascore       float64
Cscore       float64
Impulsive    float64
SS           float64
Alcohol       object
Amphet        object
Amyl          object
Benzos        object
Caff          object
Cannabis      object
Choc          object
Coke          object
Crack         object
Ecstasy       object
Heroin        object
Ketamine      object
Legalh        object
LSD           object
Meth          object
Mushrooms     object
Nicotine      object
Semer         object
VSA           object
dtype: object

In [65]:
features = df.select_dtypes(include=['number']).columns
labels = df.select_dtypes(include=['object']).columns

In [66]:
#{i:len(df[i].value_counts()) for i in num_cols}
cat_cols = [i for i in features if len(df[i].value_counts()) < 10]
cat_cols

['Age', 'Gender', 'Education', 'Country', 'Ethnicity']

### Age

In [67]:
start = 24
temp = [[18 if i == 0 else start + 10*(i-1) + 1, start + 10*i] for i in range(5)]
age_groups = [f"{i[0]} - {i[1]}" for i in temp]
age_groups.append("65+")
Age_map = {j:age_groups[i] for i, j in enumerate(sorted(df[cat_cols[0]].unique()))}
Age_map

{-0.95197: '18 - 24',
 -0.07854: '25 - 34',
 0.49788: '35 - 44',
 1.09449: '45 - 54',
 1.82213: '55 - 64',
 2.59171: '65+'}

### Gender

In [68]:
Gender_map = dict(zip(df[cat_cols[1]].unique(), ["Female", "Male"]))
Gender_map

{0.48246: 'Female', -0.48246: 'Male'}

### Education

In [69]:
education_groups = ["Left School Before 16 years",
"Left School at 16 years",
"Left School at 17 years",
"Left School at 18 years",
"Some College,No Certificate Or Degree",
"Professional Certificate/ Diploma",
"University Degree",
"Masters Degree",
"Doctorate Degree"]
Education_map = dict(zip(sorted(df[cat_cols[2]].unique()), education_groups))
Education_map

{-2.43591: 'Left School Before 16 years',
 -1.7379: 'Left School at 16 years',
 -1.43719: 'Left School at 17 years',
 -1.22751: 'Left School at 18 years',
 -0.61113: 'Some College,No Certificate Or Degree',
 -0.05921: 'Professional Certificate/ Diploma',
 0.45468: 'University Degree',
 1.16365: 'Masters Degree',
 1.98437: 'Doctorate Degree'}

### Country

In [70]:
Country_map = {-0.09765 : "Australia",
0.24923 : "Canada",
-0.46841 : "New Zealan",
-0.28519 : "Other",
0.21128 : "Republic of Ireland",
0.96082 : "UK",
-0.57009 : "USA"}
Country_map

{-0.09765: 'Australia',
 0.24923: 'Canada',
 -0.46841: 'New Zealan',
 -0.28519: 'Other',
 0.21128: 'Republic of Ireland',
 0.96082: 'UK',
 -0.57009: 'USA'}

### Ethnicity

In [71]:
Ethnicity_map  = {-0.50212 : "Asian",
-1.10702 : "Black",
1.90725 : "Mixed-Black/Asian",
0.12600 : "Mixed-White/Asian",
-0.22166 : "Mixed-White/Black",
0.11440 : "Other",
-0.31685 : "White"}
Ethnicity_map

{-0.50212: 'Asian',
 -1.10702: 'Black',
 1.90725: 'Mixed-Black/Asian',
 0.126: 'Mixed-White/Asian',
 -0.22166: 'Mixed-White/Black',
 0.1144: 'Other',
 -0.31685: 'White'}

In [72]:
Label_map = {"CL0" : "Never Used",
"CL1" : "Used over a Decade Ago",
"CL2" : "Used in Last Decade",
"CL3" : "Used in Last Year",
"CL4" : "Used in Last Month",
"CL5" : "Used in Last Week",
"CL6" : "Used in Last Day"}
Label_map

{'CL0': 'Never Used',
 'CL1': 'Used over a Decade Ago',
 'CL2': 'Used in Last Decade',
 'CL3': 'Used in Last Year',
 'CL4': 'Used in Last Month',
 'CL5': 'Used in Last Week',
 'CL6': 'Used in Last Day'}

In [73]:
maps = [Age_map, Gender_map, Education_map, Country_map, Ethnicity_map]

for i, j in enumerate(maps):
    df[cat_cols[i]] = df[cat_cols[i]].map(j)

In [74]:
df = df.drop("Semer", axis=1)

In [80]:
np.unique(df["Age"])

array(['18 - 24', '25 - 34', '35 - 44', '45 - 54', '55+'], dtype=object)

In [79]:
df["Age"] = df["Age"].replace({'65+' : '55 - 64'})
df["Age"] = df["Age"].replace({'55 - 64' : '55+'})

In [81]:
df.dtypes

Age           object
Gender        object
Education     object
Country       object
Ethnicity     object
Nscore       float64
Escore       float64
Oscore       float64
Ascore       float64
Cscore       float64
Impulsive    float64
SS           float64
Alcohol       object
Amphet        object
Amyl          object
Benzos        object
Caff          object
Cannabis      object
Choc          object
Coke          object
Crack         object
Ecstasy       object
Heroin        object
Ketamine      object
Legalh        object
LSD           object
Meth          object
Mushrooms     object
Nicotine      object
VSA           object
dtype: object

In [82]:
## df.to_csv("drug.csv")

In [83]:
score_cols = df.iloc[:, 5:(5+7)].columns
demo_cols = df.iloc[:, :5].columns
drug_cols = df.iloc[:, 12:].columns

## multi_label_demo

In [90]:
X = df[score_cols]
y = df[drug_cols].replace(dict(zip(np.unique(y), np.array([0, 0, 0, 1, 1, 1, 1]))))

  y = df[drug_cols].replace(dict(zip(np.unique(y), np.array([0, 0, 0, 1, 1, 1, 1]))))


In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
X_train.shape, X_test.shape

((1508, 7), (377, 7))

In [375]:
class MLP(nn.Module):
    def __init__(self, input_size, output_size, hidden_units):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.output = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.layer2(x)
        x = torch.relu(x)
        x = self.output(x)
        x = torch.sigmoid(x)
        return x
    

In [376]:
X_train

Unnamed: 0,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS
1814,-0.05188,-1.23177,0.29338,1.11406,0.25953,0.88113,0.76540
710,0.52135,0.32197,0.72330,0.13136,1.13407,-0.21712,-0.52593
931,-0.24649,0.00332,1.06238,0.59042,2.04506,-0.71126,-0.52593
617,-0.05188,1.11406,0.88309,0.13136,-0.00665,0.52975,0.07987
1797,2.28554,-1.23177,-0.01928,-2.53830,-2.90161,-0.21712,1.22470
...,...,...,...,...,...,...,...
1130,0.62967,-0.69509,0.14143,-1.21213,-0.27607,-0.21712,-0.21575
1294,-0.34799,0.00332,0.58331,-0.15487,-0.52745,0.19268,1.22470
860,0.04257,-0.15487,-0.17779,-0.60633,-0.40581,1.86203,1.22470
1459,0.13606,0.16767,-0.45174,0.43852,0.12331,-0.71126,0.40148


In [377]:
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

In [378]:
X_train_tensor.dtype, y_train_tensor.dtype

(torch.float32, torch.float32)

In [379]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [380]:
next(iter(train_dataset))[0].dtype

torch.float32

In [381]:
next(iter(train_dataloader))[0].dtype, next(iter(train_dataloader))[1].dtype

(torch.float32, torch.float32)

In [382]:
input_shape = X_train.shape[1]
output_shape = y_train.shape[1]

In [389]:
model = MLP(input_shape, output_shape, 30)

In [390]:
model

MLP(
  (layer1): Linear(in_features=7, out_features=30, bias=True)
  (layer2): Linear(in_features=30, out_features=30, bias=True)
  (output): Linear(in_features=30, out_features=18, bias=True)
)

In [391]:
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

In [392]:
def accuracy(y_true, y_pred):
    y_pred = y_pred.round()
    correct = (y_pred == y_true).float()  # Get a tensor of 1s and 0s
    acc = correct.sum() / correct.numel()  # Mean of correct predictions
    return acc

In [393]:
torch.manual_seed(42)
start = timer()

epochs = 100

for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch} \n ---------")

    train_loss, train_acc = 0, 0

    for batch, (X, y) in enumerate(train_dataloader):
        y_pred = model(X)

        loss = loss_fn(y_pred, y)

        train_acc += accuracy(y, y_pred)

        train_loss += loss

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

        if batch % 10 == 0:
            print(f"looked at {batch * len(X)} / {len(train_dataloader.dataset)} samples")

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)

    test_loss, test_acc = 0, 0

    model.eval()

    with torch.inference_mode():
        for X, y, in test_dataloader:

            test_pred = model(X)

            test_acc += accuracy(y, test_pred)
            test_loss += loss_fn(test_pred, y)

        test_loss /= len(test_dataloader)
        test_acc /= len(test_dataloader)

    print(f"\ntrain loss: {train_loss} | train acc: {train_acc} | val loss: {test_loss} | val acc: {test_acc}\n")

            
end = timer()
print_train_time(start, end)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0 
 ---------
looked at 0 / 1508 samples
looked at 320 / 1508 samples
looked at 640 / 1508 samples
looked at 960 / 1508 samples
looked at 1280 / 1508 samples

train loss: 0.6272293925285339 | train acc: 0.7055122256278992 | val loss: 0.5290291905403137 | val acc: 0.8164758086204529

Epoch: 1 
 ---------
looked at 0 / 1508 samples
looked at 320 / 1508 samples
looked at 640 / 1508 samples
looked at 960 / 1508 samples
looked at 1280 / 1508 samples

train loss: 0.45716598629951477 | train acc: 0.8192636370658875 | val loss: 0.4159373939037323 | val acc: 0.8303298950195312

Epoch: 2 
 ---------
looked at 0 / 1508 samples
looked at 320 / 1508 samples
looked at 640 / 1508 samples
looked at 960 / 1508 samples
looked at 1280 / 1508 samples

train loss: 0.3993009328842163 | train acc: 0.8334056735038757 | val loss: 0.3951549828052521 | val acc: 0.8285995125770569

Epoch: 3 
 ---------
looked at 0 / 1508 samples
looked at 320 / 1508 samples
looked at 640 / 1508 samples
looked at 960 / 1508