In [139]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [140]:
personality_smoke_data = pd.read_csv('personality_traits.csv')
display(personality_smoke_data.head())

Unnamed: 0,extroversion,critical,dependable,anxious,open,reserved,sympathetic,disorganized,calm,conventional,control
0,Agree strongly,Disagree moderately,Agree strongly,Agree moderately,Agree strongly,Agree a little,Agree strongly,Disagree a little,Agree a little,Disagree strongly,No
1,Agree moderately,Agree moderately,Agree moderately,Agree a little,Agree strongly,Disagree strongly,Agree strongly,Neither agree nor disagree,Agree strongly,Neither agree nor disagree,No
2,Disagree moderately,Neither agree nor disagree,Agree a little,Agree moderately,Disagree a little,Agree moderately,Neither agree nor disagree,Agree a little,Neither agree nor disagree,Agree moderately,No
3,Agree strongly,Disagree moderately,Agree strongly,Disagree strongly,Agree strongly,Disagree moderately,Agree moderately,Disagree strongly,Neither agree nor disagree,Disagree strongly,No
4,Agree a little,Agree a little,Agree strongly,Disagree moderately,Agree moderately,Agree strongly,Neither agree nor disagree,Disagree moderately,Agree strongly,Disagree strongly,No


# Objective

We will try to predict, based on personality traits alone using deep neural networks, if a certain individual is in control of his or her smoking habits (based on the given data, all survey participants are smokers. We refer to the column "Do you find it difficult to refrain from smoking where it is forbidden (church, library, cinema, plane, etc...)?" in the original data). We will observe the accuracy of our model to determine if personality traits are alone to predict control of smoking habits. We will also use other data related to a given individual (age, gender, number of smoking friends, etc...) to develop another model and compare the performance between the two.

Due to difficulties in manipulating the given .xls file due to its format in Python, we instead performed some manipulation for the dataset file. We have only included the relevant columns of the personality of individuals as well as the control variable in the truncated dataset "personality_traits.csv"

We have changed the values of the attributes to be intelligible by the neural network through modeling the values of disagreement and agreement to range between [-3, 3] representing person's dis/agreement's affirmation, then we have deducted useless features such as "Have you smoked at least one full tobacco cigarette (excluding e-cigarettes) once or more in the past 30 days?" since all of its values are "Yes". Afterwards, we discovered there are empty values in the dataset that are not specified by the individuals so we have resorted to use default value for them since the dataset size is small already.

In [160]:
converted_data = personality_smoke_data.copy()

likert_mapping = {
    "Agree strongly": 3.0,
    "Agree moderately": 2.0,
    "Agree a little": 1.0,
    "Neither agree nor disagree": 0.0,
    "Disagree a little": -1.0,
    "Disagree moderately": -2.0,
    "Disagree strongly": -3.0,
    "": 0.0
}

converted_data[['extroversion', 'critical', 'dependable', 'anxious', 'open', 'reserved', 'sympathetic', 'disorganized', 'calm', 'conventional']] = converted_data[['extroversion', 'critical', 'dependable', 'anxious', 'open', 'reserved', 'sympathetic', 'disorganized', 'calm', 'conventional']].map(lambda x: likert_mapping.get(x, x)).fillna(0.0)
converted_data[['control']] = converted_data[['control']].map(lambda x: 0 if x == 'Yes' else 1)
display(converted_data.head())

Unnamed: 0,extroversion,critical,dependable,anxious,open,reserved,sympathetic,disorganized,calm,conventional,control
0,3.0,-2.0,3.0,2.0,3.0,1.0,3.0,-1.0,1.0,-3.0,1
1,2.0,2.0,2.0,1.0,3.0,-3.0,3.0,0.0,3.0,0.0,1
2,-2.0,0.0,1.0,2.0,-1.0,2.0,0.0,1.0,0.0,2.0,1
3,3.0,-2.0,3.0,-3.0,3.0,-2.0,2.0,-3.0,0.0,-3.0,1
4,1.0,1.0,3.0,-2.0,2.0,3.0,0.0,-2.0,3.0,-3.0,1


In [142]:
display(converted_data['critical'].isna().sum())

0

In [143]:
control = converted_data['control']
display(control.head())

0    1
1    1
2    1
3    1
4    1
Name: control, dtype: int64

# Personality-Smoke Control Model

In [161]:
X_tensor = torch.tensor(converted_data.drop(columns=['control']).values, dtype=torch.float32)
y_tensor = torch.tensor(control.values, dtype=torch.float32)

personality_smoke_tensor_dataset = TensorDataset(X_tensor, y_tensor)

In [145]:
# Split the dataset into train, dev, and test sets
train_size = int(0.75 * len(personality_smoke_tensor_dataset))  # 75% for training
dev_size = int(0.15 * len(personality_smoke_tensor_dataset))  # 10% for validation (dev)
test_size = len(personality_smoke_tensor_dataset) - train_size - dev_size  # Remaining 10% for testing

train_data, dev_data, test_data = random_split(personality_smoke_tensor_dataset, [train_size, dev_size, test_size])

batch_size = 64

# Create DataLoader for train, dev, and test sets
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [153]:
class simpleNN(nn.Module):
    def __init__(self):
        super(simpleNN, self).__init__()
        self.fc1 = nn.Linear(10, 64)  # Input layer
        self.fc2 = nn.Linear(64, 32)  # Hidden layer
        self.fc3 = nn.Linear(32, 1)  # Output layer 
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.LeakyReLU()
        self.sigmoid = nn.Sigmoid()
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(32)
  
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.dropout(x)
        x = self.sigmoid(self.fc3(x))
        return x

In [162]:
# Instantiate the model, loss function, and optimizer
model = simpleNN()
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
epochs = 201

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    running_loss = 0

    for features, labels in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(features) 
        loss = criterion(outputs.squeeze(), labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print loss for the epoch
    if epoch % 100 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')
        

Epoch [1/201], Loss: 0.7236
Epoch [101/201], Loss: 0.6057
Epoch [201/201], Loss: 0.6171


## Accuracy Measurements

In [163]:
import torch.nn.functional as F

def evaluate_model(loader):
    model.eval()  # Set the model to evaluation mode
    total, correct = 0, 0
    with torch.no_grad():  # Disable gradient calculation
        for features, labels in loader:
            outputs = model(features)
            predicted = (outputs.squeeze() > 0.5).float()            
            total += labels.size(0) 
            correct += (predicted == labels.squeeze()).sum().item()
    return correct / total

accuracy = evaluate_model(train_loader)
print(f'Train Accuracy: {accuracy * 100:.2f}%')
# Evaluate on dev set
accuracy = evaluate_model(dev_loader)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Train Accuracy: 74.84%
Validation Accuracy: 61.29%


In [164]:
# Evaluate on test set
accuracy = evaluate_model(test_loader)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 54.55%


We deployed neural network to predict if the person has control over himself to not smoke if he wants to or not by using the personal traits attributes of the person and interpreting the output from the feature of whether he find it difficult to refrain from smoking where it is forbidden. We have manipulated and tuned with the parameters such as dropouts and batch de-normalization, etc. until we got satisfied with what we got, following the machine learning workflow technique we learned at the beginning of the semester, First our model train accuracy has reached 100%, which means over fitting with a low validation accuracy of around 50%, however when we increased the dropout value the train accuracy was decreasing while the accuracy of validation was increasing at the same time until we tuned such that both accuracy are almost the same value. 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6765d258-39ce-4a8b-b9bb-8aeaec5d46e2' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>