In [1]:
import pandas as pd 
import numpy as np 
import torch 

In [2]:
train_dict = torch.load("train_pv_xrd.pt")
val_dict = torch.load("val_pv_xrd.pt")
#test_dict = torch.load("test_pv_xrd.pt")

In [3]:
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
#test_df = pd.read_csv("test.csv")

In [4]:
def df_to_spacegroup_data(df): 
    spacegroup_number = np.array(df['spacegroup.number'])
    spacegroup_number_one_hot_encoded = np.eye(230)[spacegroup_number]
    
    return torch.tensor(spacegroup_number_one_hot_encoded)

In [5]:
data_dict = {
    'train': (train_dict, train_df),
    'val': (val_dict, val_df), 
#    'test': (test_dict, test_df)
}

In [6]:
def df_to_spacegroup_data(df): 
    spacegroup_number = np.array(df['spacegroup.number'])
    spacegroup_number_one_hot_encoded = np.eye(230)[spacegroup_number]
    
    return torch.tensor(spacegroup_number_one_hot_encoded)

In [7]:
training_sgs = torch.tensor(train_df['spacegroup.number'], dtype=torch.long)
val_sgs = torch.tensor(val_df['spacegroup.number'], dtype=torch.long)
#test_sgs = df_to_spacegroup_data(test_df)

In [8]:
def convert_to_tensor(pseudo_voight_dict, df): 
    list_of_pseudo_voights = []
    for key in df['material_id']:
        key += "_0"
        value = pseudo_voight_dict[key]
        list_of_pseudo_voights.append(value)

    tensor_of_pseudo_voights = torch.tensor(torch.stack(list_of_pseudo_voights))
    return(tensor_of_pseudo_voights)

In [9]:
training_pvs = convert_to_tensor(train_dict, train_df)
val_pvs = convert_to_tensor(val_dict, val_df)
#test_pvs = convert_to_tensor(test_dict)

  tensor_of_pseudo_voights = torch.tensor(torch.stack(list_of_pseudo_voights))


In [10]:
training_sgs.shape

torch.Size([36006])

In [11]:
training_pvs.shape

torch.Size([36006, 1, 8500])

In [12]:
val_sgs.shape

torch.Size([4523])

In [13]:
import ast 

In [14]:
train_df['atomic_numbers'] = train_df['atomic_numbers'].apply(ast.literal_eval)
val_df['atomic_numbers'] = val_df['atomic_numbers'].apply(ast.literal_eval)

In [15]:
import numpy as np

def create_multi_hot_embedding(input_list, max_value = 99):
    # Initialize an array of zeros with length max_value + 1
    multi_hot_embedding = np.zeros(max_value + 1, dtype=int)
    
    #subtract out 1 to index from 0 (hydrogen)
    input_list = [value - 1 for value in input_list]

    # Set the corresponding index for each number in the input list to 1
    for num in input_list:
        multi_hot_embedding[num] += 1
    
    return multi_hot_embedding

In [16]:
train_df['composition_multihot'] = train_df['atomic_numbers'].apply(create_multi_hot_embedding)
val_df['composition_multihot'] = val_df['atomic_numbers'].apply(create_multi_hot_embedding)

In [17]:
train_composition = torch.tensor(train_df['composition_multihot'])
val_composition = torch.tensor(val_df['composition_multihot'])
train_composition = train_composition.unsqueeze(1)
val_composition = val_composition.unsqueeze(1)

In [21]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from torch.utils.data import random_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [19]:
training_pvs = training_pvs.to(device)
training_sgs = training_sgs.to(device)
val_pvs = val_pvs.to(device)
val_sgs = val_sgs.to(device)

train_composition = train_composition.to(device)
val_composition = val_composition.to(device)

In [20]:
train_xrd_comp = torch.cat((training_pvs, train_composition), dim = 2)
val_xrd_comp = torch.cat((val_pvs, val_composition), dim = 2)

In [21]:
train_dataset = TensorDataset(train_xrd_comp, training_sgs)
val_dataset = TensorDataset(val_xrd_comp, val_sgs)

In [22]:
# Create DataLoaders for train and validation sets
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)  

In [23]:
# Define the neural network with MLP at the end
class SimpleConvNet(nn.Module):
    def __init__(self, in_channels, output_dim):
        super(SimpleConvNet, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels, 80, kernel_size = 100, stride=5),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Conv1d(80, 80, 50, stride=5),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Conv1d(80, 80, 25, stride=2),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        self.flatten = nn.Flatten()
        # Calculate flattened_size dynamically
        self.flattened_size = self._get_flattened_size(input_shape=(1, in_channels, 8500))
        self.MLP = nn.Sequential(
            nn.Linear(self.flattened_size, 2300),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(2300, 1150),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1150, output_dim)
        )

        self.composition_net = nn.Sequential(
            nn.Linear(100, 5),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

        self.merge_net = nn.Sequential(
            nn.Linear(235, 230),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(230, 230),
        )

    def _get_flattened_size(self, input_shape):
        dummy_input = torch.zeros(input_shape)
        with torch.no_grad():
            dummy_output = self.conv_layers(dummy_input)
        return int(np.prod(dummy_output.shape))

    def forward(self, input):
        x = input[:, :, :8500] # x is "xrd"
        
        c = input[:, :, 8500:] # rest is the composition. will have dimension N x 1 x 100. done for convenience in concating to the xrd 
        c = c.squeeze(1) # now it's N x 100 so can treat like "normal" mlp input 

        x = self.conv_layers(x)
        x = self.flatten(x)
        x = self.MLP(x)

        c = self.composition_net(c)

        m = torch.cat((x, c), dim = 1) # dimension of x is now N x 230. dim of c is N x 5.
        m = self.merge_net(m)

        return x
    
# Create the model instance and move it to the selected device
output_dim = 230  # Output dimension
model = SimpleConvNet(in_channels=1, output_dim=output_dim).to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.0002)
criterion = nn.CrossEntropyLoss()

In [24]:
for epoch in range(1000):
    model.train()  # Set the model to training mode
    total_train_loss = 0
    correct_train = 0
    total_train = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total_train += targets.size(0)
        correct_train += (predicted == targets).sum().item()

    train_accuracy = 100 * correct_train / total_train

    # Validation phase
    model.eval()  # Set the model to evaluation mode
    total_valid_loss = 0
    correct_valid = 0
    total_valid = 0

    with torch.no_grad():  # No gradients needed for validation
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_valid_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1) 
            total_valid += targets.size(0)
            correct_valid += (predicted == targets).sum().item()

    valid_accuracy = 100 * correct_valid / total_valid

    print(f"Epoch {epoch+1}, Training Loss: {total_train_loss / len(train_loader)}, Training Accuracy: {train_accuracy}%, Validation Loss: {total_valid_loss / len(valid_loader)}, Validation Accuracy: {valid_accuracy}%")

Epoch 1, Training Loss: 3.988490551076037, Training Accuracy: 14.872521246458923%, Validation Loss: 3.5454098516040378, Validation Accuracy: 20.45102807870882%
Epoch 2, Training Loss: 3.2660929480343, Training Accuracy: 24.115425206909958%, Validation Loss: 2.9743677775065103, Validation Accuracy: 29.338934335617953%
Epoch 3, Training Loss: 2.861892248721833, Training Accuracy: 31.44753652169083%, Validation Loss: 2.6048078404532538, Validation Accuracy: 37.12137961529958%
Epoch 4, Training Loss: 2.582221078534498, Training Accuracy: 37.43265011386991%, Validation Loss: 2.3483763800726996, Validation Accuracy: 43.02454123369445%
Epoch 5, Training Loss: 2.327994478509781, Training Accuracy: 42.867855357440426%, Validation Loss: 2.134657171037462, Validation Accuracy: 47.13685606898076%
Epoch 6, Training Loss: 2.142900915010601, Training Accuracy: 46.375604065989%, Validation Loss: 1.98390926917394, Validation Accuracy: 50.82909573292063%
Epoch 7, Training Loss: 1.9809799059062985, Train