In [11]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

from sklearn.model_selection import train_test_split

# Fix NumPy's global random state once so that every random draw made
# further down is repeatable from a fresh kernel
RNG_SEED = 42
np.random.seed(RNG_SEED)

In [12]:
# Locate the cleaned dataset in the current working directory.
# The file name is joined without a leading './' so the resulting path is
# already normalised and is built the same way as the output paths written
# later in this notebook.
PATH = os.getcwd()
data_path = os.path.join(PATH, 'zt_data_cleaned.csv')

# Load the full dataset and report its (rows, columns) shape
df = pd.read_csv(data_path)
print(f'Full DataFrame shape: {df.shape}')

Full DataFrame shape: (152, 5)


In [13]:
# Preview the first five rows of the loaded dataset
df.head(n=5)

Unnamed: 0,formula,k,c,s,zt
0,Bi1Cu1Se1O1,1.0,1.0,1.0,1.0
1,Bi0.98Sn0.02Cu1Se1O1,1.185518,0.914338,0.992608,0.759896
2,Bi0.96Sn0.04Cu1Se1O1,1.389035,0.955253,1.025874,0.723758
3,Bi0.94Sn0.06Cu1Se1O1,1.221515,0.617039,1.170023,0.691516
4,Bi0.92Sn0.08Cu1Se1O1,1.230375,0.746489,1.073924,0.699733


In [14]:
# Input columns kept in X; the 'c' column is used as the target y
feature_columns = ['formula', 'zt', 'k', 's']
X = df[feature_columns]
y = df['c']

# Report the dimensions of both objects, then preview the inputs
print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')
X.head()

Shape of X: (152, 4)
Shape of y: (152,)


Unnamed: 0,formula,zt,k,s
0,Bi1Cu1Se1O1,1.0,1.0,1.0
1,Bi0.98Sn0.02Cu1Se1O1,0.759896,1.185518,0.992608
2,Bi0.96Sn0.04Cu1Se1O1,0.723758,1.389035,1.025874
3,Bi0.94Sn0.06Cu1Se1O1,0.691516,1.221515,1.170023
4,Bi0.92Sn0.08Cu1Se1O1,0.699733,1.230375,1.073924


In [15]:
# Row-wise 80/20 split of the samples, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=RNG_SEED,
)

# Show the shape of each split, one per line
print(X_train.shape, X_test.shape, sep='\n')

(121, 4)
(31, 4)


In [16]:
# Compare the number of rows in the training inputs with the number of
# distinct formulae they contain
num_rows = X_train.shape[0]
print(f'There are in total {num_rows} rows in the X_train DataFrame.')

train_formula_values = X_train['formula'].unique()
num_unique_formulae = len(train_formula_values)
print(f'But there are only {num_unique_formulae} unique formulae!\n')

# Count how often each formula appears in each split
train_formula_counts = X_train['formula'].value_counts()
test_formula_counts = X_test['formula'].value_counts()

print('Unique formulae and their number of occurances in the X_train DataFrame:')
print(train_formula_counts, '\n')
print('Unique formulae and their number of occurances in the X_test DataFrame:')
print(test_formula_counts)

There are in total 121 rows in the X_train DataFrame.
But there are only 109 unique formulae!

Unique formulae and their number of occurances in the X_train DataFrame:
Bi0.900Ca0.100Cu1Se1O1    3
Bi0.925Ca0.075Cu1Se1O1    3
Bi0.92Pb0.08Cu1Se1O1      3
Bi0.925Mg0.075Cu1Se1O1    2
Bi0.92Na0.08Cu1Se1O1      2
                         ..
Bi0.94Ag0.06Cu1Se1O1      1
Bi0.96Ho0.04Cu1Se1O1      1
Bi0.90K0.10Cu1Se1O1       1
Bi0.90Y0.10Cu1Se1O1       1
Bi0.94Yb0.06Cu1Se1O1      1
Name: formula, Length: 109, dtype: int64 

Unique formulae and their number of occurances in the X_test DataFrame:
Bi0.96Pb0.04Cu1Se1O1          2
Bi0.98K0.02Cu1Se1O1           1
Bi0.94Pb0.06Cu1Se1O1          1
Bi0.98Na0.02Cu1Se1O1          1
Bi0.96Ca0.02Pb0.02Cu1Se1O1    1
Bi0.94Na0.06Cu1Se1O1          1
Bi0.950In0.050Cu1Se1O1        1
Bi0.90Na0.10Cu1Se1O1          1
Bi0.96Na0.04Cu1Se1O1          1
Bi0.850Mg0.150Cu1Se1O1        1
Bi0.88Na0.12Cu1Se1O1          1
Bi0.92Yb0.08Cu1Se1O1          1
Bi0.92Ho0.08Cu1Se1O1     

In [17]:
# Collect the distinct formulae over the whole dataset, in order of first appearance
unique_formulae = pd.unique(X['formula'])
print(f'{len(unique_formulae)} unique formulae:\n{unique_formulae}')


132 unique formulae:
['Bi1Cu1Se1O1' 'Bi0.98Sn0.02Cu1Se1O1' 'Bi0.96Sn0.04Cu1Se1O1'
 'Bi0.94Sn0.06Cu1Se1O1' 'Bi0.92Sn0.08Cu1Se1O1' 'Bi0.99Cd0.01Cu1Se1O1'
 'Bi0.95Cd0.05Cu1Se1O1' 'Bi0.90Cd0.10Cu1Se1O1' 'Bi0.995Sb0.005Cu1Se1O1'
 'Bi0.990Sb0.010Cu1Se1O1' 'Bi0.980Sb0.020Cu1Se1O1'
 'Bi0.950Sb0.050Cu1Se1O1' 'Bi0.920Sb0.080Cu1Se1O1' 'Bi0.98Er0.02Cu1Se1O1'
 'Bi0.96Er0.04Cu1Se1O1' 'Bi0.94Er0.06Cu1Se1O1' 'Bi0.92Er0.08Cu1Se1O1'
 'Bi0.90Er0.10Cu1Se1O1' 'Bi0.96Pb0.04Cu1Se1O1' 'Bi0.94Pb0.06Cu1Se1O1'
 'Bi0.92Pb0.08Cu1Se1O1' 'Bi0.975Mg0.25Cu1Se1O1' 'Bi0.95Mg0.05Cu1Se1O1'
 'Bi0.925Mg0.075Cu1Se1O1' 'Bi0.9Mg0.1Cu1Se1O1' 'Bi0.875Mg0.125Cu1Se1O1'
 'Bi0.850Mg0.150Cu1Se1O1' 'Bi0.825Mg0.175Cu1Se1O1'
 'Bi0.800Mg0.200Cu1Se1O1' 'Bi0.98Na0.02Cu1Se1O1' 'Bi0.96Na0.04Cu1Se1O1'
 'Bi0.94Na0.06Cu1Se1O1' 'Bi0.92Na0.08Cu1Se1O1' 'Bi0.90Na0.10Cu1Se1O1'
 'Bi0.85Na0.15Cu1Se1O1' 'Bi0.80Na0.20Cu1Se1O1' 'Bi0.98K0.02Cu1Se1O1'
 'Bi0.96K0.04Cu1Se1O1' 'Bi0.94K0.06Cu1Se1O1' 'Bi0.92K0.08Cu1Se1O1'
 'Bi0.90K0.10Cu1Se1O1' 'Bi0.970In0.030C

In [23]:
# Split the dataset at the level of unique formulae, so that every row
# sharing a formula ends up on the same side of the train/test split.

# Set a random seed to ensure reproducibility across runs
np.random.seed(seed=RNG_SEED)

# Start from the full list of unique formulae
all_formulae = unique_formulae.copy()

# Define the proportional size of the dataset split
test_size = 0.15
train_size = 1 - test_size

# Calculate the number of formulae in each split. The training count is
# derived from the test count (instead of being rounded independently) so
# the two are guaranteed to add up to the total number of formulae.
num_test_samples = int(round(test_size * len(unique_formulae)))
num_train_samples = len(unique_formulae) - num_test_samples

# Randomly choose the formulae for the test dataset (without replacement)
test_formulae = np.random.choice(all_formulae, size=num_test_samples, replace=False)

# Remove the chosen formulae from the list; a set gives constant-time
# membership tests instead of scanning the NumPy array once per formula
held_out_formulae = set(test_formulae)
all_formulae = [f for f in all_formulae if f not in held_out_formulae]

# The remaining formulae will be used for the training dataset
train_formulae = all_formulae.copy()

# Sanity check: the split must account for every unique formula exactly once
assert len(train_formulae) == num_train_samples, (
    'train/test formulae do not partition the unique formulae'
)

print('Number of training formulae:', len(train_formulae))
print('Number of testing formulae:', len(test_formulae))

Number of training formulae: 112
Number of testing formulae: 20


In [24]:
# Build the train and test DataFrames by keeping the rows of the original
# dataset whose formula appears in the corresponding formulae list
is_train_row = df['formula'].isin(train_formulae)
is_test_row = df['formula'].isin(test_formulae)

df_train = df[is_train_row]
df_test = df[is_test_row]

# Report the size of each split
print(f'train dataset shape: {df_train.shape}')
print(f'test dataset shape: {df_test.shape}\n')

# Preview the first rows of each split
print(df_train.head(), '\n')
print(df_test.head(), '\n')

train dataset shape: (127, 5)
test dataset shape: (25, 5)

                formula         k         c         s        zt
0           Bi1Cu1Se1O1  1.000000  1.000000  1.000000  1.000000
1  Bi0.98Sn0.02Cu1Se1O1  1.185518  0.914338  0.992608  0.759896
2  Bi0.96Sn0.04Cu1Se1O1  1.389035  0.955253  1.025874  0.723758
3  Bi0.94Sn0.06Cu1Se1O1  1.221515  0.617039  1.170023  0.691516
5  Bi0.99Cd0.01Cu1Se1O1  0.835767  1.517739  0.946847  1.628063 

                   formula         k         c         s        zt
4     Bi0.92Sn0.08Cu1Se1O1  1.230375  0.746489  1.073924  0.699733
19    Bi0.94Pb0.06Cu1Se1O1  1.291246  6.048554  0.585423  1.605395
26  Bi0.850Mg0.150Cu1Se1O1  1.298553  3.974598  0.792604  1.922854
27  Bi0.825Mg0.175Cu1Se1O1  1.403130  4.298077  0.738442  1.670355
31    Bi0.94Na0.06Cu1Se1O1  0.759008  1.388908  0.911638  1.520800 



In [25]:
# Write both splits to CSV files in the current working directory
PATH = os.getcwd()

train_path, test_path = (
    os.path.join(PATH, file_name) for file_name in ('zt_train.csv', 'zt_test.csv')
)

# index=False leaves the DataFrame index out of the written files
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

In [21]:
from sklearn.neighbors import KNeighborsClassifier as Knn
from sklearn.model_selection import cross_val_score as crovasco
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

In [22]:
from sklearn import preprocessing
# Map each distinct value of y_train to an integer label.
# NOTE(review): y_train is taken from the 'c' column, which holds continuous
# floats, so nearly every sample may end up as its own class - confirm that
# a classification target is really intended here rather than regression.
# NOTE(review): y_train comes from the row-wise train_test_split, not from
# the formula-level df_train split - verify which split should be used.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(y_train)
y_train_encoded = lab_enc.transform(y_train)