In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

from sklearn.model_selection import train_test_split

# Single seed shared by every stochastic step in this notebook;
# seeding NumPy's global RNG here makes repeated runs give the same draws.
RNG_SEED = 42
np.random.seed(RNG_SEED)

In [2]:
# Resolve the CSV location relative to the directory the notebook runs from.
PATH = os.getcwd()
data_path = os.path.join(PATH, './zt_data_cleaned2.csv')

# Read the whole file into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=data_path)
print(f'Full DataFrame shape: {df.shape}')

Full DataFrame shape: (229, 6)


In [3]:
# Columns used as model inputs; 'k' is kept separately as the target.
feature_columns = ['formula', 'zt', 'c', 's']
X = df.loc[:, feature_columns]
y = df.loc[:, 'k']

print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')

# Last expression of the cell: preview the first rows of the inputs.
X.head()

Shape of X: (229, 4)
Shape of y: (229,)


Unnamed: 0,formula,zt,c,s
0,Bi1Cu1Se1O1,0.92,446.126,146.245
1,Bi0.98Sn0.02Cu1Se1O1,0.71,407.91,145.164
2,Bi0.96Sn0.04Cu1Se1O1,0.67,426.163,150.029
3,Bi0.94Sn0.06Cu1Se1O1,0.66,275.277,171.11
4,Bi0.92Sn0.08Cu1Se1O1,0.64,333.028,157.056


In [4]:
# Row-level 50/50 split, seeded for reproducibility. No grouping is passed,
# so the split does not take the 'formula' column into account.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=RNG_SEED,
)

# Report the shape of the training inputs, then the test inputs.
for split_features in (X_train, X_test):
    print(split_features.shape)

(114, 4)
(115, 4)


In [5]:
# Compare the number of training rows with the number of distinct formulae
# among them, to show that formulae repeat within the split.
num_rows = X_train.shape[0]
print(f'There are in total {num_rows} rows in the X_train DataFrame.')

train_unique_values = X_train['formula'].unique()
num_unique_formulae = len(train_unique_values)
print(f'But there are only {num_unique_formulae} unique formulae!\n')

# Per-formula row counts in each split.
train_formula_counts = X_train['formula'].value_counts()
test_formula_counts = X_test['formula'].value_counts()

print('Unique formulae and their number of occurances in the X_train DataFrame:')
print(train_formula_counts, '\n')
print('Unique formulae and their number of occurances in the X_test DataFrame:')
print(test_formula_counts)

There are in total 114 rows in the X_train DataFrame.
But there are only 84 unique formulae!

Unique formulae and their number of occurances in the X_train DataFrame:
Bi1Cu1Se1O1                     19
Bi0.92Pb0.08Cu1Se1O1             3
Bi0.94Pb0.06Cu1Se1O1             3
Bi0.90Na0.10Cu1Se1O1             2
Bi0.975Cu1Se1O1                  2
                                ..
Bi1Cu0.99Ag0.01Se1O1             1
Bi0.96Ag0.04Cu1Se1O1             1
Bi0.92Yb0.08Cu1Se1O1             1
B0.975Ca0.025Cu1Se1O1            1
Bi0.85Na0.15Cu1Se1O0.85F0.15     1
Name: formula, Length: 84, dtype: int64 

Unique formulae and their number of occurances in the X_test DataFrame:
Bi1Cu1Se1O1                      20
Bi0.875Mg0.125Cu1Se1O1            2
Bi0.98Na0.02Cu1Se1O1              2
Bi0.985Na0.015Cu1Se1O1            2
Bi0.96Ca0.04Cu1Se1O1              1
                                 ..
Bi0.9Li0.1Cu1Se1O1                1
Bi0.94Pb0.06Cu0.97Fe0.03Se1O1     1
Bi0.975Ba0.025Cu1Se1O1            1
Bi0.98Sb0.

In [6]:
# Distinct formulae across the full dataset, in order of first appearance.
unique_formulae = pd.unique(X['formula'])
print(f'{len(unique_formulae)} unique formulae:\n{unique_formulae}')


171 unique formulae:
['Bi1Cu1Se1O1' 'Bi0.98Sn0.02Cu1Se1O1' 'Bi0.96Sn0.04Cu1Se1O1'
 'Bi0.94Sn0.06Cu1Se1O1' 'Bi0.92Sn0.08Cu1Se1O1' 'Bi0.985Na0.015Cu1Se1O1'
 'Bi0.945Na0.015Pb0.04Cu1Se1O1' 'Bi0.925Na0.015Pb0.06Cu1Se1O1'
 'Bi0.905Na0.015Pb0.08Cu1Se1O1' 'Bi0.99Cd0.01Cu1Se1O1'
 'Bi0.95Cd0.05Cu1Se1O1' 'Bi0.9Cd0.1Cu1Se1O1' 'Bi1Cu0.99Ag0.01Se1O1'
 'Bi1Cu0.97Ag0.03Se1O1' 'Bi0.995Sb0.005Cu1Se1O1' 'Bi0.99Sb0.01Cu1Se1O1'
 'Bi0.98Sb0.02Cu1Se1O1' 'Bi0.95Sb0.05Cu1Se1O1' 'Bi0.92Sb0.08Cu1Se1O1'
 'Bi0.99Sb0.01Cu1Se0.99Te0.01O1' 'Bi0.98Sb0.02Cu1Se0.98Te0.02O1'
 'Bi0.96Sb0.04Cu1Se0.96Te0.04O1' 'Bi0.94Sb0.06Cu1Se0.94Te0.06O1'
 'Bi0.92Sb0.08Cu1Se0.92Te0.08O1' 'Bi0.98Er0.02Cu1Se1O1'
 'Bi0.96Er0.04Cu1Se1O1' 'Bi0.94Er0.06Cu1Se1O1' 'Bi0.92Er0.08Cu1Se1O1'
 'Bi0.90Er0.10Cu1Se1O1' 'Bi0.98Pb0.02Cu0.98Ni0.02Se1O1'
 'Bi0.96Pb0.04Cu0.96Ni0.04Se1O1' 'Bi0.94Pb0.06Cu0.94Ni0.06Se1O1'
 'Bi0.92Pb0.08Cu0.92Ni0.08Se1O1' 'Bi0.90Pb0.10Cu0.90Ni0.10Se1O1'
 'Bi0.975Cu1Se1O1' 'Bi0.95Cu1Se1O1' 'Bi0.9Cu1Se1O1' 'Bi1Cu1Se0.975O1'
 'Bi1C

In [7]:
# Re-seed NumPy's global RNG so the formula draw below gives the same result
# every time this cell is run.
np.random.seed(seed=RNG_SEED)

# Work on a copy so that `unique_formulae` itself is left untouched
all_formulae = unique_formulae.copy()

# Proportional size of each split, as a fraction of the unique formulae
test_size = 0.11
train_size = 1 - test_size

# Number of formulae in each split. The training count is the complement of
# the test count, so the two always add up to the number of unique formulae
# (rounding each fraction independently can leave them off by one).
num_test_samples = int(round(test_size * len(unique_formulae)))
num_train_samples = len(unique_formulae) - num_test_samples

# Randomly choose the formulae for the test dataset (without replacement)
test_formulae = np.random.choice(all_formulae, size=num_test_samples, replace=False)

# Every formula that was not drawn for testing is used for training.
# A set gives constant-time membership checks instead of scanning the
# test array once per formula.
held_out_formulae = set(test_formulae)
all_formulae = [f for f in all_formulae if f not in held_out_formulae]
train_formulae = all_formulae.copy()

# Sanity check: the two lists partition the unique formulae as planned
assert len(train_formulae) == num_train_samples, 'train/test formula counts do not add up'

print('Number of training formulae:', len(train_formulae))
print('Number of testing formulae:', len(test_formulae))

Number of training formulae: 152
Number of testing formulae: 19


In [8]:
# Build the train and test DataFrames by selecting the rows whose formula
# appears in the corresponding formula list.
is_train_row = df['formula'].isin(train_formulae)
is_test_row = df['formula'].isin(test_formulae)

df_train = df.loc[is_train_row]
df_test = df.loc[is_test_row]

# Report the size of each split, then print both frames.
print(f'train dataset shape: {df_train.shape}')
print(f'test dataset shape: {df_test.shape}\n')

print(df_train, '\n')
print(df_test, '\n')

train dataset shape: (204, 6)
test dataset shape: (25, 6)

                  formula  Temperature    zt       k        c        s
0             Bi1Cu1Se1O1          701  0.92  0.7223  446.126  146.245
1    Bi0.98Sn0.02Cu1Se1O1          701  0.71  0.8563  407.910  145.164
2    Bi0.96Sn0.04Cu1Se1O1          701  0.67  1.0033  426.163  150.029
3    Bi0.94Sn0.06Cu1Se1O1          701  0.66  0.8823  275.277  171.110
4    Bi0.92Sn0.08Cu1Se1O1          701  0.64  0.8887  333.028  157.056
..                    ...          ...   ...     ...      ...      ...
224  Bi0.94Yb0.06Cu1Se1O1          700  0.32  0.4415   26.319  275.578
225  Bi0.92Yb0.08Cu1Se1O1          700  0.20  0.4232   10.168  350.129
226  Bi0.90Yb0.10Cu1Se1O1          700  0.19  0.4070    9.856  335.733
227  Bi0.85Yb0.15Cu1Se1O1          700  0.27  0.4486   56.124  173.265
228    Bi0.7Yb0.3Cu1Se1O1          700  0.30  0.4408   60.805  176.350

[204 rows x 6 columns] 

                               formula  Temperature    zt      

In [9]:
# Write the train and test splits to CSV files in the current working
# directory, without the DataFrame index column.
PATH = os.getcwd()
train_path = os.path.join(PATH, 'zt_train.csv')
test_path = os.path.join(PATH, 'zt_test.csv')

for split_frame, split_path in ((df_train, train_path), (df_test, test_path)):
    split_frame.to_csv(split_path, index=False)

In [10]:
from sklearn.neighbors import KNeighborsClassifier as Knn
from sklearn.model_selection import cross_val_score as crovasco
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

In [11]:
from sklearn import preprocessing

# Convert the training target into integer class labels.
# NOTE(review): `y_train` is the target produced by the random row-level
# train_test_split earlier in this notebook, not the 'k' column of the
# formula-grouped `df_train` -- confirm this is the intended training target.
# NOTE(review): 'k' appears to hold continuous values (e.g. 0.7223, 0.8563),
# so presumably almost every distinct value becomes its own class; verify
# that treating this target as a classification label is intended.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(y_train)
y_train_encoded = lab_enc.transform(y_train)