In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [6]:
# Load the cleaned AFLOW table and discard the columns that are not used
# further on: the saved index column ('Unnamed: 0'), 'species' and 'cif_id'.
data = (
    pd.read_csv('../_data/aflow_cleaned.csv')
    .drop(columns=['Unnamed: 0', 'species', 'cif_id'])
)
# Bare last expression so the notebook renders the resulting frame.
data

Unnamed: 0,volume_atom,volume_cell,density,nspecies,stoichiometry,natoms,enthalpy_atom,electronegativity,group_numbers,Egap,Egap_type
0,34.7942,278.3530,5.56878,2,"[0.25, 0.75]",8,-4.35149,2.03,"[3, 17]",4.1084,insulator-indirect
1,29.7161,237.7290,4.65745,2,"[0.25, 0.75]",8,-4.89689,2.13,"[3, 17]",5.0778,insulator-indirect
2,17.0642,51.1926,7.42958,2,"[0.333333333, 0.666666667]",3,-4.18878,1.65,"[3, 1]",0.0000,metal
3,45.4063,45.4063,8.30264,1,[1],1,-4.09387,1.10,[3],0.0000,metal
4,18.5582,92.7911,8.98457,2,"[0.4, 0.6]",5,-7.94305,2.27,"[3, 16]",3.5216,insulator-indirect
...,...,...,...,...,...,...,...,...,...,...,...
59983,23.4196,46.8392,6.46785,1,[1],2,-8.54349,1.33,[4],0.0000,metal
59984,22.8466,22.8466,6.63007,1,[1],1,-8.46250,1.33,[4],0.0000,metal
59985,23.4185,46.8370,6.46815,1,[1],2,-8.54349,1.33,[4],0.0000,metal
59986,23.4194,46.8388,6.46791,1,[1],2,-8.54349,1.33,[4],0.0000,metal


In [7]:
# Preprocess the feature columns of `data`:
#   1. ordinal-encode the list-like text columns,
#   2. standardise and then min-max scale every feature column,
#   3. keep only the rows belonging to the three band-gap classes of interest.
#
# Each transformer is fitted once on all of its columns (they operate per
# feature), so after this cell the fitted objects hold the parameters for
# every column and can be reused with `transform` / `inverse_transform`.
#
# NOTE(review): the encoder and scalers are fitted on all rows, including the
# rows removed by the class filter at the end of this cell and the rows that
# a later train/test split places in the test set. Confirm this is intended.
ordinal_encoder = OrdinalEncoder()
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

categorical_cols = ['stoichiometry', 'group_numbers']
cols_to_transform = ['volume_atom', 'volume_cell', 'density', 'nspecies', 'stoichiometry', 'natoms', 'enthalpy_atom', 'electronegativity', 'group_numbers']

# Band-gap classes to keep; rows with any other 'Egap_type' are dropped.
keep_classes = ['metal', 'insulator-indirect', 'insulator-direct']

# Ordinal encoding for categorical variables, i.e. stoichiometry and group_numbers.
# Every distinct value becomes one numeric code, assigned independently per column.
encoded = ordinal_encoder.fit_transform(data[categorical_cols])
data = data.assign(**{col: encoded[:, i] for i, col in enumerate(categorical_cols)})

# Standard scaling followed by min-max normalisation of all feature columns.
# Standardisation is an increasing affine map per column, so the final values
# are determined by the min-max step; the standard scaler is kept so that it
# is still a fitted object after this cell.
standardized = standard_scaler.fit_transform(data[cols_to_transform])
scaled = minmax_scaler.fit_transform(standardized)
data = data.assign(**{col: scaled[:, i] for i, col in enumerate(cols_to_transform)})

# Restrict the band-gap type data to the 3 classes above. `.copy()` makes the
# result an independent frame so later column assignments do not raise
# chained-assignment warnings.
data = data[data['Egap_type'].isin(keep_classes)].copy()

In [8]:
# Hold out 5% of the preprocessed rows as a test set; the fixed random_state
# makes the split reproducible between runs.
training_set, test_set = train_test_split(data, random_state=42, test_size=0.05)

# Write both subsets to CSV (the DataFrame index is written as the first column).
for subset, csv_path in (
    (training_set, '../_data/aflow_training_set.csv'),
    (test_set, '../_data/aflow_test_set.csv'),
):
    subset.to_csv(csv_path)