In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.metrics import *
from mendeleev import element
pd.set_option('display.max_columns', None)

In [None]:
# Read 'llzo_dataset_clean-4.csv' (path relative to the working directory) into df1.
df1 = pd.read_csv(filepath_or_buffer='llzo_dataset_clean-4.csv')

In [None]:
# Discard rows in which every column is missing; partially filled rows are kept.
# (Row-wise dropping is dropna's default axis.)
df1 = df1.dropna(how='all')

In [None]:
# Independent copy of df1 without the 'source', 'conductivity' and 'log_cond' columns.
excluded_columns = ['source', 'conductivity', 'log_cond']
df = df1.drop(columns=excluded_columns).copy()

In [None]:
# Remove fully duplicated rows; the first occurrence of each is retained (the default).
df = df.drop_duplicates()

In [None]:
# Mark missing dopant entries with the placeholder 'none'.
# Assigning the filled columns back to df (instead of calling fillna(inplace=True) on a
# column selection) guarantees the frame itself is updated, including when pandas
# Copy-on-Write is active, and avoids building a throwaway list of None values.
df[['li_dopant', 'la_dopant', 'zr_dopant']] = (
    df[['li_dopant', 'la_dopant', 'zr_dopant']].fillna('none')
)

[None, None, None]

In [None]:
# Upper-case only the first character of every dopant label and leave the remainder
# untouched (e.g. 'none' -> 'None').
for dopant_col in ('li_dopant', 'la_dopant', 'zr_dopant'):
    df[dopant_col] = df[dopant_col].apply(lambda label: label[0].upper() + label[1:])

In [None]:
# Replace each dopant label by the atomic number looked up through mendeleev's
# element(); the placeholder "None" is encoded as 0.
for dopant_col in ('li_dopant', 'la_dopant', 'zr_dopant'):
    df[dopant_col] = df[dopant_col].apply(
        lambda symbol: 0 if symbol == "None" else element(symbol).atomic_number
    )

In [None]:
# Three task-specific copies of df:
#   df_ionic  - all rows, without the 'rel_dens_%' column
#   df_relden - rows that have a 'rel_dens_%' value, without the 'good_cond' column
#   df_both   - rows that have a 'rel_dens_%' value, all columns kept
df_ionic = df.drop(columns=['rel_dens_%']).copy()
df_relden = df.drop(columns=['good_cond']).dropna(subset=['rel_dens_%']).copy()
df_both = df.dropna(subset=['rel_dens_%']).copy()

In [None]:
# Renumber the rows of each task frame from 0, discarding the old index labels.
df_ionic = df_ionic.reset_index(drop=True)
df_relden = df_relden.reset_index(drop=True)
df_both = df_both.reset_index(drop=True)

In [None]:
# Scaler instance for optional feature standardisation; in the cells visible here it is
# only referenced by the commented-out scaling step further down.
scaler = StandardScaler()

## Predicting Ionic Conductivity

In [None]:
# uncomment to include relative density
# df_ionic = df.dropna(subset=['rel_dens_%'])

In [None]:
# cols_to_scale = ['li_sto', 'la_sto', 'zr_sto', 'li_dop_sto', 'la_dop_sto', 'zr_dop_sto', 'li_dopant', 'la_dopant', 'zr_dopant',
#                      'li_dop_ionicrad', 'la_dop_ionicrad',
#                      'zr_dop_ionicrad', 'li_dop_enev', 'la_dop_enev', 'zr_dop_enev']

In [None]:
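# NOTE: data_train / data_val are only created by the train_test_split cell below, so this
# scaling step has to run after that cell if it is re-enabled.
# data_train[cols_to_scale] = scaler.fit_transform(data_train[cols_to_scale])
# data_val[cols_to_scale] = scaler.transform(data_val[cols_to_scale])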

In [None]:
# Split df_ionic into training and validation frames using train_test_split's default
# proportions; random_state is fixed so the same split is produced on every run.
data_train, data_val = train_test_split(df_ionic, random_state=42)

In [None]:
# Separate the 'good_cond' target from the feature columns for both partitions.
X_train = data_train.drop(columns=['good_cond'])
y_train = data_train['good_cond']
X_val = data_val.drop(columns=['good_cond'])
y_val = data_val['good_cond']

In [None]:
# correlations = df_ionic.corr()[['good_cond']].sort_values(by='good_cond', ascending=False)
# plt.figure(figsize=(5,8))
# sns.heatmap(correlations, cmap='coolwarm', annot = correlations)

In [None]:
# models,predictions = clf.fit(X_train, X_val, y_train, y_val)

In [None]:
# print(models)

In [None]:
# Fit and score lazypredict's collection of classifiers on the train/validation split.
clf = LazyClassifier(verbose=False, ignore_warnings=True, custom_metric=None)

# Features and 'good_cond' target are rebuilt here so the cell does not rely on the
# earlier split cell having been executed.
X_train = data_train.drop(columns=['good_cond'])
y_train = data_train['good_cond']
X_val = data_val.drop(columns=['good_cond'])
y_val = data_val['good_cond']

m, p = clf.fit(X_train, X_val, y_train, y_val)

100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 15.12it/s]

[LightGBM] [Info] Number of positive: 63, number of negative: 67
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.484615 -> initscore=-0.061558
[LightGBM] [Info] Start training from score -0.061558





In [None]:
# Show the model comparison table once, using the notebook's rich DataFrame display.
# `m` is the first value returned by LazyClassifier.fit (the score table). The classifier
# above is created without predictions enabled, in which case lazypredict returns that
# same table as the second value too, so printing both `p` and `m` repeated it.
m

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LabelPropagation                   0.73               0.72     0.72      0.72   
LabelSpreading                     0.73               0.72     0.72      0.72   
KNeighborsClassifier               0.70               0.70     0.70      0.70   
AdaBoostClassifier                 0.68               0.68     0.68      0.68   
ExtraTreesClassifier               0.68               0.68     0.68      0.68   
DecisionTreeClassifier             0.66               0.66     0.66      0.66   
ExtraTreeClassifier                0.66               0.66     0.66      0.66   
RandomForestClassifier             0.64               0.64     0.64      0.64   
LogisticRegression                 0.64               0.64     0.64      0.64   
CalibratedClassifierCV             0.64               0.64     0.64      0.64   
Perceptron                  

## Summary of Results

with time only: 0.75  
with temp only: 0.73  
with both temp and time: 0.73  
with neither temp nor time: 0.70  
  
Adding sintering time alone gives the best result. Temperature affects the outcome less than time does, and adding both features together hurts performance rather than improving it.

## Testing With New Dataset

In [None]:
def get_enev(x):
    """Return the Pauling electronegativity for the element symbol ``x``.

    'Eu' is answered with the hard-coded value 1.2 without any lookup
    (presumably because the library has no Pauling value for it - TODO confirm).
    Every other symbol is resolved through ``element(x).en_pauling``.

    Any exception raised along the way (for instance for the "None"
    placeholder) is swallowed and 0 is returned instead.
    """
    try:
        return 1.2 if x == 'Eu' else element(x).en_pauling
    except Exception:
        # Best-effort lookup: anything that cannot be resolved counts as 0.
        return 0

In [None]:
def get_irad(x):
    """Return the mean of all ionic radii listed for the element symbol ``x``.

    The radii are taken from the ``ionic_radius`` attribute of every entry in
    ``element(x).ionic_radii`` and averaged with equal weight.

    Any exception (unknown symbol, no radii listed, non-numeric radius, ...)
    is swallowed and 0 is returned instead.
    """
    try:
        values = [entry.ionic_radius for entry in element(x).ionic_radii]
        return sum(values) / len(values)
    except Exception:
        # Best-effort lookup: anything that cannot be averaged counts as 0.
        return 0

In [None]:
def get_anum(x):
    """Return the atomic number for the element symbol ``x``.

    The placeholder "None" (no dopant) is encoded as 0. If the symbol cannot
    be resolved through ``element(x)``, a message naming the offending value
    is printed and 0 is returned, so the resulting feature column stays
    numeric instead of silently receiving ``None``. This mirrors the 0
    fallback used by ``get_enev`` and ``get_irad``.
    """
    if x == "None":
        return 0
    try:
        return element(x).atomic_number
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts the kernel.
        print(f"get_anum: could not resolve element {x!r}; encoding it as 0")
        return 0

In [None]:
# Read 'llzo_testing.csv' and discard rows in which every column is missing.
df_test = pd.read_csv('llzo_testing.csv')
df_test = df_test.dropna(how='all')

In [None]:
# Normalise the dopant labels of the test set: missing values become "None", everything
# is cast to str, and the first character of each label is upper-cased.
dopant_cols = ['li_dopant', 'la_dopant', 'zr_dopant']
df_test[dopant_cols] = df_test[dopant_cols].fillna("None").astype('str')
for dopant_col in dopant_cols:
    df_test[dopant_col] = df_test[dopant_col].apply(lambda label: label[0].upper() + label[1:])

In [None]:
# Electronegativity feature for each dopant site, derived from the dopant label.
for target_col, dopant_col in (
    ('li_dop_enev', 'li_dopant'),
    ('la_dop_enev', 'la_dopant'),
    ('zr_dop_enev', 'zr_dopant'),
):
    df_test[target_col] = df_test[dopant_col].apply(get_enev)

In [None]:
# Mean ionic radius feature for each dopant site, derived from the dopant label.
for target_col, dopant_col in (
    ('li_dop_ionicrad', 'li_dopant'),
    ('la_dop_ionicrad', 'la_dopant'),
    ('zr_dop_ionicrad', 'zr_dopant'),
):
    df_test[target_col] = df_test[dopant_col].apply(get_irad)

In [None]:
# Finally overwrite the dopant labels themselves with their atomic numbers. This has to
# come after the label-based features above, which still need the element symbols.
for dopant_col in ('li_dopant', 'la_dopant', 'zr_dopant'):
    df_test[dopant_col] = df_test[dopant_col].apply(get_anum)

In [None]:
# Remove the columns that are not used as model inputs or as the target.
unused_columns = ['source', 'rel_dens_%', 'conductivity', 'log_cond']
df_test = df_test.drop(columns=unused_columns)

In [None]:
# using best classifier
clf = sklearn.semi_supervised.LabelPropagation()

# `data_train_withtime` was not defined by any cell of this notebook, so this cell failed
# with a NameError on a fresh kernel. It is rebuilt here from the training split.
# NOTE(review): the name and the prediction cell below (which removes 'sintering_temp'
# from the test features) indicate the time-only feature set, i.e. the training data
# without 'sintering_temp' - confirm this matches the original experiment.
data_train_withtime = data_train.drop(columns=['sintering_temp'])

X_train = data_train_withtime.drop(columns=['good_cond'])
y_train = data_train_withtime['good_cond']
clf.fit(X_train, y_train)

In [None]:
# Predict on exactly the columns the classifier was fitted on, in the fitted order.
# Selecting by X_train.columns (rather than dropping 'good_cond' and 'sintering_temp')
# keeps this cell re-runnable after 'predicted_cond' has been added to df_test, and
# raises a KeyError if the test file lacks a training feature.
preds = clf.predict(df_test[X_train.columns])

In [None]:
# Store the predictions as a new column next to the true 'good_cond' labels.
df_test['predicted_cond'] = preds

In [None]:
# Per-class precision / recall / F1 of the predictions against the true labels.
# classification_report returns plain text, so it is printed rather than displayed.
report = classification_report(df_test['good_cond'], df_test['predicted_cond'])
print(report)

              precision    recall  f1-score   support

         0.0       0.58      0.93      0.72        15
         1.0       0.50      0.09      0.15        11

    accuracy                           0.58        26
   macro avg       0.54      0.51      0.44        26
weighted avg       0.55      0.58      0.48        26



In [None]:
# Display the test frame, which now includes the 'predicted_cond' column.
df_test

Unnamed: 0,li_sto,la_sto,zr_sto,sintering_temp,sintering_time,li_dop_sto,la_dop_sto,zr_dop_sto,good_cond,li_dopant,la_dopant,zr_dopant,li_dop_ionicrad,la_dop_ionicrad,zr_dop_ionicrad,li_dop_enev,la_dop_enev,zr_dop_enev,predicted_cond
0,6.4,3.0,1.4,1050.0,6.0,0.0,0.0,0.6,0.0,0,0,73,0.0,0,69.4,0.0,0,1.5,0
1,6.4,3.0,1.6,1050.0,6.0,0.07,0.0,0.4,0.0,31,0,73,54.67,0,69.4,1.81,0,1.5,0
2,6.4,3.0,2.0,1050.0,6.0,0.13,0.0,0.2,0.0,31,0,73,54.67,0,69.4,1.81,0,1.5,0
3,6.4,3.0,1.6,1050.0,6.0,0.2,0.0,0.0,0.0,31,0,0,54.67,0,0.0,1.81,0,0.0,0
4,6.4,3.0,1.4,1050.0,12.0,0.0,0.0,0.6,0.0,0,0,73,0.0,0,69.4,0.0,0,1.5,1
5,6.4,3.0,1.6,1050.0,12.0,0.07,0.0,0.4,1.0,31,0,73,54.67,0,69.4,1.81,0,1.5,0
6,6.4,3.0,2.0,1050.0,12.0,0.13,0.0,0.2,1.0,31,0,73,54.67,0,69.4,1.81,0,1.5,0
7,6.4,3.0,1.6,1050.0,12.0,0.2,0.0,0.0,1.0,31,0,0,54.67,0,0.0,1.81,0,0.0,0
8,6.4,3.0,1.4,1100.0,6.0,0.0,0.0,0.6,0.0,0,0,73,0.0,0,69.4,0.0,0,1.5,0
9,6.4,3.0,1.6,1100.0,6.0,0.07,0.0,0.4,0.0,31,0,73,54.67,0,69.4,1.81,0,1.5,0


In [None]:
# Persist the test frame, including predictions, to 'validation.csv' without the index.
df_test.to_csv(path_or_buf='validation.csv', index=False)

In [None]:
# Three hand-written compositions to run through the fitted classifier.
# Each record is one sample; dopants are given as element symbols, "None" = no dopant.
new_samples = [
    dict(
        li_sto=6.95, la_sto=3, zr_sto=1.9,
        sintering_temp=1200, sintering_time=6,
        li_dop_sto=0.05, la_dop_sto=0, zr_dop_sto=0.1,
        li_dopant='Ta', la_dopant='None', zr_dopant='Ge',
    ),
    dict(
        li_sto=6.6, la_sto=3, zr_sto=2,
        sintering_temp=1300, sintering_time=12,
        li_dop_sto=0.2, la_dop_sto=0, zr_dop_sto=0,
        li_dopant='Sn', la_dopant='None', zr_dopant='None',
    ),
    dict(
        li_sto=6.4, la_sto=3, zr_sto=1.4,
        sintering_temp=1100, sintering_time=24,
        li_dop_sto=0.13, la_dop_sto=0, zr_dop_sto=0.6,
        li_dopant='Ge', la_dopant='None', zr_dopant='Mn',
    ),
]
df_new = pd.DataFrame(new_samples)

In [None]:
# Electronegativity feature for each dopant site of the new samples.
for target_col, dopant_col in (
    ('li_dop_enev', 'li_dopant'),
    ('la_dop_enev', 'la_dopant'),
    ('zr_dop_enev', 'zr_dopant'),
):
    df_new[target_col] = df_new[dopant_col].apply(get_enev)

In [None]:
# Mean ionic radius feature for each dopant site of the new samples.
for target_col, dopant_col in (
    ('li_dop_ionicrad', 'li_dopant'),
    ('la_dop_ionicrad', 'la_dopant'),
    ('zr_dop_ionicrad', 'zr_dopant'),
):
    df_new[target_col] = df_new[dopant_col].apply(get_irad)

In [None]:
# Overwrite the dopant symbols with their atomic numbers. This runs last because the
# feature cells above still need the element symbols.
for dopant_col in ('li_dopant', 'la_dopant', 'zr_dopant'):
    df_new[dopant_col] = df_new[dopant_col].apply(get_anum)

In [None]:
# Feed the classifier exactly the features it was fitted on, in the fitted order.
# The previous version dropped 'sintering_time' and kept 'sintering_temp', the opposite
# of what the test-set prediction cell does, and df_new also has its derived columns in
# a different order from the training data. Both put values in the wrong feature
# positions. Selecting by X_train.columns removes both problems.
clf.predict(df_new[X_train.columns])

array([0, 0, 0], dtype=int64)