In [1]:
import pandas as pd 
import numpy as np 
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler 
from sklearn.svm import SVR

import matplotlib.pyplot as plt

# 1. Data Preprocessing
## 1.1 Separating the data into feature groups

In [2]:
# Load the raw dataset from the comma-delimited CSV file into a DataFrame.
pd_data = pd.read_csv('independent_chalcogenides.csv', delimiter=',')

In [3]:
pd_data

Unnamed: 0,Compound,OSA,OSB,OSC,EA,EB,EC,CAN,CNB,CNC,IRA,IRB,IRC,DA,DB,DC,BANDGAP
0,BaTiO3,2,4,-2,0.89,1.54,3.44,12,6,2,1.420000e-10,6.100000e-11,1.400000e-10,3.51,4.50,0.001429,2.14
1,BaZrO3,2,4,-2,0.89,1.33,3.44,12,6,2,1.420000e-10,8.400000e-11,1.400000e-10,3.51,6.40,0.001429,3.04
2,BaZrS3,2,4,-2,0.89,1.33,2.58,12,6,2,1.420000e-10,8.400000e-11,1.840000e-10,3.51,6.40,2.070000,1.02
3,CuTaS3,1,5,-2,1.90,1.50,2.58,6,2,2,7.300000e-11,6.400000e-11,1.840000e-10,8.96,16.60,2.070000,0.98
4,CsNbSe3,1,5,-2,0.79,1.60,2.55,12,6,2,1.810000e-10,6.400000e-11,5.000000e-11,1.87,8.57,4.790000,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,AgHfS3,1,4,-2,1.90,1.30,2.58,8,6,2,1.150000e-10,8.300000e-11,1.840000e-10,10.50,13.20,2.070000,2.30
114,AgISe3,1,5,-2,1.90,2.20,2.50,8,6,3,1.150000e-10,2.200000e-10,5.000000e-11,10.50,4.93,4.970000,1.40
115,AgITe3,1,5,-2,1.90,2.20,2.10,8,6,4,1.150000e-10,2.200000e-10,9.700000e-11,10.50,4.93,6.240000,1.00
116,AgInTe3,1,3,-2,1.90,1.78,2.10,8,6,4,1.150000e-10,1.040000e-10,9.700000e-11,10.50,7.30,6.240000,1.70


In [4]:
def _decimal_comma_to_float(series):
    """Return `series` as floats, reading a decimal comma as a decimal point.

    Values are converted to text first, every ',' is replaced by '.', and the
    result is cast to float, so integer-valued columns come back as floats.
    """
    as_text = series.astype(str)
    with_dots = as_text.str.replace(',', '.')
    return with_dots.astype(float)


# Every column except the compound name is numeric and is converted in place.
numeric_columns = [
    'OSA', 'OSB', 'OSC',
    'EA', 'EB', 'EC',
    'CAN', 'CNB', 'CNC',
    'IRA', 'IRB', 'IRC',
    'DA', 'DB', 'DC',
    'BANDGAP',
]
for col in numeric_columns:
    pd_data[col] = _decimal_comma_to_float(pd_data[col])

In [5]:
pd_data

Unnamed: 0,Compound,OSA,OSB,OSC,EA,EB,EC,CAN,CNB,CNC,IRA,IRB,IRC,DA,DB,DC,BANDGAP
0,BaTiO3,2.0,4.0,-2.0,0.89,1.54,3.44,12.0,6.0,2.0,1.420000e-10,6.100000e-11,1.400000e-10,3.51,4.50,0.001429,2.14
1,BaZrO3,2.0,4.0,-2.0,0.89,1.33,3.44,12.0,6.0,2.0,1.420000e-10,8.400000e-11,1.400000e-10,3.51,6.40,0.001429,3.04
2,BaZrS3,2.0,4.0,-2.0,0.89,1.33,2.58,12.0,6.0,2.0,1.420000e-10,8.400000e-11,1.840000e-10,3.51,6.40,2.070000,1.02
3,CuTaS3,1.0,5.0,-2.0,1.90,1.50,2.58,6.0,2.0,2.0,7.300000e-11,6.400000e-11,1.840000e-10,8.96,16.60,2.070000,0.98
4,CsNbSe3,1.0,5.0,-2.0,0.79,1.60,2.55,12.0,6.0,2.0,1.810000e-10,6.400000e-11,5.000000e-11,1.87,8.57,4.790000,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,AgHfS3,1.0,4.0,-2.0,1.90,1.30,2.58,8.0,6.0,2.0,1.150000e-10,8.300000e-11,1.840000e-10,10.50,13.20,2.070000,2.30
114,AgISe3,1.0,5.0,-2.0,1.90,2.20,2.50,8.0,6.0,3.0,1.150000e-10,2.200000e-10,5.000000e-11,10.50,4.93,4.970000,1.40
115,AgITe3,1.0,5.0,-2.0,1.90,2.20,2.10,8.0,6.0,4.0,1.150000e-10,2.200000e-10,9.700000e-11,10.50,4.93,6.240000,1.00
116,AgInTe3,1.0,3.0,-2.0,1.90,1.78,2.10,8.0,6.0,4.0,1.150000e-10,1.040000e-10,9.700000e-11,10.50,7.30,6.240000,1.70


In [6]:
# Split the feature columns into five groups of three (one column each for the
# A, B and C suffix), keyed by their shared prefix: OS*, E*, CN*, IR*, D*.
#
# Columns are selected by label instead of by position: positional slices such
# as iloc[:, 1:4] silently pick up the wrong columns if the CSV ever gains,
# loses or reorders a column, whereas a missing label raises a KeyError.
pd_os = pd_data[['OSA', 'OSB', 'OSC']]
pd_e = pd_data[['EA', 'EB', 'EC']]
# 'CAN' is the header spelling used in the loaded data for the first CN column.
pd_cn = pd_data[['CAN', 'CNB', 'CNC']]
pd_ir = pd_data[['IRA', 'IRB', 'IRC']]
pd_d = pd_data[['DA', 'DB', 'DC']]

In [7]:
pd_os

Unnamed: 0,OSA,OSB,OSC
0,2.0,4.0,-2.0
1,2.0,4.0,-2.0
2,2.0,4.0,-2.0
3,1.0,5.0,-2.0
4,1.0,5.0,-2.0
...,...,...,...
113,1.0,4.0,-2.0
114,1.0,5.0,-2.0
115,1.0,5.0,-2.0
116,1.0,3.0,-2.0


In [8]:
pd_e

Unnamed: 0,EA,EB,EC
0,0.89,1.54,3.44
1,0.89,1.33,3.44
2,0.89,1.33,2.58
3,1.90,1.50,2.58
4,0.79,1.60,2.55
...,...,...,...
113,1.90,1.30,2.58
114,1.90,2.20,2.50
115,1.90,2.20,2.10
116,1.90,1.78,2.10


In [9]:
pd_cn

Unnamed: 0,CAN,CNB,CNC
0,12.0,6.0,2.0
1,12.0,6.0,2.0
2,12.0,6.0,2.0
3,6.0,2.0,2.0
4,12.0,6.0,2.0
...,...,...,...
113,8.0,6.0,2.0
114,8.0,6.0,3.0
115,8.0,6.0,4.0
116,8.0,6.0,4.0


In [10]:
pd_ir

Unnamed: 0,IRA,IRB,IRC
0,1.420000e-10,6.100000e-11,1.400000e-10
1,1.420000e-10,8.400000e-11,1.400000e-10
2,1.420000e-10,8.400000e-11,1.840000e-10
3,7.300000e-11,6.400000e-11,1.840000e-10
4,1.810000e-10,6.400000e-11,5.000000e-11
...,...,...,...
113,1.150000e-10,8.300000e-11,1.840000e-10
114,1.150000e-10,2.200000e-10,5.000000e-11
115,1.150000e-10,2.200000e-10,9.700000e-11
116,1.150000e-10,1.040000e-10,9.700000e-11


In [11]:
pd_d

Unnamed: 0,DA,DB,DC
0,3.51,4.50,0.001429
1,3.51,6.40,0.001429
2,3.51,6.40,2.070000
3,8.96,16.60,2.070000
4,1.87,8.57,4.790000
...,...,...,...
113,10.50,13.20,2.070000
114,10.50,4.93,4.970000
115,10.50,4.93,6.240000
116,10.50,7.30,6.240000


In [12]:
# The regression target is the BANDGAP column.
pd_label = pd_data.loc[:, 'BANDGAP']

In [13]:
pd_label 

0      2.14
1      3.04
2      1.02
3      0.98
4      1.47
       ... 
113    2.30
114    1.40
115    1.00
116    1.70
117    1.32
Name: BANDGAP, Length: 118, dtype: float64

## 1.2 Scaling each feature group with a min-max scaler

In [14]:
# Rescale every feature group to the [0, 1] range with a min-max scaler.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform refits the shared scaler on each call, so each group is scaled
# with its own per-column minimum and maximum. Once this cell has run, `scaler`
# only holds the fit of the last group (pd_d).
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# happens downstream, confirm that this is intended.
os_scaled, e_scaled, cn_scaled, ir_scaled, d_scaled = [
    scaler.fit_transform(group)
    for group in (pd_os, pd_e, pd_cn, pd_ir, pd_d)
]

In [15]:
os_scaled 

array([[0.25, 0.6 , 0.  ],
       [0.25, 0.6 , 0.  ],
       [0.25, 0.6 , 0.  ],
       [0.  , 0.8 , 0.  ],
       [0.  , 0.8 , 0.  ],
       [0.  , 0.8 , 0.  ],
       [0.  , 0.8 , 0.  ],
       [0.25, 0.8 , 0.  ],
       [0.25, 0.6 , 0.  ],
       [0.25, 0.6 , 0.  ],
       [1.  , 0.6 , 0.  ],
       [0.5 , 0.6 , 0.  ],
       [0.5 , 0.4 , 0.  ],
       [0.5 , 0.4 , 0.  ],
       [0.5 , 0.6 , 0.  ],
       [1.  , 0.6 , 0.  ],
       [0.5 , 0.4 , 0.  ],
       [1.  , 0.4 , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.8 , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.4 , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.75, 0.2 , 0.  ],
       [0.5 , 0.4 , 0.  ],
       [0.5 , 0.4 , 0.  ],
       [1.  , 0.4 , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.4 , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
 

In [16]:
e_scaled 

array([[0.06493506, 0.33333333, 1.        ],
       [0.06493506, 0.20833333, 1.        ],
       [0.06493506, 0.20833333, 0.35820896],
       [0.72077922, 0.30952381, 0.35820896],
       [0.        , 0.36904762, 0.3358209 ],
       [0.        , 0.36904762, 0.35820896],
       [0.        , 0.30952381, 1.        ],
       [0.13636364, 0.20833333, 1.        ],
       [0.13636364, 0.33333333, 1.        ],
       [0.13636364, 0.20833333, 1.        ],
       [0.9025974 , 0.54761905, 0.3358209 ],
       [0.72077922, 0.54761905, 0.35820896],
       [0.64285714, 0.49404762, 0.35820896],
       [0.64285714, 0.61309524, 0.3358209 ],
       [0.64285714, 0.54761905, 0.3358209 ],
       [0.81818182, 0.54761905, 0.3358209 ],
       [0.9025974 , 0.47619048, 0.35820896],
       [0.90909091, 0.47619048, 0.3358209 ],
       [0.9025974 , 0.375     , 0.3358209 ],
       [0.9025974 , 0.54761905, 0.3358209 ],
       [0.9025974 , 0.54761905, 0.35820896],
       [0.9025974 , 0.7202381 , 0.3358209 ],
       [0.

In [17]:
cn_scaled 

array([[1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [0.33333333, 0.        , 0.        ],
       [1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [0.        , 0.33333333, 0.        ],
       [0.33333333, 0.33333333, 0.        ],
       [0.        , 0.33333333, 0.        ],
       [0.33333333, 0.33333333, 0.        ],
       [0.33333333, 0.33333333, 0.        ],
       [0.        , 0.33333333, 0.        ],
       [0.33333333, 0.33333333, 0.        ],
       [0.11111111, 0.66666667, 0.        ],
       [0.        , 0.66666667, 0.        ],
       [0.        , 0.66666667, 0.        ],
       [0.        , 0.66666667, 0.        ],
       [0.        , 0.33333333, 0.        ],
       [0.

In [18]:
ir_scaled 

array([[0.76219512, 0.22058824, 0.67164179],
       [0.76219512, 0.33333333, 0.67164179],
       [0.76219512, 0.33333333, 1.        ],
       [0.34146341, 0.23529412, 1.        ],
       [1.        , 0.23529412, 0.        ],
       [0.95731707, 0.23529412, 1.        ],
       [0.95731707, 0.23529412, 0.67164179],
       [0.50609756, 0.33333333, 0.67164179],
       [0.66463415, 0.22058824, 0.67164179],
       [0.66463415, 0.33333333, 0.67164179],
       [0.25      , 0.04901961, 0.        ],
       [0.52439024, 0.04901961, 1.        ],
       [0.38414634, 0.2254902 , 1.        ],
       [0.38414634, 0.18137255, 0.        ],
       [0.38414634, 0.04901961, 0.        ],
       [0.3597561 , 0.04901961, 0.        ],
       [0.25      , 0.43137255, 1.        ],
       [0.        , 0.43137255, 0.        ],
       [0.25      , 0.18627451, 0.        ],
       [0.25      , 0.42647059, 0.        ],
       [0.18597561, 0.42647059, 1.        ],
       [0.18597561, 0.00490196, 0.        ],
       [0.

In [19]:
d_scaled 

array([[0.16245441, 0.21611323, 0.        ],
       [0.16245441, 0.31954273, 0.        ],
       [0.16245441, 0.31954273, 0.3315777 ],
       [0.51116514, 0.87479586, 0.3315777 ],
       [0.05752127, 0.43767011, 0.76757498],
       [0.05752127, 0.43767011, 0.3315777 ],
       [0.05752127, 0.87479586, 0.        ],
       [0.03704652, 0.31954273, 0.        ],
       [0.10422932, 0.21611323, 0.        ],
       [0.10422932, 0.31954273, 0.        ],
       [0.30385821, 0.09635275, 0.76757498],
       [0.56491138, 0.09635275, 0.3315777 ],
       [0.40559217, 0.2927055 , 0.3315777 ],
       [0.40559217, 0.26091453, 0.76757498],
       [0.40559217, 0.09635275, 0.76757498],
       [0.36553842, 0.09635275, 0.76757498],
       [0.30385821, 0.36853566, 0.3315777 ],
       [0.05432209, 0.36853566, 0.76757498],
       [0.30385821, 0.11823625, 0.76757498],
       [0.30385821, 0.50462711, 0.76757498],
       [0.30385821, 0.50462711, 0.3315777 ],
       [0.30385821, 0.07022319, 0.76757498],
       [0.

## 1.3 Combining the scaled feature groups

In [20]:
# Place the five scaled groups side by side: one row per compound, one column
# per feature, in the order OS, E, CN, IR, D.
scaled_groups = [os_scaled, e_scaled, cn_scaled, ir_scaled, d_scaled]
total_scaled = np.hstack(scaled_groups)

In [21]:
# Total number of samples (rows).
total_scaled.shape[0]

118

In [22]:
# Total number of features (columns).
total_scaled.shape[1]

15

## 1.4 feature selection

In [23]:
# Remove constant features: with a threshold of 0.0, VarianceThreshold drops
# every column whose value is the same for all samples.
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.0)
selector.fit(total_scaled)
X_selected = selector.transform(total_scaled)

In [24]:
# Number of features left after the variance filter.
X_selected.shape[1]

14

In [25]:
# Boolean mask over the 15 combined features: True = kept, False = removed.
selector.get_support() # the removed feature is the 3rd column, OSC

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

## 1.5 Finding the most important features

In [26]:
# Univariate feature selection: keep only the best-scoring features.
from sklearn.feature_selection import SelectKBest, f_regression

# Target values as a standalone NumPy array (copied out of the label Series).
y = pd_label.to_numpy(copy=True)

In [27]:
# Wrap the filtered feature matrix in a DataFrame; its columns get the default
# integer labels 0..n-1, which later cells use to refer to features.
X = pd.DataFrame(data=X_selected)
np.shape(X_selected)

(118, 14)

In [28]:
# Score every remaining feature against the target with a univariate F-test
# and keep the 8 highest-scoring ones (e.g. k=5 would keep the top 5 instead).
selector = SelectKBest(score_func=f_regression, k=8)
selector.fit(X_selected, y)

# Report the F-score of each feature, labelled by its integer column in X.
feature_scores = zip(X.columns, selector.scores_)
for name, score in feature_scores:
    print(f"{name}: F-score = {score:.4f}")

0: F-score = 12.6072
1: F-score = 8.5496
2: F-score = 13.5502
3: F-score = 0.1288
4: F-score = 54.7799
5: F-score = 33.7341
6: F-score = 0.4569
7: F-score = 2.5018
8: F-score = 7.9438
9: F-score = 5.5197
10: F-score = 6.3011
11: F-score = 4.3591
12: F-score = 1.0354
13: F-score = 52.1498


In [29]:
# Column labels of X that SelectKBest decided to keep.
support_mask = selector.get_support()
selected_features = X.columns[support_mask]

In [30]:
selected_features

Index([0, 1, 2, 4, 5, 8, 10, 13], dtype='int64')

In [31]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.25,0.6,0.064935,0.333333,1.000000,1.000000,0.666667,0.000000,0.762195,0.220588,0.671642,0.162454,0.216113,0.000000
1,0.25,0.6,0.064935,0.208333,1.000000,1.000000,0.666667,0.000000,0.762195,0.333333,0.671642,0.162454,0.319543,0.000000
2,0.25,0.6,0.064935,0.208333,0.358209,1.000000,0.666667,0.000000,0.762195,0.333333,1.000000,0.162454,0.319543,0.331578
3,0.00,0.8,0.720779,0.309524,0.358209,0.333333,0.000000,0.000000,0.341463,0.235294,1.000000,0.511165,0.874796,0.331578
4,0.00,0.8,0.000000,0.369048,0.335821,1.000000,0.666667,0.000000,1.000000,0.235294,0.000000,0.057521,0.437670,0.767575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.190476,0.358209,0.555556,0.666667,0.000000,0.597561,0.328431,1.000000,0.609700,0.689711,0.331578
114,0.00,0.8,0.720779,0.726190,0.298507,0.555556,0.666667,0.166667,0.597561,1.000000,0.000000,0.609700,0.239521,0.796428
115,0.00,0.8,0.720779,0.726190,0.000000,0.555556,0.666667,0.333333,0.597561,1.000000,0.350746,0.609700,0.239521,1.000000
116,0.00,0.4,0.720779,0.476190,0.000000,0.555556,0.666667,0.333333,0.597561,0.431373,0.350746,0.609700,0.368536,1.000000


In [32]:
# Keep only the columns chosen by SelectKBest. The selection is taken from
# `selected_features` instead of a hand-written list of columns to drop, so it
# cannot drift out of sync when `k` or the input data changes.
data_new = X.loc[:, selected_features]
data_new

Unnamed: 0,0,1,2,4,5,8,10,13
0,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000
1,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000
2,0.25,0.6,0.064935,0.358209,1.000000,0.762195,1.000000,0.331578
3,0.00,0.8,0.720779,0.358209,0.333333,0.341463,1.000000,0.331578
4,0.00,0.8,0.000000,0.335821,1.000000,1.000000,0.000000,0.767575
...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.358209,0.555556,0.597561,1.000000,0.331578
114,0.00,0.8,0.720779,0.298507,0.555556,0.597561,0.000000,0.796428
115,0.00,0.8,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000
116,0.00,0.4,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000


## 1.6 Saving the selected features to CSV

In [33]:
# Build one 2-D array holding the selected features followed by the target
# as the last column; it is turned into a DataFrame and saved further below.
data_feature = np.array(data_new)
# -1 lets NumPy infer the row count, so this does not break when the number
# of samples in the source file changes.
data_label = y.reshape(-1, 1)

data = np.concatenate((data_feature, data_label), axis=1)
data

array([[0.25      , 0.6       , 0.06493506, ..., 0.67164179, 0.        ,
        2.14      ],
       [0.25      , 0.6       , 0.06493506, ..., 0.67164179, 0.        ,
        3.04      ],
       [0.25      , 0.6       , 0.06493506, ..., 1.        , 0.3315777 ,
        1.02      ],
       ...,
       [0.        , 0.8       , 0.72077922, ..., 0.35074627, 1.        ,
        1.        ],
       [0.        , 0.4       , 0.72077922, ..., 0.35074627, 1.        ,
        1.7       ],
       [0.        , 0.8       , 0.72077922, ..., 1.        , 0.3315777 ,
        1.32      ]])

In [34]:
data_feature.shape

(118, 8)

In [35]:
data_label.shape

(118, 1)

In [36]:
# Hand-assigned names for the eight retained feature columns (in the order
# they appear in `data`), followed by the target column. They must be updated
# manually if the feature selection above changes.
# NOTE(review): 'CNA' appears to correspond to the input column whose header
# is spelled 'CAN' - confirm.
output_columns = [
    'OSA', 'OSB',
    'EA', 'EC',
    'CNA',
    'IRA', 'IRC',
    'DC',
    'BANDGAP',
]
df = pd.DataFrame(data=data, columns=output_columns)
df

Unnamed: 0,OSA,OSB,EA,EC,CNA,IRA,IRC,DC,BANDGAP
0,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000,2.14
1,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000,3.04
2,0.25,0.6,0.064935,0.358209,1.000000,0.762195,1.000000,0.331578,1.02
3,0.00,0.8,0.720779,0.358209,0.333333,0.341463,1.000000,0.331578,0.98
4,0.00,0.8,0.000000,0.335821,1.000000,1.000000,0.000000,0.767575,1.47
...,...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.358209,0.555556,0.597561,1.000000,0.331578,2.30
114,0.00,0.8,0.720779,0.298507,0.555556,0.597561,0.000000,0.796428,1.40
115,0.00,0.8,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000,1.00
116,0.00,0.4,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000,1.70


In [37]:
# Write the processed dataset to disk, semicolon-delimited and without the
# row index.
df.to_csv(
    'independent_dataset_baru.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)
