In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [3]:
main_df = pd.read_csv("main_df.csv")

In [4]:
main_df.columns

Index(['Paper #', 'Device #', 'DOI', 'Manufacturer',
       'Manuscripted submitted year', 'Country', 'Efficiency (%)', 'Voc (V)',
       'Voc per cell (V)', 'Isc (mA)', 'Jsc (mA/cm^2)', 'FF (%)', 'Pmax (mW)',
       'Vmax (V)', 'Imax (mA)', 'Apeture area (cm^2)', 'Device size type',
       'Sweep direction', 'Sweep rate (mV/s)', 'Certified year',
       'Transparent conductive oxide type', 'Electron transport layer type',
       'Perovskite composition', 'Perovskite chemistry type', 'extra',
       'extra%', 'MA', 'FA', 'Pb', 'I', 'Br', 'Passive_layer',
       'Perovksite depostion method', 'Hole transport layer type',
       'Conductive electrode layer', 'Device structure', 'exposed_time',
       'retained_per', 'stability'],
      dtype='object')

In [5]:
minor_cols = ['Paper #', 'Device #', 'DOI', 'Manufacturer',
       'Manuscripted submitted year', 'Country','Sweep rate (mV/s)','Certified year', 'Passive_layer','extra','extra%','Perovskite composition', 'Perovskite chemistry type']

In [6]:
use_df = main_df.drop(minor_cols,axis=1)

In [7]:
use_df.columns

Index(['Efficiency (%)', 'Voc (V)', 'Voc per cell (V)', 'Isc (mA)',
       'Jsc (mA/cm^2)', 'FF (%)', 'Pmax (mW)', 'Vmax (V)', 'Imax (mA)',
       'Apeture area (cm^2)', 'Device size type', 'Sweep direction',
       'Transparent conductive oxide type', 'Electron transport layer type',
       'MA', 'FA', 'Pb', 'I', 'Br', 'Perovksite depostion method',
       'Hole transport layer type', 'Conductive electrode layer',
       'Device structure', 'exposed_time', 'retained_per', 'stability'],
      dtype='object')

In [8]:
def change_dtype(df):
  for col in df.columns:
    try:
      float(df[col].values[0])
      df[col] = df[col].astype(float)
    except:
      try:
        int(df[col].values[0])
        df[col] = df[col].astype(int)
      except:
        try:
          str(df[col].values[0])
          df[col] = df[col].astype(str)
        except:
          df[col] = df[col].astype(object)
  return df

In [9]:
def make_classes(mod_df):
    condition = [mod_df['Efficiency (%)'] <= 15.0,
          (mod_df['Efficiency (%)'] > 15.0) & (mod_df['Efficiency (%)'] <= 20.0),
          mod_df['Efficiency (%)'] > 20.0]
    values = ['low_efficient','moderate_efficient','high_efficient']
    mod_df['cat_efc'] = np.select(condition,values)
    condition_sta = [mod_df['stability'] <= 100.0,
          (mod_df['stability'] > 100.0) & (mod_df['stability'] <= 300.0),
          mod_df['stability'] > 300]
    values_sta = ['unstable','low_stabiity','stable']
    mod_df['cat_sta'] = np.select(condition_sta,values_sta)
    return mod_df

In [10]:
sta_vals = use_df.stability.values


In [11]:
sta_main = []
for val in sta_vals:
    try:
        k = float(val)
        sta_main.append(k)
    except:
        pass

In [12]:
temp_df = pd.DataFrame(sta_main)

temp_df.mode()

In [13]:
temp_df.median()

0    64.8
dtype: float64

In [14]:
use_df["stability"].replace("unknown",64.8,inplace=True)

In [15]:
use_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Pb,I,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability
0,14.14,1.0071,1.0071,4.46,21.339713,65.7,2.96,0.7763,3.81,0.209,...,1,3.0,0.0,spin coating,Spiro-OMeTAD,Au,Mesoscopic,500,80,40.0
1,16.15,1.1085,1.1085,1.84,19.616205,74.2,1.51,0.9052,1.67,0.0938,...,1,2.58,0.42,spin coating,PTAA,Au,Mesoscopic,unknown,unknown,64.8
2,12.84,0.8579,0.8579,1.65,22.758621,65.8,0.93,0.654,1.42,0.0725,...,1,3.0,0.0,screen printing,No HTL,carbon black/graphite,Mesoscopic,1008,100,100.8
3,20.11,1.059,1.059,2.354,24.649215,77.0,1.92,0.894,2.148,0.0955,...,1,2.85,0.15,spin coating,PTAA,Au,Mesoscopic,unkown,unknown,64.8
4,15.0,1.09,1.09,20.96,20.609636,66.8,15.26,0.879,17.35,1.017,...,1,3.0,0.0,spin coating,NiMgLiO,Ag,Inverted planar,1000,97,97.0


In [16]:
sta_vals = use_df.exposed_time.values
exp_time_main = []
for val in sta_vals:
    try:
        k = float(val)
        exp_time_main.append(k)
    except:
        pass
temp_df = pd.DataFrame(exp_time_main)
print(temp_df.median())
use_df["exposed_time"].replace("unknown",720.0,inplace=True)
use_df["exposed_time"].replace("unkown",720.0,inplace=True)

0    720.0
dtype: float64


In [17]:
use_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Pb,I,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability
0,14.14,1.0071,1.0071,4.46,21.339713,65.7,2.96,0.7763,3.81,0.209,...,1,3.0,0.0,spin coating,Spiro-OMeTAD,Au,Mesoscopic,500,80,40.0
1,16.15,1.1085,1.1085,1.84,19.616205,74.2,1.51,0.9052,1.67,0.0938,...,1,2.58,0.42,spin coating,PTAA,Au,Mesoscopic,720,unknown,64.8
2,12.84,0.8579,0.8579,1.65,22.758621,65.8,0.93,0.654,1.42,0.0725,...,1,3.0,0.0,screen printing,No HTL,carbon black/graphite,Mesoscopic,1008,100,100.8
3,20.11,1.059,1.059,2.354,24.649215,77.0,1.92,0.894,2.148,0.0955,...,1,2.85,0.15,spin coating,PTAA,Au,Mesoscopic,720,unknown,64.8
4,15.0,1.09,1.09,20.96,20.609636,66.8,15.26,0.879,17.35,1.017,...,1,3.0,0.0,spin coating,NiMgLiO,Ag,Inverted planar,1000,97,97.0


In [18]:
sta_vals = use_df.retained_per.values
retain_main = []
for val in sta_vals:
    try:
        k = float(val)
        retain_main.append(k)
    except:
        pass
temp_df = pd.DataFrame(retain_main)
print(temp_df.median())

0    91.0
dtype: float64


In [19]:
use_df["retained_per"].replace("unknown",720.0,inplace=True)
use_df["retained_per"].replace("unkown",720.0,inplace=True)

In [20]:
use_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Pb,I,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability
0,14.14,1.0071,1.0071,4.46,21.339713,65.7,2.96,0.7763,3.81,0.209,...,1,3.0,0.0,spin coating,Spiro-OMeTAD,Au,Mesoscopic,500,80,40.0
1,16.15,1.1085,1.1085,1.84,19.616205,74.2,1.51,0.9052,1.67,0.0938,...,1,2.58,0.42,spin coating,PTAA,Au,Mesoscopic,720,720,64.8
2,12.84,0.8579,0.8579,1.65,22.758621,65.8,0.93,0.654,1.42,0.0725,...,1,3.0,0.0,screen printing,No HTL,carbon black/graphite,Mesoscopic,1008,100,100.8
3,20.11,1.059,1.059,2.354,24.649215,77.0,1.92,0.894,2.148,0.0955,...,1,2.85,0.15,spin coating,PTAA,Au,Mesoscopic,720,720,64.8
4,15.0,1.09,1.09,20.96,20.609636,66.8,15.26,0.879,17.35,1.017,...,1,3.0,0.0,spin coating,NiMgLiO,Ag,Inverted planar,1000,97,97.0


In [21]:
use_df=change_dtype(use_df)
use_df=make_classes(use_df)

In [22]:
use_df.columns

Index(['Efficiency (%)', 'Voc (V)', 'Voc per cell (V)', 'Isc (mA)',
       'Jsc (mA/cm^2)', 'FF (%)', 'Pmax (mW)', 'Vmax (V)', 'Imax (mA)',
       'Apeture area (cm^2)', 'Device size type', 'Sweep direction',
       'Transparent conductive oxide type', 'Electron transport layer type',
       'MA', 'FA', 'Pb', 'I', 'Br', 'Perovksite depostion method',
       'Hole transport layer type', 'Conductive electrode layer',
       'Device structure', 'exposed_time', 'retained_per', 'stability',
       'cat_efc', 'cat_sta'],
      dtype='object')

In [23]:
num_cols = ['Voc (V)',
       'Voc per cell (V)', 'Isc (mA)', 'Jsc (mA/cm^2)', 'FF (%)', 'Pmax (mW)',
       'Vmax (V)', 'Imax (mA)', 'Apeture area (cm^2)', 'MA', 'FA', 'Pb', 'I', 'Br','exposed_time',
       'retained_per']
cat_cols = ['Device size type','Transparent conductive oxide type', 'Electron transport layer type',
       'Sweep direction',
       'Perovksite depostion method', 'Hole transport layer type',
       'Conductive electrode layer', 'Device structure']
cont_tar_cols= [ 'Efficiency (%)','stability']
cat_tar_cols=['cat_efc','cat_sta']

In [24]:
use_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability,cat_efc,cat_sta
0,14.14,1.0071,1.0071,4.46,21.339713,65.7,2.96,0.7763,3.81,0.209,...,0.0,spin coating,Spiro-OMeTAD,Au,Mesoscopic,500.0,80.0,40.0,low_efficient,unstable
1,16.15,1.1085,1.1085,1.84,19.616205,74.2,1.51,0.9052,1.67,0.0938,...,0.42,spin coating,PTAA,Au,Mesoscopic,720.0,720.0,64.8,moderate_efficient,unstable
2,12.84,0.8579,0.8579,1.65,22.758621,65.8,0.93,0.654,1.42,0.0725,...,0.0,screen printing,No HTL,carbon black/graphite,Mesoscopic,1008.0,100.0,100.8,low_efficient,low_stabiity
3,20.11,1.059,1.059,2.354,24.649215,77.0,1.92,0.894,2.148,0.0955,...,0.15,spin coating,PTAA,Au,Mesoscopic,720.0,720.0,64.8,high_efficient,unstable
4,15.0,1.09,1.09,20.96,20.609636,66.8,15.26,0.879,17.35,1.017,...,0.0,spin coating,NiMgLiO,Ag,Inverted planar,1000.0,97.0,97.0,low_efficient,unstable


In [25]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [26]:
import joblib

In [27]:
use_df["Vmax (V)"].fillna(use_df["Vmax (V)"].mean(),inplace=True)

In [28]:
use_df["Imax (mA)"].fillna(use_df["Imax (mA)"].mean(),inplace=True)

In [29]:
mod_df = use_df
for col in cat_cols:
    lbl_enc = LabelEncoder()
    mod_df[col] = lbl_enc.fit_transform(mod_df[col])
    joblib.dump(lbl_enc,f"./serializer_object/label_encoder_categorical/{col}_lbl.pkl")

In [30]:
mod_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability,cat_efc,cat_sta
0,14.14,1.0071,1.0071,4.46,21.339713,65.7,2.96,0.7763,3.81,0.209,...,0.0,6,5,1,1,500.0,80.0,40.0,low_efficient,unstable
1,16.15,1.1085,1.1085,1.84,19.616205,74.2,1.51,0.9052,1.67,0.0938,...,0.42,6,4,1,1,720.0,720.0,64.8,moderate_efficient,unstable
2,12.84,0.8579,0.8579,1.65,22.758621,65.8,0.93,0.654,1.42,0.0725,...,0.0,5,3,6,1,1008.0,100.0,100.8,low_efficient,low_stabiity
3,20.11,1.059,1.059,2.354,24.649215,77.0,1.92,0.894,2.148,0.0955,...,0.15,6,4,1,1,720.0,720.0,64.8,high_efficient,unstable
4,15.0,1.09,1.09,20.96,20.609636,66.8,15.26,0.879,17.35,1.017,...,0.0,6,1,0,0,1000.0,97.0,97.0,low_efficient,unstable


In [31]:
for col in cat_tar_cols:
    lbl_enc = LabelEncoder()
    mod_df[col] = lbl_enc.fit_transform(mod_df[col])
    joblib.dump(lbl_enc,f"./serializer_object/label_encoder_categorical/{col}_lbl.pkl")

In [32]:
mod_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability,cat_efc,cat_sta
0,14.14,1.0071,1.0071,4.46,21.339713,65.7,2.96,0.7763,3.81,0.209,...,0.0,6,5,1,1,500.0,80.0,40.0,1,2
1,16.15,1.1085,1.1085,1.84,19.616205,74.2,1.51,0.9052,1.67,0.0938,...,0.42,6,4,1,1,720.0,720.0,64.8,2,2
2,12.84,0.8579,0.8579,1.65,22.758621,65.8,0.93,0.654,1.42,0.0725,...,0.0,5,3,6,1,1008.0,100.0,100.8,1,0
3,20.11,1.059,1.059,2.354,24.649215,77.0,1.92,0.894,2.148,0.0955,...,0.15,6,4,1,1,720.0,720.0,64.8,0,2
4,15.0,1.09,1.09,20.96,20.609636,66.8,15.26,0.879,17.35,1.017,...,0.0,6,1,0,0,1000.0,97.0,97.0,1,2


In [33]:
from sklearn.preprocessing import StandardScaler

In [34]:
sc=StandardScaler()
mod_df=sc.fit_transform(mod_df)

In [35]:
mod_df=pd.DataFrame(mod_df)

In [36]:
mod_df.columns = use_df.columns

In [37]:
mod_df.head()

Unnamed: 0,Efficiency (%),Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),...,Br,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,exposed_time,retained_per,stability,cat_efc,cat_sta
0,-1.267919,-0.323464,-1.276262,-0.33717,-0.523931,-1.284722,-0.314903,-0.356826,-0.354378,-0.327532,...,-0.629548,0.268293,0.458,-0.282809,-0.576297,-0.370628,-0.612762,-0.400888,0.146095,0.394132
1,-0.69523,-0.291862,0.132195,-0.400656,-1.269457,-0.014569,-0.322863,-0.30678,-0.415367,-0.337084,...,0.133282,0.268293,-0.340564,-0.282809,-0.576297,-0.240289,1.731626,-0.23971,1.249928,0.394132
2,-1.638315,-0.369964,-3.348665,-0.40526,0.089836,-1.269779,-0.326046,-0.40431,-0.422492,-0.33885,...,-0.629548,-0.560976,-1.139129,3.032884,-0.576297,-0.069662,-0.5395,-0.00574,0.146095,-2.955988
3,0.433053,-0.307289,-0.555366,-0.388201,0.907637,0.403834,-0.320612,-0.311128,-0.401744,-0.336943,...,-0.357109,0.268293,-0.340564,-0.282809,-0.576297,-0.240289,1.731626,-0.23971,-0.957737,0.394132
4,-1.022888,-0.297628,-0.124772,0.062648,-0.839735,-1.12035,-0.247387,-0.316952,0.031508,-0.260531,...,-0.629548,0.268293,-2.736258,-0.945948,-2.027712,-0.074402,-0.55049,-0.030437,0.146095,0.394132


In [38]:
mod_df.to_csv("./main_datasets/completely_scaled_data.csv",index=False)
import numpy as np

In [39]:
y_reg = np.array(mod_df[cont_tar_cols])
y_cla_efc = np.array(mod_df[cat_tar_cols]['cat_efc'])

In [43]:
len(cat_cols),len(num_cols)

(8, 16)

In [44]:
main_x = np.array(mod_df[num_cols+cat_cols])

In [45]:
from imblearn.over_sampling import SMOTENC

In [46]:
smote = SMOTENC(categorical_features=cat_cols,sampling_strategy='auto',random_state=123,k_neighbors=3,n_jobs=3)

In [47]:
lbl_2 = LabelEncoder()
y_cla_efc =pd.DataFrame(y_cla_efc)
y_cla_efc = lbl_2.fit_transform(y_cla_efc)

  return f(*args, **kwargs)


In [48]:
y_cla_efc = np.array(y_cla_efc)
y_cla_efc

array([1, 2, 1, 0, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 2, 0, 2, 2, 1, 0, 2, 0,
       1, 2, 1, 0, 0, 2, 2, 0, 0, 2, 1, 1, 0, 1, 0, 0, 2, 0, 2, 0, 0, 0,
       1, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0,
       0, 0], dtype=int64)

In [70]:
# x,y = smote.fit_resample(main_x,y_cla_efc)

In [50]:
samp_df = pd.DataFrame(main_x)

In [51]:
samp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,-0.323464,-1.276262,-0.337170,-0.523931,-1.284722,-0.314903,-0.356826,-0.354378,-0.327532,1.083360,...,-0.370628,-0.612762,0.157027,-0.811503,1.150045,0.300300,0.268293,0.458000,-0.282809,-0.576297
1,-0.291862,0.132195,-0.400656,-1.269457,-0.014569,-0.322863,-0.306780,-0.415367,-0.337084,1.083360,...,-0.240289,1.731626,0.157027,-0.811503,1.150045,0.300300,0.268293,-0.340564,-0.282809,-0.576297
2,-0.369964,-3.348665,-0.405260,0.089836,-1.269779,-0.326046,-0.404310,-0.422492,-0.338850,0.885358,...,-0.069662,-0.539500,0.157027,-0.811503,1.150045,0.300300,-0.560976,-1.139129,3.032884,-0.576297
3,-0.307289,-0.555366,-0.388201,0.907637,0.403834,-0.320612,-0.311128,-0.401744,-0.336943,-0.797655,...,-0.240289,1.731626,0.157027,-0.811503,1.150045,-1.061062,0.268293,-0.340564,-0.282809,-0.576297
4,-0.297628,-0.124772,0.062648,-0.839735,-1.120350,-0.247387,-0.316952,0.031508,-0.260531,1.083360,...,-0.074402,-0.550490,-1.622614,-0.811503,-1.577969,0.980981,0.268293,-2.736258,-0.945948,-2.027712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,-0.280486,0.639183,-0.416843,-2.210596,0.553264,-0.325410,-0.289386,-0.431583,-0.339290,-0.896656,...,-0.459496,-0.594447,0.157027,1.232282,-1.577969,0.980981,0.268293,-1.937693,-0.945948,-2.027712
64,-0.275032,0.882260,-0.391894,0.966122,0.587632,-0.319064,-0.277738,-0.404737,-0.337496,0.093352,...,-0.560213,-0.572468,0.157027,1.232282,-0.668631,0.300300,0.268293,0.458000,-0.282809,0.875118
65,-0.281141,0.610014,-0.406568,0.564739,1.150983,-0.322939,-0.275797,-0.419699,-0.339315,-0.738255,...,-0.074402,-0.539500,0.157027,1.232282,-1.577969,1.661662,0.268293,-0.340564,1.706607,-2.027712
66,-0.287655,0.319711,-0.369955,1.136492,0.045202,-0.316874,-0.299092,-0.382849,-0.334630,-0.896656,...,-0.665907,-0.568805,0.157027,-0.811503,-0.668631,1.661662,0.268293,0.458000,-0.945948,0.875118


In [52]:
smote = SMOTENC(categorical_features=[i for i in range(16,24)],sampling_strategy='auto',random_state=123,k_neighbors=3,n_jobs=3)

In [53]:
x,y = smote.fit_resample(main_x,y_cla_efc)

In [54]:
len(x),len(y)

(99, 99)

In [55]:
smoted_data = pd.DataFrame(x)

In [56]:
smoted_data.columns = mod_df[num_cols+cat_cols].columns

In [57]:
smoted_data

Unnamed: 0,Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),MA,...,exposed_time,retained_per,Device size type,Transparent conductive oxide type,Electron transport layer type,Sweep direction,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure
0,-0.323464,-1.276262,-0.337170,-0.523931,-1.284722,-0.314903,-0.356826,-0.354378,-0.327532,1.083360,...,-0.370628,-0.612762,0.157027,-0.811503,1.150045,0.300300,0.268293,0.458000,-0.282809,-0.576297
1,-0.291862,0.132195,-0.400656,-1.269457,-0.014569,-0.322863,-0.306780,-0.415367,-0.337084,1.083360,...,-0.240289,1.731626,0.157027,-0.811503,1.150045,0.300300,0.268293,-0.340564,-0.282809,-0.576297
2,-0.369964,-3.348665,-0.405260,0.089836,-1.269779,-0.326046,-0.404310,-0.422492,-0.338850,0.885358,...,-0.069662,-0.539500,0.157027,-0.811503,1.150045,0.300300,-0.560976,-1.139129,3.032884,-0.576297
3,-0.307289,-0.555366,-0.388201,0.907637,0.403834,-0.320612,-0.311128,-0.401744,-0.336943,-0.797655,...,-0.240289,1.731626,0.157027,-0.811503,1.150045,-1.061062,0.268293,-0.340564,-0.282809,-0.576297
4,-0.297628,-0.124772,0.062648,-0.839735,-1.120350,-0.247387,-0.316952,0.031508,-0.260531,1.083360,...,-0.074402,-0.550490,-1.622614,-0.811503,-1.577969,0.980981,0.268293,-2.736258,-0.945948,-2.027712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,-0.298812,-0.177536,-0.096992,0.366470,-0.132254,-0.268200,-0.310541,-0.090827,-0.296378,-0.211632,...,2.680227,-0.569844,-1.622614,-0.811503,1.150045,-0.380381,0.268293,0.458000,-0.282809,0.875118
95,-0.287109,0.344047,-0.218827,0.820862,-0.678832,-0.288675,-0.310957,-0.223116,-0.309760,-0.599654,...,-0.240289,1.731626,0.157027,-0.811503,1.150045,-1.061062,0.268293,0.458000,-0.282809,-0.576297
96,-0.311778,-0.755407,-0.160220,-0.843173,-0.097347,-0.327055,-0.155776,-0.203835,-0.297203,1.067269,...,-0.499420,-0.593224,-1.622614,-0.811503,-1.577969,-0.380381,0.268293,-2.736258,-0.945948,-2.027712
97,-0.283222,0.517268,0.153217,-0.434784,0.132713,-0.215527,-0.294724,0.178749,-0.249721,-0.620222,...,-0.240289,1.731626,0.157027,-0.811503,1.150045,-1.061062,0.268293,0.458000,-0.282809,-0.576297


In [58]:
target_data = pd.DataFrame(y)

In [59]:
smoted_data["efficiency"] = target_data

In [60]:
smoted_data.head()

Unnamed: 0,Voc (V),Voc per cell (V),Isc (mA),Jsc (mA/cm^2),FF (%),Pmax (mW),Vmax (V),Imax (mA),Apeture area (cm^2),MA,...,retained_per,Device size type,Transparent conductive oxide type,Electron transport layer type,Sweep direction,Perovksite depostion method,Hole transport layer type,Conductive electrode layer,Device structure,efficiency
0,-0.323464,-1.276262,-0.33717,-0.523931,-1.284722,-0.314903,-0.356826,-0.354378,-0.327532,1.08336,...,-0.612762,0.157027,-0.811503,1.150045,0.3003,0.268293,0.458,-0.282809,-0.576297,1
1,-0.291862,0.132195,-0.400656,-1.269457,-0.014569,-0.322863,-0.30678,-0.415367,-0.337084,1.08336,...,1.731626,0.157027,-0.811503,1.150045,0.3003,0.268293,-0.340564,-0.282809,-0.576297,2
2,-0.369964,-3.348665,-0.40526,0.089836,-1.269779,-0.326046,-0.40431,-0.422492,-0.33885,0.885358,...,-0.5395,0.157027,-0.811503,1.150045,0.3003,-0.560976,-1.139129,3.032884,-0.576297,1
3,-0.307289,-0.555366,-0.388201,0.907637,0.403834,-0.320612,-0.311128,-0.401744,-0.336943,-0.797655,...,1.731626,0.157027,-0.811503,1.150045,-1.061062,0.268293,-0.340564,-0.282809,-0.576297,0
4,-0.297628,-0.124772,0.062648,-0.839735,-1.12035,-0.247387,-0.316952,0.031508,-0.260531,1.08336,...,-0.55049,-1.622614,-0.811503,-1.577969,0.980981,0.268293,-2.736258,-0.945948,-2.027712,1


In [61]:
smoted_data.to_csv("./main_datasets/smoted_scaled_efficiency_cat_data.csv",index=False)

# UNWRAPING THE DATA INTO RAW FORM

In [63]:
y = smoted_data["efficiency"]

In [65]:
r_y = lbl_2.inverse_transform(y)

In [68]:
new_smoted_data = pd.DataFrame(r_y)
new_smoted_data.columns=["efficiency"]

In [69]:
new_smoted_data

Unnamed: 0,efficiency
0,0.146095
1,1.249928
2,0.146095
3,-0.957737
4,0.146095
...,...
94,1.249928
95,1.249928
96,1.249928
97,1.249928


In [84]:
x = smoted_data

In [86]:
x.drop("efficiency",axis=1,inplace=True)

In [89]:
sc.inverse_transform(x)

ValueError: operands could not be broadcast together with shapes (99,24) (28,) (99,24) 