### 1) Data Preparation

In [21]:
import pandas as pd
path = "C:/Users/ppirthip/OneDrive - Intel Corporation/Speckle/"
df = pd.read_csv(path+"SNR_R4_ww49.1_Speckle.csv.gz")

In [None]:
#create new column for target
# import numpy as np
# df["SPECKLE"] = np.where(df["DELTA"]==0,0,1)
# cols = df.columns.tolist()
# cols = [cols[-1]]+cols[:-1] #move speckle col to the front
# df = df.reindex(columns=cols)

In [22]:
df["DELTA"].value_counts()

0    13390
1     6317
2      129
Name: DELTA, dtype: int64

In [23]:
df["SPECKLE"].value_counts()

0    13390
1     6446
Name: SPECKLE, dtype: int64

In [None]:
df.head()

In [None]:
pd.options.display.max_seq_items = 2000
df.columns

In [None]:
df.shape

In [None]:
df.describe()

### Duplicate handling

In [24]:
def duplicates(df,keep):
    """
    Check for duplicates in VID and remove duplicates if there is any
    df[DataFrame]: input df
    keep: {‘first’, ‘last’, False}: Determines which duplicates (if any) to keep. 
    - first : Drop duplicates except for the first occurrence. 
    - last : Drop duplicates except for the last occurrence. 
    - False : Drop all duplicates.
    """
    if df['VID'].duplicated().any():
        print("There are duplicates in VID")
        df_dup = df[df.duplicated(subset=['VID'])] #get the duplicates in df 
        display(df_dup)
        df = df.drop_duplicates(subset=['VID'],keep=keep) #remove duplicates from df
        return df
    else:
        print("There are no duplicates in VID")
    

In [25]:
duplicates(df,keep="first")

There are no duplicates in VID


### 2) NA handling

In [26]:
df.shape

(19836, 1615)

In [27]:
# check column for nulls
def check_colna(df):
    """
    Check each column for nulls. Returns Feature, total null and % of null for each column
    df[DataFrame]: dataframe
    
    """
    colna_df = pd.DataFrame(columns =["Feature", "Total Null", "% of Null"])
    for col in df.columns: 
        #checking if there is any null in the column

        if df[col].isnull().sum()>0: 
            
            # if null present, total number of null in the column stores here
            total_null = df[col].isnull().sum() 
            new_row = {'Feature':col, 'Total Null':total_null, '% of Null':round(total_null*100/len(df),2)}
            #append row to the dataframe
            colna_df = colna_df.append(new_row, ignore_index=True)
            
    colna_df= colna_df.sort_values("% of Null", ascending=False)    
    return colna_df  

# check rows for nulls
def check_rowna(df,supporting_fs):  
    """
    Check each row for nulls.Returns VID, total null and % of null for each row
    df[DataFrame]: dataframe
    supporting_fs[list]: all features not used for ML except VID and SPECKLE(target)
    """
    df = df.drop(supporting_fs,axis=1)    
    colrow_df = pd.DataFrame(columns =["SPECKLE","VID", "Total Null", "% of Null"])
    for i in df.index: 
        #checking if there is any null in the row
        if df.iloc[i].isnull().sum()>0:             
            # if null present, total number of null in the row stores here
            total_null = df.iloc[i].isnull().sum() 
            new_row = {'SPECKLE':df.iloc[i,0],'VID':df.iloc[i,1], 'Total Null':total_null, '% of Null':round(total_null*100/(len(df.columns)-2),2)}
            #append row to the dataframe
            colrow_df = colrow_df.append(new_row, ignore_index=True)
            
    colrow_df= colrow_df.sort_values("% of Null", ascending=False)    
    return colrow_df   

# Drop columns based on NA threshold limit
def drop_NAcol(df,NA_limit):
    '''
    Drops columns based on proportion of NA in column
    df[DataFrame]: df
    NA_limit[float/int]: Columns with proportion of NA above NA_limit will be dropped
    '''
    threshold = len(df)*(1-NA_limit)
    df=df.dropna(axis=1, thresh=threshold)
    print(df.shape)
    return df


#### a) Check column for nulls

In [28]:
#check column na for whole df
colna_df =check_colna(df)
colna_df

Unnamed: 0,Feature,Total Null,% of Null
1230,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,16129,81.31
0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,13847,69.81
399,IDV_0134_PSVT3GVTO12_FULLDIE_TALL_0950_MED@132110,13847,69.81
392,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,13847,69.81
393,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0950_ME...,13847,69.81
...,...,...,...
1472,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1473,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1407,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1406,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85


In [None]:
# colna_df.to_csv(path+"/DataPreparation/NA_Cols.csv",index=False)

In [29]:
#check column na for speckle
colna_speckle_df =check_colna(df.loc[df['SPECKLE'] == 1])
colna_speckle_df

Unnamed: 0,Feature,Total Null,% of Null
1230,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,5268,81.73
0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,4845,75.16
399,IDV_0134_PSVT3GVTO12_FULLDIE_TALL_0950_MED@132110,4845,75.16
392,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,4845,75.16
393,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0950_ME...,4845,75.16
...,...,...,...
1472,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56
1473,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56
1406,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56
1405,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56


In [30]:
#check column na for non-speckle
colna_nonspeckle_df =check_colna(df.loc[df['SPECKLE'] == 0])
colna_nonspeckle_df

Unnamed: 0,Feature,Total Null,% of Null
1230,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,10861,81.11
875,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
882,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
889,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
888,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
...,...,...,...
820,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59
822,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59
824,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59
826,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59


#### b) Check row for nulls

In [None]:
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
rowna_df = check_rowna(df,supporting_fs)
rowna_df

In [None]:
rowna_df.loc[rowna_df["% of Null"] ==100.00]

In [None]:
# rowna_df.to_csv(path+"/DataPreparation/NA_Rows_aftercolremoval.csv",index=False)

#### c) Drop columns based on threshold limit

In [None]:
df = drop_NAcol(df,0.8) #drop columns with >80% NA 

In [None]:
check_colna(df)

#### d) Row NA handling

In [31]:
#check rows for NA after removing columns
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
rowna_df = check_rowna(df,supporting_fs)
rowna_df

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
8213,0,M1GC346401871,1593,100.00
10375,0,M11H9B4300835,1593,100.00
10377,0,M11H9B4301228,1593,100.00
10378,0,M11H9B4300032,1593,100.00
10379,0,M11H9B4301227,1593,100.00
...,...,...,...,...
11806,0,M05C0K5600161,1,0.06
11811,0,M0U25B1100396,1,0.06
11814,0,M0U25B1100907,1,0.06
11815,0,M0U25B1100798,1,0.06


In [32]:
#get rows with 100% NA
rowallNA_df = rowna_df.loc[rowna_df["% of Null"] ==100.00]
rowallNA_df

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
8213,0,M1GC346401871,1593,100.0
10375,0,M11H9B4300835,1593,100.0
10377,0,M11H9B4301228,1593,100.0
10378,0,M11H9B4300032,1593,100.0
10379,0,M11H9B4301227,1593,100.0
...,...,...,...,...
5463,0,M10P67G000951,1593,100.0
5464,0,M10P67G001238,1593,100.0
5468,0,M17U1C0300567,1593,100.0
5465,0,M10P67G000323,1593,100.0


In [33]:
#drop rows with 100% NA
df = df[~df["VID"].isin(rowallNA_df["VID"])]
df = df.reset_index(drop=True)
df

Unnamed: 0,SPECKLE,VID,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,M10P67G000001,N0286920_293_6_-3,N0286920,N028692,293,6,-3,H0280460_443_2_1,1,...,-8.210000e-07,-8.210000e-07,-8.330000e-07,-2.100000e-07,-2.240000e-07,-1.950000e-07,-1.910000e-07,-8.990000e-07,-8.840000e-07,-1.970000e-07
1,1,M10P67G000962,N0286920_225_5_-5,N0286920,N028692,225,5,-5,H0280460_443_2_18,1,...,-8.670000e-07,-8.870000e-07,-8.740000e-07,-2.110000e-07,-2.100000e-07,-1.900000e-07,-2.280000e-07,-8.620000e-07,-8.610000e-07,-1.920000e-07
2,1,M10P67G000808,N0292860_135_6_0,N0292860,N029286,135,6,0,H0256240_667_-10_6,1,...,-6.000000e-07,-6.500000e-07,-6.380000e-07,-1.440000e-07,-1.380000e-07,-1.440000e-07,-1.620000e-07,-5.820000e-07,-6.150000e-07,-1.460000e-07
3,1,M17U1C0300573,N0293360_159_3_1,N0293360,N029336,159,3,1,H0280460_443_0_-14,1,...,-1.170000e-06,-1.220000e-06,-1.230000e-06,-2.950000e-07,-2.930000e-07,-3.190000e-07,-3.070000e-07,-1.220000e-06,-1.190000e-06,-2.960000e-07
4,1,M10P67G000433,N0293340_045_1_5,N0293340,N029334,45,1,5,H0280460_444_9_13,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6372,0,M1G88H4701199,N0326460_212_4_6,N0326460,N032646,212,4,6,H0310990_073_5_-2,1,...,-7.190000e-07,-7.620000e-07,-7.720000e-07,-1.810000e-07,-1.860000e-07,-1.720000e-07,-1.780000e-07,-7.080000e-07,-7.430000e-07,-1.720000e-07
6373,0,M1G88H4700911,N0326460_209_-1_-2,N0326460,N032646,209,-1,-2,H0310990_073_-3_18,1,...,-9.140000e-07,-9.750000e-07,-9.730000e-07,-2.420000e-07,-2.400000e-07,-2.330000e-07,-2.310000e-07,-8.940000e-07,-8.840000e-07,-2.310000e-07
6374,0,M1G88H4700361,N0370830_775_-3_0,N0370830,N037083,775,-3,0,H0310990_083_-14_-3,1,...,-1.010000e-06,-9.770000e-07,-1.000000e-06,-2.500000e-07,-2.510000e-07,-3.010000e-07,-2.310000e-07,-9.300000e-07,-9.430000e-07,-2.310000e-07
6375,0,M1G88H4700053,N0342690_197_1_-6,N0342690,N034269,197,1,-6,H0310990_073_-10_15,14,...,-1.060000e-06,-1.120000e-06,-1.130000e-06,-2.610000e-07,-2.700000e-07,-2.700000e-07,-2.720000e-07,-1.090000e-06,-1.110000e-06,-2.270000e-07


In [34]:
df["SPECKLE"].value_counts()

0    4608
1    1769
Name: SPECKLE, dtype: int64

In [35]:
#check row na
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
check_rowna(df,supporting_fs)

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
573,1,M1888SS100613,1310,82.23
579,1,M1888SS100734,1310,82.23
581,1,M1888SS100614,1310,82.23
582,1,M1888SS100616,1310,82.23
577,1,M1888SS100692,1310,82.23
...,...,...,...,...
1536,0,M0LN682800123,1,0.06
1535,0,M0LN682801353,1,0.06
1534,0,M0LN682801352,1,0.06
1533,0,M0LN682800122,1,0.06


In [36]:
df = df.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)

In [37]:
df.shape

(6377, 1614)

In [38]:
#NA imputation
def NA_impute(df,imptype):
    """
    Impute NA
    df[DataFrame]:df
    imptype[string]: "mean" to impute data with mean, "median" to impute data with median
    """
    if imptype == "mean":        
        df = df.fillna(df.mean())
    if imptype == "median":
        df = df.fillna(df.median())
    return df

In [39]:
df = NA_impute(df,imptype="median")
df

Unnamed: 0,SPECKLE,VID,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,M10P67G000001,N0286920_293_6_-3,N0286920,N028692,293,6,-3,H0280460_443_2_1,1,...,-8.210000e-07,-8.210000e-07,-8.330000e-07,-2.100000e-07,-2.240000e-07,-1.950000e-07,-1.910000e-07,-8.990000e-07,-8.840000e-07,-1.970000e-07
1,1,M10P67G000962,N0286920_225_5_-5,N0286920,N028692,225,5,-5,H0280460_443_2_18,1,...,-8.670000e-07,-8.870000e-07,-8.740000e-07,-2.110000e-07,-2.100000e-07,-1.900000e-07,-2.280000e-07,-8.620000e-07,-8.610000e-07,-1.920000e-07
2,1,M10P67G000808,N0292860_135_6_0,N0292860,N029286,135,6,0,H0256240_667_-10_6,1,...,-6.000000e-07,-6.500000e-07,-6.380000e-07,-1.440000e-07,-1.380000e-07,-1.440000e-07,-1.620000e-07,-5.820000e-07,-6.150000e-07,-1.460000e-07
3,1,M17U1C0300573,N0293360_159_3_1,N0293360,N029336,159,3,1,H0280460_443_0_-14,1,...,-1.170000e-06,-1.220000e-06,-1.230000e-06,-2.950000e-07,-2.930000e-07,-3.190000e-07,-3.070000e-07,-1.220000e-06,-1.190000e-06,-2.960000e-07
4,1,M10P67G000433,N0293340_045_1_5,N0293340,N029334,45,1,5,H0280460_444_9_13,1,...,-8.520000e-07,-8.570000e-07,-8.640000e-07,-2.150000e-07,-2.250000e-07,-2.190000e-07,-2.170000e-07,-8.545000e-07,-8.510000e-07,-2.130000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6372,0,M1G88H4701199,N0326460_212_4_6,N0326460,N032646,212,4,6,H0310990_073_5_-2,1,...,-7.190000e-07,-7.620000e-07,-7.720000e-07,-1.810000e-07,-1.860000e-07,-1.720000e-07,-1.780000e-07,-7.080000e-07,-7.430000e-07,-1.720000e-07
6373,0,M1G88H4700911,N0326460_209_-1_-2,N0326460,N032646,209,-1,-2,H0310990_073_-3_18,1,...,-9.140000e-07,-9.750000e-07,-9.730000e-07,-2.420000e-07,-2.400000e-07,-2.330000e-07,-2.310000e-07,-8.940000e-07,-8.840000e-07,-2.310000e-07
6374,0,M1G88H4700361,N0370830_775_-3_0,N0370830,N037083,775,-3,0,H0310990_083_-14_-3,1,...,-1.010000e-06,-9.770000e-07,-1.000000e-06,-2.500000e-07,-2.510000e-07,-3.010000e-07,-2.310000e-07,-9.300000e-07,-9.430000e-07,-2.310000e-07
6375,0,M1G88H4700053,N0342690_197_1_-6,N0342690,N034269,197,1,-6,H0310990_073_-10_15,14,...,-1.060000e-06,-1.120000e-06,-1.130000e-06,-2.610000e-07,-2.700000e-07,-2.700000e-07,-2.720000e-07,-1.090000e-06,-1.110000e-06,-2.270000e-07


In [40]:
check_rowna(df,supporting_fs)

Unnamed: 0,SPECKLE,VID,Total Null,% of Null


### 3) Drop supporting features

In [41]:
supporting_fs = ['VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
df_sf = df[supporting_fs] #keep supporting fs
df = df.drop(supporting_fs,axis=1)
df

Unnamed: 0,SPECKLE,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,6016.90,12304.2,5791.55,12112.7,5994.37,12293.0,5887.32,12214.1,5245.07,...,-8.210000e-07,-8.210000e-07,-8.330000e-07,-2.100000e-07,-2.240000e-07,-1.950000e-07,-1.910000e-07,-8.990000e-07,-8.840000e-07,-1.970000e-07
1,1,6073.24,12214.1,6005.63,12163.4,6078.87,12174.7,6061.97,12236.6,5295.77,...,-8.670000e-07,-8.870000e-07,-8.740000e-07,-2.110000e-07,-2.100000e-07,-1.900000e-07,-2.280000e-07,-8.620000e-07,-8.610000e-07,-1.920000e-07
2,1,6343.66,12231.0,6163.38,12028.2,6264.79,12152.1,6219.72,12129.6,5695.77,...,-6.000000e-07,-6.500000e-07,-6.380000e-07,-1.440000e-07,-1.380000e-07,-1.440000e-07,-1.620000e-07,-5.820000e-07,-6.150000e-07,-1.460000e-07
3,1,5566.20,11453.5,5442.25,11357.8,5554.93,11414.1,5476.06,11391.5,4811.27,...,-1.170000e-06,-1.220000e-06,-1.230000e-06,-2.950000e-07,-2.930000e-07,-3.190000e-07,-3.070000e-07,-1.220000e-06,-1.190000e-06,-2.960000e-07
4,1,6247.89,12293.0,6112.68,12180.3,6219.72,12270.4,6163.38,12259.2,5470.42,...,-8.520000e-07,-8.570000e-07,-8.640000e-07,-2.150000e-07,-2.250000e-07,-2.190000e-07,-2.170000e-07,-8.545000e-07,-8.510000e-07,-2.130000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6372,0,6664.79,12450.7,6602.82,12371.8,6670.42,12462.0,6636.62,12416.9,5954.93,...,-7.190000e-07,-7.620000e-07,-7.720000e-07,-1.810000e-07,-1.860000e-07,-1.720000e-07,-1.780000e-07,-7.080000e-07,-7.430000e-07,-1.720000e-07
6373,0,6208.45,12056.3,6095.77,12000.0,6163.38,12056.3,6185.92,12078.9,5453.52,...,-9.140000e-07,-9.750000e-07,-9.730000e-07,-2.420000e-07,-2.400000e-07,-2.330000e-07,-2.310000e-07,-8.940000e-07,-8.840000e-07,-2.310000e-07
6374,0,6202.82,12169.0,6163.38,12118.3,6253.52,12185.9,6264.79,12247.9,5526.76,...,-1.010000e-06,-9.770000e-07,-1.000000e-06,-2.500000e-07,-2.510000e-07,-3.010000e-07,-2.310000e-07,-9.300000e-07,-9.430000e-07,-2.310000e-07
6375,0,5526.76,11549.3,5397.18,11493.0,5464.79,11509.9,5459.15,11554.9,4867.61,...,-1.060000e-06,-1.120000e-06,-1.130000e-06,-2.610000e-07,-2.700000e-07,-2.700000e-07,-2.720000e-07,-1.090000e-06,-1.110000e-06,-2.270000e-07


### 4) Converting negative to positive values

In [65]:
#To create a dataframe df_keep with IDV and HVQK token family columns
cols_to_keep=["IDV", "HVQK"]
IDV_HVQK_df = df[df.columns[df.columns.str.startswith(tuple(cols_to_keep))]]
df_keep = IDV_HVQK_df

In [66]:
df_keep.head()

Unnamed: 0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110,...,HVQK_VMIN_SHIFT_SCAN_UNCORE_LLCSFP02@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_LLCSFP11@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_LLCSFP12@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_LLCSFP21@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_LLCSFP23@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_MISC@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_PCIE@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_RLINK@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_SA@132110,HVQK_VMIN_SHIFT_SCAN_UNCORE_VNN@132110
0,6016.9,12304.2,5791.55,12112.7,5994.37,12293.0,5887.32,12214.1,5245.07,11600.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6073.24,12214.1,6005.63,12163.4,6078.87,12174.7,6061.97,12236.6,5295.77,11560.6,...,0.0,0.02,-555.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0
2,6343.66,12231.0,6163.38,12028.2,6264.79,12152.1,6219.72,12129.6,5695.77,11639.4,...,0.0,0.0,0.0,-555.0,0.0,0.02,0.0,0.0,0.0,0.0
3,5566.2,11453.5,5442.25,11357.8,5554.93,11414.1,5476.06,11391.5,4811.27,10777.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6247.89,12293.0,6112.68,12180.3,6219.72,12270.4,6163.38,12259.2,5470.42,11639.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
#To create dataframe - drop columns with IDV token family columns
df_to_convert = df.drop([col for col in df if col.startswith('IDV')], axis=1)

In [68]:
#To create dataframe - drop columns with HVQK token family columns
df_to_convert = df_to_convert.drop([col for col in df if col.startswith('HVQK')], axis=1)

In [69]:
df_to_convert.head()

Unnamed: 0,SPECKLE,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCCORE_M01_V1@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,0.044625,0.041133,0.020178,0.021403,0.013161,0.050175,0.002702,0.013882,0.062004,...,-8.21e-07,-8.21e-07,-8.33e-07,-2.1e-07,-2.24e-07,-1.95e-07,-1.91e-07,-8.99e-07,-8.84e-07,-1.97e-07
1,1,0.0375,0.054445,0.020281,0.021683,0.019436,0.092677,0.002525,0.013907,0.080011,...,-8.67e-07,-8.87e-07,-8.74e-07,-2.11e-07,-2.1e-07,-1.9e-07,-2.28e-07,-8.62e-07,-8.61e-07,-1.92e-07
2,1,0.047672,0.049816,0.021016,0.022506,0.013063,0.060671,0.007954,0.013618,0.076174,...,-6e-07,-6.5e-07,-6.38e-07,-1.44e-07,-1.38e-07,-1.44e-07,-1.62e-07,-5.82e-07,-6.15e-07,-1.46e-07
3,1,0.047462,0.049402,0.018823,0.019772,0.010239,0.043242,0.001651,0.007795,0.031176,...,-1.17e-06,-1.22e-06,-1.23e-06,-2.95e-07,-2.93e-07,-3.19e-07,-3.07e-07,-1.22e-06,-1.19e-06,-2.96e-07
4,1,0.048798,0.05056,0.019342,0.021034,0.013236,0.066157,0.002411,0.012175,0.048849,...,-8.52e-07,-8.57e-07,-8.64e-07,-2.15e-07,-2.25e-07,-2.19e-07,-2.17e-07,-8.545e-07,-8.51e-07,-2.13e-07


In [70]:
#Convert negative columns with specific token family prefix
df_positive = df_to_convert.abs()

In [71]:
df_positive.head()

Unnamed: 0,SPECKLE,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCCORE_M01_V1@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,0.044625,0.041133,0.020178,0.021403,0.013161,0.050175,0.002702,0.013882,0.062004,...,8.21e-07,8.21e-07,8.33e-07,2.1e-07,2.24e-07,1.95e-07,1.91e-07,8.99e-07,8.84e-07,1.97e-07
1,1,0.0375,0.054445,0.020281,0.021683,0.019436,0.092677,0.002525,0.013907,0.080011,...,8.67e-07,8.87e-07,8.74e-07,2.11e-07,2.1e-07,1.9e-07,2.28e-07,8.62e-07,8.61e-07,1.92e-07
2,1,0.047672,0.049816,0.021016,0.022506,0.013063,0.060671,0.007954,0.013618,0.076174,...,6e-07,6.5e-07,6.38e-07,1.44e-07,1.38e-07,1.44e-07,1.62e-07,5.82e-07,6.15e-07,1.46e-07
3,1,0.047462,0.049402,0.018823,0.019772,0.010239,0.043242,0.001651,0.007795,0.031176,...,1.17e-06,1.22e-06,1.23e-06,2.95e-07,2.93e-07,3.19e-07,3.07e-07,1.22e-06,1.19e-06,2.96e-07
4,1,0.048798,0.05056,0.019342,0.021034,0.013236,0.066157,0.002411,0.012175,0.048849,...,8.52e-07,8.57e-07,8.64e-07,2.15e-07,2.25e-07,2.19e-07,2.17e-07,8.545e-07,8.51e-07,2.13e-07


In [72]:
#To create dataset with some parameters converted to absolute values
#Concatenating df_keep and df_positive along columns
df = pd.concat([df_keep, df_positive], axis=1)

In [73]:
df.head()

Unnamed: 0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,6016.9,12304.2,5791.55,12112.7,5994.37,12293.0,5887.32,12214.1,5245.07,11600.0,...,8.21e-07,8.21e-07,8.33e-07,2.1e-07,2.24e-07,1.95e-07,1.91e-07,8.99e-07,8.84e-07,1.97e-07
1,6073.24,12214.1,6005.63,12163.4,6078.87,12174.7,6061.97,12236.6,5295.77,11560.6,...,8.67e-07,8.87e-07,8.74e-07,2.11e-07,2.1e-07,1.9e-07,2.28e-07,8.62e-07,8.61e-07,1.92e-07
2,6343.66,12231.0,6163.38,12028.2,6264.79,12152.1,6219.72,12129.6,5695.77,11639.4,...,6e-07,6.5e-07,6.38e-07,1.44e-07,1.38e-07,1.44e-07,1.62e-07,5.82e-07,6.15e-07,1.46e-07
3,5566.2,11453.5,5442.25,11357.8,5554.93,11414.1,5476.06,11391.5,4811.27,10777.5,...,1.17e-06,1.22e-06,1.23e-06,2.95e-07,2.93e-07,3.19e-07,3.07e-07,1.22e-06,1.19e-06,2.96e-07
4,6247.89,12293.0,6112.68,12180.3,6219.72,12270.4,6163.38,12259.2,5470.42,11639.4,...,8.52e-07,8.57e-07,8.64e-07,2.15e-07,2.25e-07,2.19e-07,2.17e-07,8.545e-07,8.51e-07,2.13e-07


### 5) Handling unary columns 

In [42]:
def unary(df):
    """
    Checks for unary columns. If there are unary columns, the unary columns will be printed and removed from the df
    df[DataFrame]: input dataframe
    """
    unarycolumns = [col for col in df.columns if len(df[col].unique())==1]
    if unarycolumns:
        print("The unary column are:",unarycolumns)
        df = df.drop(unarycolumns,axis=1)
        print("Unary columns dropped!")
        return df 
    else:
        print("There are no unary columns!")

In [43]:
unary(df)

There are no unary columns!


In [None]:
# supporting_fs = ['SPECKLE','VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
#        'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
#        'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
#        'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
#        'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
# sf_df = df[supporting_fs]
# df1 = df.drop(supporting_fs,axis=1) 
# df1.head()

In [None]:
# df1.columns.difference(df2.columns)

In [None]:
# df1['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110']

In [None]:
# df2 = df1.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)
# df_neg = df2.loc[:,((df2 == -555).any()) | ((df2 == -999).any())]

In [None]:
# df_neg_sf = pd.concat([sf_df,df_neg],axis=1)
# df_neg_sf

In [None]:
# df_neg_sf.to_csv(path+"/DataPreparation/Negative_Cols_555_999.csv",index=False)

### 6) Train-validation split

In [44]:
from sklearn.model_selection import train_test_split
from collections import Counter
def randomsamp(df,val_size):
    """
    Split whole dataset into train and validation using random sampling
    Returns X_train, X_val, y_train, y_val
    df[DataFrame]: input dataframe    
    val_size[float]:Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split. 
                    Train size is complement of val size
    """
    X= df.drop(["SPECKLE"],axis=1)
    y= df["SPECKLE"] 
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = val_size,random_state=42,stratify= df["SPECKLE"])
    if type(val_size)==float:
        print("Train-val split completed with",(1-val_size)*100,"-",val_size*100,"split in train-val")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_val is:",X_val.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_val is:",y_val.shape)
    print("Distribution of y_train:",Counter(y_train))
    print("Distribution of y_val:",Counter(y_val))
    
    X_train = X_train.reset_index(drop=True)
    X_val = X_val.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)        
           
    return X_train, X_val, y_train, y_val

def targetrandomsamp(df,speckle_test_size,nonspeckle_test_size):
    """
    Dataset is split into speckle/non-speckle first. The speckle/non-speckle datasets are then split 
    into train and validation using random sampling, followed by merging of the speckle/non-speckle  
    to return X_train, X_val, y_train, y_val
    
    df[DataFrame]: input dataframe  
    
    speckle_test_size[float/int]: Val size for speckle 
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split. 
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size
    
    nonspeckle_test_size[float/int]: Val size for non-speckle 
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split. 
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size
    
    """
    #split data into speckle/nonspeckle
    df_s = df.loc[df["SPECKLE"] == 1] #speckle
    df_ns = df.loc[df["SPECKLE"] == 0] #nonspeckle
    
    #split speckle/nonspeckle data into train/validation
    print("For speckle data:")
    X_train_s, X_val_s, y_train_s, y_val_s = randomsamp(df_s,val_size=speckle_test_size)
    print("\nFor non-speckle data:")
    X_train_ns, X_val_ns, y_train_ns, y_val_ns = randomsamp(df_ns,val_size=nonspeckle_test_size)
    
    #concat the speckle/non-speckle train and validation
    X_train = pd.concat([X_train_s,X_train_ns], ignore_index=True)
    X_val = pd.concat([X_val_s,X_val_ns], ignore_index=True)
    y_train = pd.concat([y_train_s,y_train_ns], ignore_index=True)
    y_val = pd.concat([y_val_s,y_val_ns], ignore_index=True)
    
    X_train = X_train.reset_index(drop=True)
    X_val = X_val.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)  
    
    print("\nFinal dataset:")
    print("Distribution of y_train:",Counter(y_train))
    print("Distribution of y_val:",Counter(y_val))
    return X_train, X_val, y_train, y_val

In [45]:
df["SPECKLE"].value_counts()

0    4608
1    1769
Name: SPECKLE, dtype: int64

In [46]:
#train-test split on whole data
X_train, X_val, y_train, y_val = randomsamp(df,val_size=0.3)

Train-val split completed with 70.0 - 30.0 split in train-val
Shape of X_train is: (4463, 1592)
Shape of X_val is: (1914, 1592)
Shape of y_train is: (4463,)
Shape of y_val is: (1914,)
Distribution of y_train: Counter({0: 3225, 1: 1238})
Distribution of y_val: Counter({0: 1383, 1: 531})


In [47]:
#train-test split on speckle/non-speckle then merge
X_train, X_val, y_train, y_val = targetrandomsamp(df,speckle_test_size=500,nonspeckle_test_size=2000)

For speckle data:
Shape of X_train is: (1269, 1592)
Shape of X_val is: (500, 1592)
Shape of y_train is: (1269,)
Shape of y_val is: (500,)
Distribution of y_train: Counter({1: 1269})
Distribution of y_val: Counter({1: 500})

For non-speckle data:
Shape of X_train is: (2608, 1592)
Shape of X_val is: (2000, 1592)
Shape of y_train is: (2608,)
Shape of y_val is: (2000,)
Distribution of y_train: Counter({0: 2608})
Distribution of y_val: Counter({0: 2000})

Final dataset:
Distribution of y_train: Counter({0: 2608, 1: 1269})
Distribution of y_val: Counter({0: 2000, 1: 500})


### 7) Scaling

In [48]:
#To scale X train and X validation (test) data using various scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

def scale_data(X_train,X_val,scaler_type):
    """
    Scaling X train and validation with standardization or normalization
    params:
    X_train[DataFrame]: input X train
    X_val[DataFrame]: input X validation (test)
    scaler_type[None/string]: input scaling method
    - "Standardization" for Standard Scaler
    - "Normalization" for Min Max Scaler    

    """           
    if scaler_type == "Standardization":
        scaler = StandardScaler()
    if scaler_type == "Normalization":
        scaler = MinMaxScaler()

    X_train_scaled = scaler.fit_transform(X_train)
    X_train_scaled = pd.DataFrame(X_train_scaled,columns= X_train.columns)
    X_val_scaled = scaler.transform(X_val)
    X_val_scaled = pd.DataFrame(X_val_scaled,columns= X_val.columns)
       
    return X_train_scaled, X_val_scaled

In [49]:
#X train-test scaled using Standard Scaler
X_train_scaled, X_val_scaled = scale_data(X_train,X_val,scaler_type = "Standardization")

In [50]:
#X train-test scaled using Standard Scaler
X_train_scaled, X_val_scaled = scale_data(X_train,X_val,scaler_type = "Normalization")

In [51]:
X_train_scaled

Unnamed: 0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,0.539722,0.562830,0.536038,0.471447,0.538460,0.521612,0.542725,0.479134,0.554139,0.539381,...,0.617149,0.629761,0.607012,0.622703,0.657123,0.666161,0.597625,0.601952,0.589827,0.639862
1,0.656545,0.669383,0.644145,0.571407,0.651582,0.616707,0.669747,0.604173,0.690019,0.699700,...,0.546150,0.558659,0.549451,0.518919,0.629527,0.609224,0.611252,0.563991,0.579004,0.546909
2,0.432243,0.469955,0.441443,0.362866,0.454750,0.435163,0.443419,0.363074,0.452229,0.437332,...,0.638995,0.678517,0.633176,0.668108,0.681269,0.671854,0.628772,0.657267,0.660173,0.559879
3,0.572431,0.543722,0.563062,0.434273,0.556559,0.504322,0.568129,0.440465,0.571124,0.510195,...,0.734025,0.727273,0.710623,0.711351,0.776130,0.732587,0.663812,0.713124,0.726190,0.726329
4,0.441591,0.576507,0.436938,0.480018,0.427602,0.512967,0.445727,0.494031,0.456473,0.542279,...,0.513381,0.528187,0.497122,0.451892,0.620904,0.556083,0.556745,0.553145,0.568182,0.497190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3872,0.691589,0.726757,0.673424,0.625723,0.710406,0.717581,0.667439,0.624987,0.726115,0.772614,...,0.734025,0.743017,0.739403,0.750270,0.779579,0.774340,0.726105,0.733731,0.738636,0.756593
3873,0.581775,0.530045,0.567568,0.334314,0.567872,0.481252,0.570441,0.336292,0.698514,0.545229,...,0.967231,0.947689,0.953951,0.985297,0.929631,0.966028,0.949192,0.957158,0.975108,0.948984
3874,0.710280,0.620205,0.711715,0.559996,0.717193,0.608062,0.704390,0.553566,0.690019,0.574363,...,0.843801,0.826816,0.833072,0.808649,0.846844,0.844563,0.805918,0.805315,0.849567,0.832252
3875,0.574766,0.639362,0.558561,0.519982,0.570137,0.579262,0.579676,0.535711,0.577493,0.580211,...,0.659749,0.678517,0.633176,0.633514,0.662297,0.654773,0.624878,0.650759,0.688312,0.622568


In [52]:
X_val_scaled

Unnamed: 0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,0.651871,0.734953,0.608109,0.617152,0.622170,0.685866,0.618940,0.624987,0.639064,0.708497,...,0.922447,0.911630,0.899006,0.882162,0.914108,0.928070,0.870158,0.891540,0.918831,0.888457
1,0.612149,0.666667,0.572073,0.542854,0.619909,0.636912,0.591224,0.565452,0.558387,0.609346,...,0.687602,0.720163,0.685505,0.611892,0.701966,0.681344,0.696905,0.700108,0.697511,0.683096
2,0.539722,0.562830,0.536038,0.471447,0.538460,0.521612,0.542725,0.479134,0.554139,0.539381,...,0.734571,0.751651,0.714809,0.748108,0.776130,0.753464,0.726105,0.729393,0.724567,0.756593
3,0.446260,0.469955,0.441443,0.377168,0.450228,0.423653,0.448035,0.369044,0.380042,0.393604,...,0.600765,0.589132,0.596546,0.544865,0.627803,0.616815,0.580105,0.569414,0.611472,0.607436
4,0.427569,0.590184,0.423425,0.494269,0.436650,0.541767,0.434179,0.505969,0.426750,0.539381,...,0.578919,0.594210,0.570382,0.620541,0.629527,0.603530,0.597625,0.580260,0.611472,0.613921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.586449,0.519133,0.576579,0.428593,0.570137,0.472607,0.586604,0.434496,0.605094,0.495653,...,0.838340,0.832402,0.826269,0.804324,0.853743,0.865439,0.774771,0.826464,0.839827,0.791180
2496,0.317755,0.469955,0.322074,0.371437,0.314480,0.391938,0.323326,0.375013,0.295114,0.396502,...,0.453304,0.477400,0.423862,0.408649,0.531218,0.516227,0.521705,0.450108,0.465368,0.464764
2497,0.796729,0.874339,0.788287,0.794300,0.803168,0.867461,0.789840,0.797623,0.802545,0.898002,...,0.767886,0.767902,0.751439,0.784865,0.758882,0.781932,0.796185,0.757592,0.781926,0.709036
2498,0.364486,0.393424,0.369372,0.279998,0.373304,0.351578,0.371825,0.291653,0.392780,0.349876,...,0.606226,0.634840,0.612245,0.605405,0.657123,0.669956,0.589838,0.664317,0.668290,0.618245


### 8) Highly correlated features

In [54]:
#To create correlation matrix for X_train (features)
correlation_matrix = X_train.corr().abs()
display(correlation_matrix)

Unnamed: 0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,1.000000,0.858414,0.986821,0.849228,0.993756,0.854063,0.987386,0.849588,0.972810,0.873460,...,0.373184,0.374451,0.370373,0.377584,0.362125,0.368922,0.392636,0.383256,0.381629,0.359046
IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,0.858414,1.000000,0.832407,0.975542,0.851902,0.990674,0.833389,0.976864,0.786876,0.979779,...,0.183277,0.184588,0.179200,0.206905,0.182987,0.192327,0.215912,0.194251,0.194457,0.181164
IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,0.986821,0.832407,1.000000,0.852482,0.987806,0.834816,0.997611,0.849930,0.972040,0.856047,...,0.394148,0.395065,0.390792,0.397622,0.381495,0.389994,0.412221,0.405942,0.404148,0.380616
IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,0.849228,0.975542,0.852482,1.000000,0.850099,0.977043,0.850495,0.995941,0.785327,0.964348,...,0.182832,0.184187,0.178142,0.205829,0.180734,0.193365,0.215781,0.195942,0.195946,0.181885
IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,0.993756,0.851902,0.987806,0.850099,1.000000,0.859126,0.987459,0.848895,0.973928,0.872777,...,0.377494,0.378817,0.374250,0.381609,0.366559,0.372907,0.396731,0.387166,0.385712,0.363809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,0.368922,0.192327,0.389994,0.193365,0.372907,0.196504,0.382524,0.189915,0.434334,0.232863,...,0.965415,0.963168,0.964588,0.946489,0.948090,1.000000,0.941410,0.958051,0.959755,0.937245
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,0.392636,0.215912,0.412221,0.215781,0.396731,0.221083,0.405493,0.212930,0.457787,0.259285,...,0.958308,0.955386,0.960003,0.946396,0.947058,0.941410,1.000000,0.958684,0.959475,0.932744
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,0.383256,0.194251,0.405942,0.195942,0.387166,0.198413,0.398783,0.192899,0.451673,0.238484,...,0.983792,0.983835,0.984263,0.959330,0.966945,0.958051,0.958684,1.000000,0.992183,0.956843
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,0.381629,0.194457,0.404148,0.195946,0.385712,0.198641,0.396970,0.192878,0.449316,0.237668,...,0.984994,0.984817,0.985141,0.958901,0.969331,0.959755,0.959475,0.992183,1.000000,0.958719


In [56]:
#To select the upper trigular matrix from the correlation matrix of features
import numpy as np
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape),k=1).astype(np.bool))
display(upper_tri)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape),k=1).astype(np.bool))


Unnamed: 0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110,IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,,0.858414,0.986821,0.849228,0.993756,0.854063,0.987386,0.849588,0.972810,0.873460,...,0.373184,0.374451,0.370373,0.377584,0.362125,0.368922,0.392636,0.383256,0.381629,0.359046
IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110,,,0.832407,0.975542,0.851902,0.990674,0.833389,0.976864,0.786876,0.979779,...,0.183277,0.184588,0.179200,0.206905,0.182987,0.192327,0.215912,0.194251,0.194457,0.181164
IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110,,,,0.852482,0.987806,0.834816,0.997611,0.849930,0.972040,0.856047,...,0.394148,0.395065,0.390792,0.397622,0.381495,0.389994,0.412221,0.405942,0.404148,0.380616
IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110,,,,,0.850099,0.977043,0.850495,0.995941,0.785327,0.964348,...,0.182832,0.184187,0.178142,0.205829,0.180734,0.193365,0.215781,0.195942,0.195946,0.181885
IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,,,,,,0.859126,0.987459,0.848895,0.973928,0.872777,...,0.377494,0.378817,0.374250,0.381609,0.366559,0.372907,0.396731,0.387166,0.385712,0.363809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,,,,,,,,,,,...,,,,,,,0.941410,0.958051,0.959755,0.937245
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,,,,,,,,,,,...,,,,,,,,0.958684,0.959475,0.932744
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,,,,,,,,,,,...,,,,,,,,,0.992183,0.956843
SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,,,,,,,,,,,...,,,,,,,,,,0.958719


In [57]:
#To display columns with absolute correlation > 0.95
highly_corr_features_95 = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
display(); display(highly_corr_features_95)

['IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0004_NOM3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0004_NOM3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0006_SVT3GISO12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0006_SVT3GISO12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_000

In [62]:
#To display columns with absolute correlation > 0.90
highly_corr_features_90 = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
display(); display(highly_corr_features_90)

['IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0003_SVT3GSEM12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0004_NOM3GNES12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0004_NOM3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0004_NOM3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_0950_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
 'IDV_0005_NOM3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
 'IDV_0006_SVT3GISO12_FULLDIE_SOC_0650_MED@132110',
 'IDV_0006_SVT

### 2) Sampling to address class imbalance

In [40]:
Counter(y_train)

Counter({0: 3225, 1: 1238})

In [42]:
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

#SMOTE
def SMOTE_sampling(X_train_scaled,y_train,over_amt,under_amt=None):
    """
    Can choose SMOTE or SMOTE + random undersampling
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[DataFrame] : target in train data
    over_amt [int] : amount of synthetic data to be generated with SMOTE (speckle)
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done
    
    """
    print("Class distribution before sampling:", Counter(y_train))
    print("Ratio of class distribution before sampling :",round(Counter(y_train)[0]/Counter(y_train)[1],2))
    
    if under_amt == None: #SMOTE only
        sm = SMOTE(sampling_strategy= {1:over_amt},random_state=42)
        X_s, y_s = sm.fit_resample(X_train_scaled, y_train)
        
    else: #SMOTE + random undersampling
        sm = SMOTE(sampling_strategy={1:over_amt},random_state=42)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        pipeline = Pipeline(steps=[('o', sm), ('u', under)])
        X_s, y_s = pipeline.fit_resample(X_train_scaled, y_train)    
         
    print("Class distribution after sampling:", Counter(y_s))     
    print("Ratio of class distribution after sampling :",round(Counter(y_s)[0]/Counter(y_s)[1],2)) 
    
    return X_s, y_s

#BorderlineSMOTE
def BorderlineSMOTE_sampling(X_train_scaled,y_train,over_amt,under_amt=None):
    """
    Can choose BorderlineSMOTE or BorderlineSMOTE + random undersampling
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[DataFrame] : target in train data
    supporting_fs[list] : list of supporting features to be removed from X_train_scaled
    over_amt [int] : amount of synthetic data to be generated with BorderlineSMOTE
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done
    
    """
    print("Class distribution before sampling:", Counter(y_train))
    print("Ratio of class distribution before sampling :",round(Counter(y_train)[0]/Counter(y_train)[1],2))
    
    if under_amt == None: #BorderlineSMOTE only
        bsm = BorderlineSMOTE(sampling_strategy= {1:over_amt},random_state=42)
        X_bs, y_bs = bsm.fit_resample(X_train_scaled, y_train)
        
    else: #BorderlineSMOTE + random undersampling
        bsm = BorderlineSMOTE(sampling_strategy={1:over_amt},random_state=42)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        pipeline = Pipeline(steps=[('o', bsm), ('u', under)])
        X_bs, y_bs = pipeline.fit_resample(X_train_scaled, y_train)    
         
    print("Class distribution after sampling:", Counter(y_bs))     
    print("Ratio of class distribution after sampling :",round(Counter(y_bs)[0]/Counter(y_bs)[1],2)) 
    
    return X_bs, y_bs

#ADASYN
def ADASYN_sampling(X_train_scaled,y_train,over_amt,under_amt=None):
    """
    Can choose ADASYN or ADASYN + random undersampling
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[DataFrame] : target in train data
    supporting_fs[list] : list of supporting features to be removed from X_train_scaled
    over_amt [int] : amount of synthetic data to be generated with ADASYN
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done
    
    """
    print("Class distribution before sampling:", Counter(y_train))
    print("Ratio of class distribution before sampling :",round(Counter(y_train)[0]/Counter(y_train)[1],2))
    
    if under_amt == None: #ADASYN only
        ad = ADASYN(sampling_strategy= {1:over_amt},random_state=42)
        X_a, y_a = ad.fit_resample(X_train_scaled, y_train)
        
    else: #ADASYN + random undersampling
        ad = ADASYN(sampling_strategy={1:over_amt},random_state=42)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        pipeline = Pipeline(steps=[('o', ad), ('u', under)])
        X_a, y_a = pipeline.fit_resample(X_train_scaled, y_train)    
         
    print("Class distribution after sampling:", Counter(y_a))     
    print("Ratio of class distribution after sampling :",round(Counter(y_a)[0]/Counter(y_a)[1],2)) 
    
    return X_a, y_a

In [44]:
#SMOTE
X_s, y_s = SMOTE_sampling(X_train_scaled,y_train,over_amt=2000,under_amt=2000)      

Class distribution before sampling: Counter({0: 3225, 1: 1238})
Ratio of class distribution before sampling : 2.61
Class distribution after sampling: Counter({0: 2000, 1: 2000})
Ratio of class distribution after sampling : 1.0


In [45]:
#BorderlineSMOTE
X_bs, y_bs = SMOTE_sampling(X_train_scaled,y_train,over_amt=2000,under_amt=2000)      

Class distribution before sampling: Counter({0: 3225, 1: 1238})
Ratio of class distribution before sampling : 2.61
Class distribution after sampling: Counter({0: 2000, 1: 2000})
Ratio of class distribution after sampling : 1.0


In [46]:
#ADASYN
X_a, y_a = SMOTE_sampling(X_train_scaled,y_train,over_amt=2000,under_amt=None)      

Class distribution before sampling: Counter({0: 3225, 1: 1238})
Ratio of class distribution before sampling : 2.61
Class distribution after sampling: Counter({0: 3225, 1: 2000})
Ratio of class distribution after sampling : 1.61


In [None]:
#create new column for target
# import numpy as np
# df["SPECKLE"] = np.where(df["DELTA"]==0,0,1)
# cols = df.columns.tolist()
# cols = [cols[-1]]+cols[:-1] #move speckle col to the front
# df = df.reindex(columns=cols)

In [None]:
# df.to_csv(path+"SNR_R2_ww35.2_Speckle.csv",index=False)

In [None]:
# supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
#        'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
#        'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
#        'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
#        'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
# df1 = df.drop(supporting_fs,axis=1) 
# df1.head()

In [None]:
import spacy
spacy.cli.download("en_core_web_sm")

### Feature Selection

### Model Building