### 1) Data Preparation

In [1]:
import pandas as pd
path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/ML based speckle POC/"
df = pd.read_csv(path+"SNR_R4_ww49.1_Speckle.csv.gz")

In [None]:
#create new column for target
# import numpy as np
# df["SPECKLE"] = np.where(df["DELTA"]==0,0,1)
# cols = df.columns.tolist()
# cols = [cols[-1]]+cols[:-1] #move speckle col to the front
# df = df.reindex(columns=cols)

In [2]:
df["DELTA"].value_counts()

0    13390
1     6317
2      129
Name: DELTA, dtype: int64

In [3]:
df["SPECKLE"].value_counts()

0    13390
1     6446
Name: SPECKLE, dtype: int64

In [None]:
df.head()

In [4]:
pd.options.display.max_seq_items = 2000
df.columns

Index(['SPECKLE', 'VID', 'ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]',
       'ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS',
       'DELTA', 'IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110',
       'IDV_0001_SVT3GNES12_FULLDIE_SOC_0950_MED@132110',
       'IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
       'IDV_0001_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
       'IDV_0002_SVT3GNES12_FULLDIE_SOC_0650_MED@132110',
       'IDV_0002_SVT3GNES12_FULLDIE_SOC_0950_MED@132110',
       'IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0650_MED@132110',
       'IDV_0002_SVT3GNES12_FULLDIE_SOC_TALL_0950_MED@132110',
       'IDV_0003_SVT3GSEM12_FULLDIE_SOC_0650_MED@132110',
       'IDV_0003_SVT3

In [None]:
df.shape

In [None]:
df.describe()

### Duplicate handling

In [5]:
def duplicates(df,keep):
    """
    Check for duplicates in VID and remove duplicates if there is any
    df[DataFrame]: input df
    keep: {‘first’, ‘last’, False}: Determines which duplicates (if any) to keep. 
    - first : Drop duplicates except for the first occurrence. 
    - last : Drop duplicates except for the last occurrence. 
    - False : Drop all duplicates.
    """
    if df['VID'].duplicated().any():
        print("There are duplicates in VID")
        df_dup = df[df.duplicated(subset=['VID'])] #get the duplicates in df 
        display(df_dup)
        df = df.drop_duplicates(subset=['VID'],keep=keep) #remove duplicates from df
        return df
    else:
        print("There are no duplicates in VID")
    

In [6]:
duplicates(df,keep="first")

There are no duplicates in VID


### NA handling

In [7]:
df.shape

(19836, 1615)

In [8]:
# check column for nulls
def check_colna(df):
    """
    Check each column for nulls. Returns Feature, total null and % of null for each column
    df[DataFrame]: dataframe
    
    """
    colna_df = pd.DataFrame(columns =["Feature", "Total Null", "% of Null"])
    for col in df.columns: 
        #checking if there is any null in the column

        if df[col].isnull().sum()>0: 
            
            # if null present, total number of null in the column stores here
            total_null = df[col].isnull().sum() 
            new_row = {'Feature':col, 'Total Null':total_null, '% of Null':round(total_null*100/len(df),2)}
            #append row to the dataframe
            colna_df = colna_df.append(new_row, ignore_index=True)
            
    colna_df= colna_df.sort_values("% of Null", ascending=False)    
    return colna_df  

# check rows for nulls
def check_rowna(df,supporting_fs):  
    """
    Check each row for nulls.Returns VID, total null and % of null for each row
    df[DataFrame]: dataframe
    supporting_fs[list]: all features not used for ML except VID and SPECKLE(target)
    """
    df = df.drop(supporting_fs,axis=1)    
    colrow_df = pd.DataFrame(columns =["SPECKLE","VID", "Total Null", "% of Null"])
    for i in df.index: 
        #checking if there is any null in the row
        if df.iloc[i].isnull().sum()>0:             
            # if null present, total number of null in the row stores here
            total_null = df.iloc[i].isnull().sum() 
            new_row = {'SPECKLE':df.iloc[i,0],'VID':df.iloc[i,1], 'Total Null':total_null, '% of Null':round(total_null*100/(len(df.columns)-2),2)}
            #append row to the dataframe
            colrow_df = colrow_df.append(new_row, ignore_index=True)
            
    colrow_df= colrow_df.sort_values("% of Null", ascending=False)    
    return colrow_df   

# Drop columns based on NA threshold limit
def drop_NAcol(df,NA_limit):
    '''
    Drops columns based on proportion of NA in column
    df[DataFrame]: df
    NA_limit[float/int]: Columns with proportion of NA above NA_limit will be dropped
    '''
    threshold = len(df)*(1-NA_limit)
    df=df.dropna(axis=1, thresh=threshold)
    print(df.shape)
    return df


#### 1) Check column for nulls

In [9]:
#check column na for whole df
colna_df =check_colna(df)
colna_df

Unnamed: 0,Feature,Total Null,% of Null
1230,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,16129,81.31
0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,13847,69.81
399,IDV_0134_PSVT3GVTO12_FULLDIE_TALL_0950_MED@132110,13847,69.81
392,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,13847,69.81
393,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0950_ME...,13847,69.81
...,...,...,...
1472,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1473,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1407,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1406,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85


In [None]:
# colna_df.to_csv(path+"/DataPreparation/NA_Cols.csv",index=False)

In [10]:
#check column na for speckle
colna_speckle_df =check_colna(df.loc[df['SPECKLE'] == 1])
colna_speckle_df

Unnamed: 0,Feature,Total Null,% of Null
1230,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,5268,81.73
0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,4845,75.16
399,IDV_0134_PSVT3GVTO12_FULLDIE_TALL_0950_MED@132110,4845,75.16
392,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,4845,75.16
393,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0950_ME...,4845,75.16
...,...,...,...
1472,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56
1473,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56
1406,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56
1405,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,4677,72.56


In [11]:
#check column na for non-speckle
colna_nonspeckle_df =check_colna(df.loc[df['SPECKLE'] == 0])
colna_nonspeckle_df

Unnamed: 0,Feature,Total Null,% of Null
1230,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,10861,81.11
875,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
882,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
889,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
888,TPI_SIU_STATIC::SIU_STATIC_AM_K_STRESS_X_X_X_S...,9006,67.26
...,...,...,...
820,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59
822,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59
824,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59
826,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SN...,8782,65.59


#### 2) Check row for nulls

In [12]:
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
rowna_df = check_rowna(df,supporting_fs)
rowna_df

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
8213,0,M1GC346401871,1593,100.00
10375,0,M11H9B4300835,1593,100.00
10377,0,M11H9B4301228,1593,100.00
10378,0,M11H9B4300032,1593,100.00
10379,0,M11H9B4301227,1593,100.00
...,...,...,...,...
11806,0,M05C0K5600161,1,0.06
11811,0,M0U25B1100396,1,0.06
11814,0,M0U25B1100907,1,0.06
11815,0,M0U25B1100798,1,0.06


In [13]:
rowna_df.loc[rowna_df["% of Null"] ==100.00]

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
8213,0,M1GC346401871,1593,100.0
10375,0,M11H9B4300835,1593,100.0
10377,0,M11H9B4301228,1593,100.0
10378,0,M11H9B4300032,1593,100.0
10379,0,M11H9B4301227,1593,100.0
...,...,...,...,...
5463,0,M10P67G000951,1593,100.0
5464,0,M10P67G001238,1593,100.0
5468,0,M17U1C0300567,1593,100.0
5465,0,M10P67G000323,1593,100.0


In [None]:
# rowna_df.to_csv(path+"/DataPreparation/NA_Rows_aftercolremoval.csv",index=False)

#### 3) Drop columns based on threshold limit

In [14]:
df = drop_NAcol(df,0.8) #drop columns with >80% NA 

(19836, 1614)


In [15]:
check_colna(df)

Unnamed: 0,Feature,Total Null,% of Null
0,IDV_0001_SVT3GNES12_FULLDIE_SOC_0650_MED@132110,13847,69.81
390,IDV_0133_PSVT3GVTO12_FULLDIE_TALL_0650_MED@132110,13847,69.81
392,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,13847,69.81
393,IDV_0134_PSVT3GVTO12_FULLDIE_CORE_TALL_0950_ME...,13847,69.81
394,IDV_0134_PSVT3GVTO12_FULLDIE_SOC_0650_MED@132110,13847,69.81
...,...,...,...
1471,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1472,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1406,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85
1405,SIO_LKG_X::HI1P05_X_LKG_K_END_X_X_VMAX_X_1MA_P...,13459,67.85


In [16]:
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
rowna_df = check_rowna(df,supporting_fs)
rowna_df

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
7185,0,M1GC346401179,1592,100.00
9472,0,M1WE241501821,1592,100.00
9453,0,M1WE241501337,1592,100.00
9454,0,M1WE241501323,1592,100.00
9455,0,M1WE241500587,1592,100.00
...,...,...,...,...
10827,0,M12T24A201085,1,0.06
4973,0,M10P67G000982,1,0.06
303,1,M1FW102400277,1,0.06
6734,0,M1F84S6201738,1,0.06


In [17]:
rowallNA_df = rowna_df.loc[rowna_df["% of Null"] ==100.00]
rowallNA_df

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
7185,0,M1GC346401179,1592,100.0
9472,0,M1WE241501821,1592,100.0
9453,0,M1WE241501337,1592,100.0
9454,0,M1WE241501323,1592,100.0
9455,0,M1WE241500587,1592,100.0
...,...,...,...,...
4743,1,M1NQ071700560,1592,100.0
4742,1,M1NQ071700139,1592,100.0
5124,0,M10P67G000855,1592,100.0
4741,1,M1NQ071700211,1592,100.0


### Row NA handling

In [18]:
#drop rows with 100% NA
df = df[~df["VID"].isin(rowallNA_df["VID"])]
df

Unnamed: 0,SPECKLE,VID,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,M10P67G000001,N0286920_293_6_-3,N0286920,N028692,293,6,-3,H0280460_443_2_1,1,...,-8.210000e-07,-8.210000e-07,-8.330000e-07,-2.100000e-07,-2.240000e-07,-1.950000e-07,-1.910000e-07,-8.990000e-07,-8.840000e-07,-1.970000e-07
1,1,M10P67G000962,N0286920_225_5_-5,N0286920,N028692,225,5,-5,H0280460_443_2_18,1,...,-8.670000e-07,-8.870000e-07,-8.740000e-07,-2.110000e-07,-2.100000e-07,-1.900000e-07,-2.280000e-07,-8.620000e-07,-8.610000e-07,-1.920000e-07
2,1,M10P67G000808,N0292860_135_6_0,N0292860,N029286,135,6,0,H0256240_667_-10_6,1,...,-6.000000e-07,-6.500000e-07,-6.380000e-07,-1.440000e-07,-1.380000e-07,-1.440000e-07,-1.620000e-07,-5.820000e-07,-6.150000e-07,-1.460000e-07
3,1,M17U1C0300573,N0293360_159_3_1,N0293360,N029336,159,3,1,H0280460_443_0_-14,1,...,-1.170000e-06,-1.220000e-06,-1.230000e-06,-2.950000e-07,-2.930000e-07,-3.190000e-07,-3.070000e-07,-1.220000e-06,-1.190000e-06,-2.960000e-07
4,1,M10P67G000433,N0293340_045_1_5,N0293340,N029334,45,1,5,H0280460_444_9_13,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19825,0,M1G88H4701199,N0326460_212_4_6,N0326460,N032646,212,4,6,H0310990_073_5_-2,1,...,-7.190000e-07,-7.620000e-07,-7.720000e-07,-1.810000e-07,-1.860000e-07,-1.720000e-07,-1.780000e-07,-7.080000e-07,-7.430000e-07,-1.720000e-07
19826,0,M1G88H4700911,N0326460_209_-1_-2,N0326460,N032646,209,-1,-2,H0310990_073_-3_18,1,...,-9.140000e-07,-9.750000e-07,-9.730000e-07,-2.420000e-07,-2.400000e-07,-2.330000e-07,-2.310000e-07,-8.940000e-07,-8.840000e-07,-2.310000e-07
19829,0,M1G88H4700361,N0370830_775_-3_0,N0370830,N037083,775,-3,0,H0310990_083_-14_-3,1,...,-1.010000e-06,-9.770000e-07,-1.000000e-06,-2.500000e-07,-2.510000e-07,-3.010000e-07,-2.310000e-07,-9.300000e-07,-9.430000e-07,-2.310000e-07
19832,0,M1G88H4700053,N0342690_197_1_-6,N0342690,N034269,197,1,-6,H0310990_073_-10_15,14,...,-1.060000e-06,-1.120000e-06,-1.130000e-06,-2.610000e-07,-2.700000e-07,-2.700000e-07,-2.720000e-07,-1.090000e-06,-1.110000e-06,-2.270000e-07


In [34]:
df = df.reset_index(drop=True)
df

Unnamed: 0,SPECKLE,VID,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,M10P67G000001,N0286920_293_6_-3,N0286920,N028692,293,6,-3,H0280460_443_2_1,1,...,-8.210000e-07,-8.210000e-07,-8.330000e-07,-2.100000e-07,-2.240000e-07,-1.950000e-07,-1.910000e-07,-8.990000e-07,-8.840000e-07,-1.970000e-07
1,1,M10P67G000962,N0286920_225_5_-5,N0286920,N028692,225,5,-5,H0280460_443_2_18,1,...,-8.670000e-07,-8.870000e-07,-8.740000e-07,-2.110000e-07,-2.100000e-07,-1.900000e-07,-2.280000e-07,-8.620000e-07,-8.610000e-07,-1.920000e-07
2,1,M10P67G000808,N0292860_135_6_0,N0292860,N029286,135,6,0,H0256240_667_-10_6,1,...,-6.000000e-07,-6.500000e-07,-6.380000e-07,-1.440000e-07,-1.380000e-07,-1.440000e-07,-1.620000e-07,-5.820000e-07,-6.150000e-07,-1.460000e-07
3,1,M17U1C0300573,N0293360_159_3_1,N0293360,N029336,159,3,1,H0280460_443_0_-14,1,...,-1.170000e-06,-1.220000e-06,-1.230000e-06,-2.950000e-07,-2.930000e-07,-3.190000e-07,-3.070000e-07,-1.220000e-06,-1.190000e-06,-2.960000e-07
4,1,M10P67G000433,N0293340_045_1_5,N0293340,N029334,45,1,5,H0280460_444_9_13,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6372,0,M1G88H4701199,N0326460_212_4_6,N0326460,N032646,212,4,6,H0310990_073_5_-2,1,...,-7.190000e-07,-7.620000e-07,-7.720000e-07,-1.810000e-07,-1.860000e-07,-1.720000e-07,-1.780000e-07,-7.080000e-07,-7.430000e-07,-1.720000e-07
6373,0,M1G88H4700911,N0326460_209_-1_-2,N0326460,N032646,209,-1,-2,H0310990_073_-3_18,1,...,-9.140000e-07,-9.750000e-07,-9.730000e-07,-2.420000e-07,-2.400000e-07,-2.330000e-07,-2.310000e-07,-8.940000e-07,-8.840000e-07,-2.310000e-07
6374,0,M1G88H4700361,N0370830_775_-3_0,N0370830,N037083,775,-3,0,H0310990_083_-14_-3,1,...,-1.010000e-06,-9.770000e-07,-1.000000e-06,-2.500000e-07,-2.510000e-07,-3.010000e-07,-2.310000e-07,-9.300000e-07,-9.430000e-07,-2.310000e-07
6375,0,M1G88H4700053,N0342690_197_1_-6,N0342690,N034269,197,1,-6,H0310990_073_-10_15,14,...,-1.060000e-06,-1.120000e-06,-1.130000e-06,-2.610000e-07,-2.700000e-07,-2.700000e-07,-2.720000e-07,-1.090000e-06,-1.110000e-06,-2.270000e-07


In [35]:
df["SPECKLE"].value_counts()

0    4608
1    1769
Name: SPECKLE, dtype: int64

In [36]:
#check row na
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
check_rowna(df,supporting_fs)

Unnamed: 0,SPECKLE,VID,Total Null,% of Null
215,1,M1888SS100706,1309,82.22
220,1,M1888SS100614,1309,82.22
213,1,M1888SS100613,1309,82.22
214,1,M1888SS100703,1309,82.22
216,1,M1888SS100708,1309,82.22
...,...,...,...,...
512,0,M1F84S6201738,1,0.06
290,0,M10P67G000982,1,0.06
745,0,M12T24A201085,1,0.06
560,0,M1DY252001944,1,0.06


In [37]:
df = df.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)

In [38]:
df.shape

(6377, 1613)

In [39]:
#NA imputation
def NA_impute(df,imptype):
    """
    Impute NA
    df[DataFrame]:df
    imptype[string]: "mean" to impute data with mean, "median" to impute data with median
    """
    if imptype == "mean":        
        df = df.fillna(df.mean())
    if imptype == "median":
        df = df.fillna(df.median())
    return df

In [40]:
df = NA_impute(df,imptype="median")
df

Unnamed: 0,SPECKLE,VID,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,M10P67G000001,N0286920_293_6_-3,N0286920,N028692,293,6,-3,H0280460_443_2_1,1,...,-8.210000e-07,-8.210000e-07,-8.330000e-07,-2.100000e-07,-2.240000e-07,-1.950000e-07,-1.910000e-07,-8.990000e-07,-8.840000e-07,-1.970000e-07
1,1,M10P67G000962,N0286920_225_5_-5,N0286920,N028692,225,5,-5,H0280460_443_2_18,1,...,-8.670000e-07,-8.870000e-07,-8.740000e-07,-2.110000e-07,-2.100000e-07,-1.900000e-07,-2.280000e-07,-8.620000e-07,-8.610000e-07,-1.920000e-07
2,1,M10P67G000808,N0292860_135_6_0,N0292860,N029286,135,6,0,H0256240_667_-10_6,1,...,-6.000000e-07,-6.500000e-07,-6.380000e-07,-1.440000e-07,-1.380000e-07,-1.440000e-07,-1.620000e-07,-5.820000e-07,-6.150000e-07,-1.460000e-07
3,1,M17U1C0300573,N0293360_159_3_1,N0293360,N029336,159,3,1,H0280460_443_0_-14,1,...,-1.170000e-06,-1.220000e-06,-1.230000e-06,-2.950000e-07,-2.930000e-07,-3.190000e-07,-3.070000e-07,-1.220000e-06,-1.190000e-06,-2.960000e-07
4,1,M10P67G000433,N0293340_045_1_5,N0293340,N029334,45,1,5,H0280460_444_9_13,1,...,-8.520000e-07,-8.570000e-07,-8.640000e-07,-2.150000e-07,-2.250000e-07,-2.190000e-07,-2.170000e-07,-8.545000e-07,-8.510000e-07,-2.130000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6372,0,M1G88H4701199,N0326460_212_4_6,N0326460,N032646,212,4,6,H0310990_073_5_-2,1,...,-7.190000e-07,-7.620000e-07,-7.720000e-07,-1.810000e-07,-1.860000e-07,-1.720000e-07,-1.780000e-07,-7.080000e-07,-7.430000e-07,-1.720000e-07
6373,0,M1G88H4700911,N0326460_209_-1_-2,N0326460,N032646,209,-1,-2,H0310990_073_-3_18,1,...,-9.140000e-07,-9.750000e-07,-9.730000e-07,-2.420000e-07,-2.400000e-07,-2.330000e-07,-2.310000e-07,-8.940000e-07,-8.840000e-07,-2.310000e-07
6374,0,M1G88H4700361,N0370830_775_-3_0,N0370830,N037083,775,-3,0,H0310990_083_-14_-3,1,...,-1.010000e-06,-9.770000e-07,-1.000000e-06,-2.500000e-07,-2.510000e-07,-3.010000e-07,-2.310000e-07,-9.300000e-07,-9.430000e-07,-2.310000e-07
6375,0,M1G88H4700053,N0342690_197_1_-6,N0342690,N034269,197,1,-6,H0310990_073_-10_15,14,...,-1.060000e-06,-1.120000e-06,-1.130000e-06,-2.610000e-07,-2.700000e-07,-2.700000e-07,-2.720000e-07,-1.090000e-06,-1.110000e-06,-2.270000e-07


In [41]:
check_rowna(df,supporting_fs)

Unnamed: 0,SPECKLE,VID,Total Null,% of Null


### Handling unary columns

In [42]:
def unary(df):
    """
    Checks for unary columns. If there are unary columns, the unary columns will be printed and removed from the df
    df[DataFrame]: input dataframe
    """
    unarycolumns = [col for col in df.columns if len(df[col].unique())==1]
    if unarycolumns:
        print("The unary column are:",unarycolumns)
        df = df.drop(unarycolumns,axis=1)
        print("Unary columns dropped!")
        return df 
    else:
        print("There are no unary columns!")

In [43]:
unary(df)

There are no unary columns!


In [None]:
# supporting_fs = ['SPECKLE','VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
#        'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
#        'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
#        'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
#        'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
# sf_df = df[supporting_fs]
# df1 = df.drop(supporting_fs,axis=1) 
# df1.head()

In [None]:
# df1.columns.difference(df2.columns)

In [None]:
# df1['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110']

In [None]:
# df2 = df1.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)
# df_neg = df2.loc[:,((df2 == -555).any()) | ((df2 == -999).any())]

In [None]:
# df_neg_sf = pd.concat([sf_df,df_neg],axis=1)
# df_neg_sf

In [None]:
# df_neg_sf.to_csv(path+"/DataPreparation/Negative_Cols_555_999.csv",index=False)

### Train-validation split

In [44]:
from sklearn.model_selection import train_test_split
from collections import Counter
def randomsamp(df,val_size):
    """
    Split whole dataset into train and validation using random sampling
    Returns X_train, X_val, y_train, y_val
    df[DataFrame]: input dataframe    
    val_size[float]:Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split. 
                    Train size is complement of val size
    """
    X= df.drop(["SPECKLE"],axis=1)
    y= df["SPECKLE"] 
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = val_size,random_state=42,stratify= df["SPECKLE"])
    if type(val_size)==float:
        print("Train-val split completed with",(1-val_size)*100,"-",val_size*100,"split in train-val")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_val is:",X_val.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_val is:",y_val.shape)
    print("Distribution of y_train:",Counter(y_train))
    print("Distribution of y_val:",Counter(y_val))
    
    return X_train, X_val, y_train, y_val

def targetrandomsamp(df,speckle_test_size,nonspeckle_test_size):
    """
    Dataset is split into speckle/non-speckle first. The speckle/non-speckle datasets are then split 
    into train and validation using random sampling, followed by merging of the speckle/non-speckle  
    to return X_train, X_val, y_train, y_val
    
    df[DataFrame]: input dataframe  
    
    speckle_test_size[float/int]: Val size for speckle 
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split. 
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size
    
    nonspeckle_test_size[float/int]: Val size for non-speckle 
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split. 
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size
    
    """
    #split data into speckle/nonspeckle
    df_s = df.loc[df["SPECKLE"] == 1] #speckle
    df_ns = df.loc[df["SPECKLE"] == 0] #nonspeckle
    
    #split speckle/nonspeckle data into train/validation
    print("For speckle data:")
    X_train_s, X_val_s, y_train_s, y_val_s = randomsamp(df_s,val_size=speckle_test_size)
    print("\nFor non-speckle data:")
    X_train_ns, X_val_ns, y_train_ns, y_val_ns = randomsamp(df_ns,val_size=nonspeckle_test_size)
    
    #concat the speckle/non-speckle train and validation
    X_train = pd.concat([X_train_s,X_train_ns], ignore_index=True)
    X_val = pd.concat([X_val_s,X_val_ns], ignore_index=True)
    y_train = pd.concat([y_train_s,y_train_ns], ignore_index=True)
    y_val = pd.concat([y_val_s,y_val_ns], ignore_index=True)
    
    print("\nFinal dataset:")
    print("Distribution of y_train:",Counter(y_train))
    print("Distribution of y_val:",Counter(y_val))
    return X_train, X_val, y_train, y_val

In [45]:
df["SPECKLE"].value_counts()

0    4608
1    1769
Name: SPECKLE, dtype: int64

In [53]:
#train-test split on whole data
X_train, X_val, y_train, y_val = randomsamp(df,val_size=0.3)

Train-val split completed with 70.0 - 30.0 split in train-val
Shape of X_train is: (4463, 1612)
Shape of X_val is: (1914, 1612)
Shape of y_train is: (4463,)
Shape of y_val is: (1914,)
Distribution of y_train: Counter({0: 3225, 1: 1238})
Distribution of y_val: Counter({0: 1383, 1: 531})


In [49]:
#train-test split on speckle/non-speckle then merge
X_train, X_val, y_train, y_val = targetrandomsamp(df,speckle_test_size=500,nonspeckle_test_size=2000)

For speckle data:
Shape of X_train is: (1269, 1612)
Shape of X_val is: (500, 1612)
Shape of y_train is: (1269,)
Shape of y_val is: (500,)
Distribution of y_train: Counter({1: 1269})
Distribution of y_val: Counter({1: 500})

For non-speckle data:
Shape of X_train is: (2608, 1612)
Shape of X_val is: (2000, 1612)
Shape of y_train is: (2608,)
Shape of y_val is: (2000,)
Distribution of y_train: Counter({0: 2608})
Distribution of y_val: Counter({0: 2000})

Final dataset:
Distribution of y_train: Counter({0: 2608, 1: 1269})
Distribution of y_val: Counter({0: 2000, 1: 500})


### 2) Sampling to address class imbalance

In [51]:
 #to change later after adding scaling
X_train_scaled = X_train

In [55]:
Counter(y_train)

Counter({0: 3225, 1: 1238})

In [78]:
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

#SMOTE
def SMOTE_sampling(X_train_scaled,y_train,supporting_fs,over_amt,under_amt=None):
    """
    Can choose SMOTE or SMOTE + random undersampling
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[DataFrame] : target in train data
    supporting_fs[list] : list of supporting features to be removed from X_train_scaled
    over_amt [int] : amount of synthetic data to be generated with SMOTE
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done
    
    """
    X_train_scaled = X_train_scaled.drop(supporting_fs,axis=1)
    print("Class distribution before sampling:", Counter(y_train))
    print("Ratio of class distribution before sampling :",round(Counter(y_train)[0]/Counter(y_train)[1],2))
    
    if under_amt == None: #SMOTE only
        sm = SMOTE(sampling_strategy= {1:over_amt},random_state=42)
        X_s, y_s = sm.fit_resample(X_train_scaled, y_train)
        
    else: #SMOTE + random undersampling
        sm = SMOTE(sampling_strategy={1:over_amt},random_state=42)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        pipeline = Pipeline(steps=[('o', sm), ('u', under)])
        X_s, y_s = pipeline.fit_resample(X_train_scaled, y_train)    
         
    print("Class distribution after sampling:", Counter(y_s))     
    print("Ratio of class distribution after sampling :",round(Counter(y_s)[0]/Counter(y_s)[1],2)) 
    
    return X_s, y_s

#BorderlineSMOTE
def BorderlineSMOTE_sampling(X_train_scaled,y_train,supporting_fs,over_amt,under_amt=None):
    """
    Can choose BorderlineSMOTE or BorderlineSMOTE + random undersampling
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[DataFrame] : target in train data
    supporting_fs[list] : list of supporting features to be removed from X_train_scaled
    over_amt [int] : amount of synthetic data to be generated with BorderlineSMOTE
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done
    
    """
    X_train_scaled = X_train_scaled.drop(supporting_fs,axis=1)
    print("Class distribution before sampling:", Counter(y_train))
    print("Ratio of class distribution before sampling :",round(Counter(y_train)[0]/Counter(y_train)[1],2))
    
    if under_amt == None: #BorderlineSMOTE only
        bsm = BorderlineSMOTE(sampling_strategy= {1:over_amt},random_state=42)
        X_bs, y_bs = bsm.fit_resample(X_train_scaled, y_train)
        
    else: #BorderlineSMOTE + random undersampling
        bsm = BorderlineSMOTE(sampling_strategy={1:over_amt},random_state=42)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        pipeline = Pipeline(steps=[('o', bsm), ('u', under)])
        X_bs, y_bs = pipeline.fit_resample(X_train_scaled, y_train)    
         
    print("Class distribution after sampling:", Counter(y_bs))     
    print("Ratio of class distribution after sampling :",round(Counter(y_bs)[0]/Counter(y_bs)[1],2)) 
    
    return X_bs, y_bs

#ADASYN
def ADASYN_sampling(X_train_scaled,y_train,supporting_fs,over_amt,under_amt=None):
    """
    Can choose ADASYN or ADASYN + random undersampling
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[DataFrame] : target in train data
    supporting_fs[list] : list of supporting features to be removed from X_train_scaled
    over_amt [int] : amount of synthetic data to be generated with ADASYN
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done
    
    """
    X_train_scaled = X_train_scaled.drop(supporting_fs,axis=1)
    print("Class distribution before sampling:", Counter(y_train))
    print("Ratio of class distribution before sampling :",round(Counter(y_train)[0]/Counter(y_train)[1],2))
    
    if under_amt == None: #ADASYN only
        ad = ADASYN(sampling_strategy= {1:over_amt},random_state=42)
        X_a, y_a = ad.fit_resample(X_train_scaled, y_train)
        
    else: #ADASYN + random undersampling
        ad = ADASYN(sampling_strategy={1:over_amt},random_state=42)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        pipeline = Pipeline(steps=[('o', ad), ('u', under)])
        X_a, y_a = pipeline.fit_resample(X_train_scaled, y_train)    
         
    print("Class distribution after sampling:", Counter(y_a))     
    print("Ratio of class distribution after sampling :",round(Counter(y_a)[0]/Counter(y_a)[1],2)) 
    
    return X_a, y_a

In [71]:
#SMOTE
supporting_fs = ['VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']

X_s, y_s = SMOTE_sampling(X_train_scaled,y_train,supporting_fs,over_amt=2000,under_amt=2000)      

Class distribution before sampling: Counter({0: 3225, 1: 1238})
Ratio of class distribution before sampling : 2.61
Class distribution after sampling: Counter({0: 2000, 1: 2000})
Ratio of class distribution after sampling : 1.0


In [80]:
#BorderlineSMOTE
supporting_fs = ['VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']

X_bs, y_bs = SMOTE_sampling(X_train_scaled,y_train,supporting_fs,over_amt=2000,under_amt=2000)      

Class distribution before sampling: Counter({0: 3225, 1: 1238})
Ratio of class distribution before sampling : 2.61
Class distribution after sampling: Counter({0: 2000, 1: 2000})
Ratio of class distribution after sampling : 1.0


In [82]:
#ADASYN
supporting_fs = ['VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
       'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
       'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']

X_a, y_a = SMOTE_sampling(X_train_scaled,y_train,supporting_fs,over_amt=2000,under_amt=None)      

Class distribution before sampling: Counter({0: 3225, 1: 1238})
Ratio of class distribution before sampling : 2.61
Class distribution after sampling: Counter({0: 3225, 1: 2000})
Ratio of class distribution after sampling : 1.61


In [None]:
#create new column for target
# import numpy as np
# df["SPECKLE"] = np.where(df["DELTA"]==0,0,1)
# cols = df.columns.tolist()
# cols = [cols[-1]]+cols[:-1] #move speckle col to the front
# df = df.reindex(columns=cols)

In [None]:
# df.to_csv(path+"SNR_R2_ww35.2_Speckle.csv",index=False)

In [None]:
# supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
#        'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
#        'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
#        'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
#        'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
# df1 = df.drop(supporting_fs,axis=1) 
# df1.head()

### Feature Selection

### Model Building