### A) Data Preparation

In [None]:
n

In [None]:
df.shape

#### Check and remove rows with blank delta

In [None]:
#check for rows with blank delta
import numpy as np
df1 = df[df["DELTA"].isnull()]
df1

In [None]:
#make folder for dataprep if it doesn't exist
import os
dataprep_path = path + 'DataPreparation/' 
if not os.path.exists(dataprep_path):
    os.makedirs(dataprep_path)
    print("DataPreparation folder created!")

In [None]:
df1.to_csv(dataprep_path+"Blank_delta.csv",index=False)

In [None]:
#remove rows with blank delta
import numpy as np
df = df[df["DELTA"].notnull()]
df

In [None]:
df["DELTA"].value_counts()

In [None]:
#create new column SPECKLE as target (0 when DELTA is 0, 1 otherwise)
import numpy as np
#df was produced by boolean filtering; work on an explicit copy so the column assignment is not made on a slice
df = df.copy()
df["SPECKLE"] = np.where(df["DELTA"]==0,0,1)
#move speckle col to the front by name, so re-running this cell never moves a different column
cols = ["SPECKLE"] + [col for col in df.columns if col != "SPECKLE"]
df = df.reindex(columns=cols)

In [None]:
df["SPECKLE"].value_counts()

In [None]:
df.head()

In [None]:
# pd.options.display.max_seq_items = 2000
# df.columns

In [None]:
df = df.reset_index(drop=True)
df.shape

In [None]:
# df.describe()

In [None]:
df.to_csv(dataprep_path+"SNR_R5_ww51.4_Blankdeltarem.csv",index=False)

### 1) Check for VID duplicates (Unique VID)

In [None]:
df.shape

In [None]:
df["SPECKLE"].value_counts()

In [None]:
def check_duplicates(df):
    """
    Check for duplicates in VID and return every row that shares a VID.

    df[DataFrame]: input df, must contain a "VID" column

    Returns a DataFrame with all rows whose VID occurs more than once
    (the first occurrence as well as the repeats), in their original order.
    When no VID is duplicated an empty DataFrame with the same columns is
    returned, so the result can always be used as a DataFrame
    (e.g. written out with to_csv).
    """
    # keep=False flags the first occurrence and every repeat in one pass
    dup_mask = df.duplicated(subset=['VID'], keep=False)
    if dup_mask.any():
        print("There are duplicates in VID")
    else:
        print("There are no duplicates in VID")
    return df[dup_mask]
    

In [None]:
df_dup = check_duplicates(df)
df_dup #78 rows of duplicates, 39 duplicated VIDs -> ONLY 1 VID has values, the rest all blanks

In [None]:
df_dup.to_csv(dataprep_path+"SNR_R5_ww51.4_DuplicateVID.csv",index=False)

In [None]:
#only duplicate VID M1BH477200042 has values, the rest all NAN
df_dup_withval = df.loc[df['VID'] == 'M1BH477200042']
df_dup_withval

In [None]:
def duplicates_handling(df,keep):
    """
    Return df without the rows that repeat a VID.

    df[DataFrame]: input df, must contain a "VID" column
    keep: {'first', 'last', False}: which of the rows sharing a VID survive.
    - 'first' : only the first occurrence of each VID is kept.
    - 'last'  : only the last occurrence of each VID is kept.
    - False   : every row whose VID occurs more than once is removed.
    """
    # rows flagged here are exactly the ones that have to go
    to_remove = df.duplicated(subset=['VID'], keep=keep)
    return df[~to_remove]

In [None]:
#drop all duplicates for 38 VIDs, take unique row for VID M1BH477200042
#step 1: remove every row whose VID occurs more than once
df = duplicates_handling(df,keep=False)
#step 2: add back the rows saved in df_dup_withval (selected by VID before the drop)
df = pd.concat([df, df_dup_withval], ignore_index=True)
#step 3: of the rows added back, keep only the first one per VID
df = duplicates_handling(df,keep="first")
df

In [None]:
check_duplicates(df)

In [None]:
df["SPECKLE"].value_counts()

### 2) NA handling (NA removal/imputation)

In [None]:
# check column for nulls
def check_colna(df):
    """
    Check each column for nulls. Returns Feature, total null and % of null for each column

    df[DataFrame]: dataframe

    Returns a DataFrame with the columns "Feature", "Total Null" and
    "% of Null", holding one row per column of df that contains at least
    one null, sorted by "% of Null" in descending order. Columns without
    nulls are left out.
    """
    # count the nulls of all columns once instead of twice per column
    null_counts = df.isnull().sum()
    n_rows = len(df)
    # collect the rows in a list and build the frame once;
    # DataFrame.append no longer exists in pandas >= 2.0
    rows = [
        {'Feature': col, 'Total Null': total_null, '% of Null': total_null*100/n_rows}
        for col, total_null in null_counts.items()
        if total_null > 0
    ]
    colna_df = pd.DataFrame(rows, columns=["Feature", "Total Null", "% of Null"])
    colna_df = colna_df.sort_values("% of Null", ascending=False)
    return colna_df

# check rows for nulls
def check_rowna(df,supporting_fs):
    """
    Check each row for nulls. Returns SPECKLE, VID, total null and % of null for each row

    df[DataFrame]: dataframe, must contain the columns "SPECKLE" and "VID"
    supporting_fs[list]: all features not used for ML except VID and SPECKLE(target)

    Returns a DataFrame with the columns "SPECKLE", "VID", "Total Null" and
    "% of Null", holding one row per row of df that has at least one null
    after the supporting features are dropped, sorted by "% of Null" in
    descending order. "% of Null" is relative to the number of remaining
    columns minus two (SPECKLE and VID), rounded to 2 decimals.
    """
    df = df.drop(supporting_fs,axis=1)
    # per-row null count computed in one vectorised pass; this does not depend
    # on the index being 0..n-1, unlike a label loop combined with iloc
    null_counts = df.isnull().sum(axis=1)
    has_null = (null_counts > 0).to_numpy()
    totals = null_counts.to_numpy()[has_null]
    n_features = len(df.columns)-2
    # SPECKLE and VID are read by name, so the result stays correct when the
    # column order of df changes
    colrow_df = pd.DataFrame(
        {
            "SPECKLE": df["SPECKLE"].to_numpy()[has_null],
            "VID": df["VID"].to_numpy()[has_null],
            "Total Null": totals,
            "% of Null": (totals*100/n_features).round(2),
        },
        columns=["SPECKLE", "VID", "Total Null", "% of Null"],
    )
    colrow_df = colrow_df.sort_values("% of Null", ascending=False)
    return colrow_df

# Drop columns based on NA threshold limit
def drop_NAcol(df,NA_limit):
    '''
    Drops columns based on proportion of NA in column
    df[DataFrame]: df
    NA_limit[float/int]: Columns with proportion of NA above NA_limit will be dropped

    Prints the shape of the resulting DataFrame and returns it. Columns whose
    NA proportion is exactly NA_limit are kept.
    '''
    n_rows = len(df)
    if n_rows == 0:
        # no rows means no NA proportion to judge; keep every column
        df = df.copy()
    else:
        # compare the NA proportion directly with the limit; deriving a
        # non-null count from len(df)*(1-NA_limit) suffers from float error
        # (1-0.7 is 0.30000000000000004) and drops columns sitting exactly
        # on the limit
        na_fraction = df.isnull().sum().to_numpy() / n_rows
        df = df.loc[:, na_fraction <= NA_limit]
    print(df.shape)
    return df


#### a) Check column for nulls

In [None]:
print(df.shape)
print(df["SPECKLE"].value_counts())

#### i) check nulls in whole dataset

In [None]:
#check column na for whole df
colna_df =check_colna(df)
colna_df

In [None]:
#check columns with 100% NA
colna_df.loc[colna_df["% of Null"] ==100]

In [None]:
print(len(colna_df.loc[colna_df["% of Null"] ==100]))
colna_df.to_csv(dataprep_path+"NA_Cols.csv",index=False)

In [None]:
#drop columns with all NA
df = df.dropna(axis=1, how='all')
df.shape

In [None]:
check_colna(df)

#### ii) check nulls in column for speckle only

In [None]:
df.shape

In [None]:
#check column na for speckle
colna_speckle_df =check_colna(df.loc[df['SPECKLE'] == 1])
colna_speckle_df

In [None]:
len(colna_speckle_df.loc[colna_speckle_df["% of Null"] ==100])

In [None]:
#write into the DataPreparation folder via dataprep_path, like the other exports
colna_speckle_df.to_csv(dataprep_path+"NA_Cols_speckle.csv",index=False)

In [None]:
#drop columns with all NA in the speckle data
col100na_speckle_df = colna_speckle_df.loc[colna_speckle_df["% of Null"] ==100]
df = df.loc[:,~df.columns.isin(col100na_speckle_df['Feature'])]
df.shape

In [None]:
colna_speckle_df =check_colna(df.loc[df['SPECKLE'] == 1])
colna_speckle_df 

#### iii) check nulls in column for non-speckle only

In [None]:
#check column na for non-speckle
colna_nonspeckle_df =check_colna(df.loc[df['SPECKLE'] == 0])
colna_nonspeckle_df

In [None]:
#check column na for whole df after removing columns with 100% NA in whole dataset and speckle data
colna_df_no100NA =check_colna(df)
colna_df_no100NA

In [None]:
df.shape

In [None]:
#write into the DataPreparation folder via dataprep_path, like the other exports
colna_df_no100NA.to_csv(dataprep_path+"NA_Cols_without100NA.csv",index=False)

#### b) Drop columns based on threshold limit

In [None]:
print("No. of columns with NA>= 70%:", len(colna_df_no100NA[colna_df_no100NA["% of Null"] >= 70]))
print("No. of columns with NA>= 80%:", len(colna_df_no100NA[colna_df_no100NA["% of Null"] >= 80]))
print("No. of columns with NA>= 90%:", len(colna_df_no100NA[colna_df_no100NA["% of Null"] >= 90]))

In [None]:
df = drop_NAcol(df,0.8) #drop columns with more than 80% NA

In [None]:
df.shape

In [None]:
check_colna(df)

#### c) Check row for nulls

In [None]:
print(df.shape)
print(df["SPECKLE"].value_counts())

In [None]:
# pd.options.display.max_seq_items = 2000
# df.columns

In [None]:
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]', 'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS',
       'INCOMING', 'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS',
       'OUT_BITS', 'DELTA']
rowna_df = check_rowna(df,supporting_fs)
rowna_df

In [None]:
rowna_df.to_csv(dataprep_path+"NA_Rows.csv",index=False)

In [None]:
#get rows with 100% NA
row100na_df = rowna_df.loc[rowna_df["% of Null"] ==100.00]
row100na_df

In [None]:
len(row100na_df)

In [None]:
row100na_df["SPECKLE"].value_counts()

#### d) Row NA handling

In [None]:
#drop rows with 100% NA (~37k rows)
df = df[~df["VID"].isin(row100na_df["VID"])]
df = df.reset_index(drop=True)
df

In [None]:
df["SPECKLE"].value_counts()

In [None]:
df.shape

In [None]:
#check row na
supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
       'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
       'FB@6261[CLASSHOT]', 'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS',
       'INCOMING', 'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS',
       'OUT_BITS', 'DELTA']
check_rowna(df,supporting_fs)

In [None]:
# df = df.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)

In [None]:
df.shape

In [None]:
#NA imputation
def NA_impute(df,imptype):
    """
    Impute NA in the numeric columns with the column mean or median

    df[DataFrame]:df
    imptype[string]: "mean" to impute data with mean, "median" to impute data with median

    Returns a new DataFrame with the NA of every numeric column filled.
    Non-numeric columns are left as they are. Any other imptype returns df
    unchanged.
    """
    # numeric_only=True: the statistic of a text column is undefined and
    # pandas >= 2.0 raises a TypeError when asked to compute it
    if imptype == "mean":
        df = df.fillna(df.mean(numeric_only=True))
    elif imptype == "median":
        df = df.fillna(df.median(numeric_only=True))
    return df

In [None]:
df = NA_impute(df,imptype="median")
df

In [None]:
check_rowna(df,supporting_fs) #all NA imputed with median

In [None]:
df.to_csv(dataprep_path+"df_blanksrem_NAhandled.csv",index=False)

In [None]:
df.shape

### 3) Handling of negative values (Imputation/conversion)

In [1]:
import pandas as pd
path = "C:/Users/ppirthip/OneDrive - Intel Corporation/Speckle/"
df = pd.read_csv(path+"df_blanksrem_NAhandled.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
df.shape

(15797, 1549)

### a) Converting negative to positive values

In [3]:
### Create df_keep -> Dataset with supporting features, IDV and HVQK token family columns only  
#df with supporting fs
supporting_fs_df = df[["ULT@MIDAS_6261_U1", "SORTLOT", "SORTLOT7", "WAFER",
        "XLOC", "YLOC", "ULT@MIDAS_6261_U2", "IB@6261[CLASSHOT]",
        "FB@6261[CLASSHOT]","TEST RESULTS", "TEST RESULTS BITS", "TR_BITS", "INCOMING",
        "INCOMING BITS", "INC_BITS", "OUTGOING", "OUTGOING BITS", "OUT_BITS","DELTA","VID"]]
#df with IDV and HVQK tokens
cols_to_keep=["IDV", "HVQK"]
IDV_HVQK_df = df[df.columns[df.columns.str.startswith(tuple(cols_to_keep))]]

#Concatenating supporting_fs_df and IDV_HVQK_df along columns
df_keep = pd.concat([supporting_fs_df, IDV_HVQK_df], axis=1)
df_keep.head()

Unnamed: 0,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],FB@6261[CLASSHOT],TEST RESULTS,...,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP02@132110,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP11@132110,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP12@132110,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP21@132110,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP23@132110,HVQK_VMIN_PRE_SCAN_UNCORE_MISC@132110,HVQK_VMIN_PRE_SCAN_UNCORE_PCIE@132110,HVQK_VMIN_PRE_SCAN_UNCORE_RLINK@132110,HVQK_VMIN_PRE_SCAN_UNCORE_SA@132110,HVQK_VMIN_PRE_SCAN_UNCORE_VNN@132110
0,N0293010_594_1_9,N0293010,N029301,594,1,9,H0235270_643_8_13,1,101,B101000,...,0.6101,0.6121,0.6095,0.6136,0.6,0.8164,0.7452,0.62,0.7822,0.6647
1,N0293010_594_3_3,N0293010,N029301,594,3,3,H0235270_643_11_8,1,101,B000100,...,0.6311,0.6332,0.6306,0.6547,0.6233,0.838,0.7467,0.6313,0.7836,0.6859
2,N0293010_594_-3_-5,N0293010,N029301,594,-3,-5,H0235270_643_11_13,1,101,B101000,...,0.6195,0.6211,0.6188,0.6223,0.6119,0.7995,0.7295,0.6,0.7878,0.6729
3,N0293010_594_5_-3,N0293010,N029301,594,5,-3,H0235270_643_11_6,14,1451,B010100,...,0.6381,-5555.0,0.6374,0.6408,0.6305,0.8173,0.7274,0.6186,0.7859,0.6721
4,N0293010_585_-6_-3,N0293010,N029301,585,-6,-3,H0235270_643_14_5,1,101,B001010,...,0.5918,0.593,0.5909,0.5941,-5555.0,0.7872,0.698,0.6,0.7373,0.6721


In [4]:
# Build df_to_convert: df without the supporting features and without the IDV / HVQK token family columns
token_family_cols = [name for name in df.columns if name.startswith(('IDV', 'HVQK'))]
supporting_cols = ["ULT@MIDAS_6261_U1", "SORTLOT", "SORTLOT7", "WAFER",
        "XLOC", "YLOC", "ULT@MIDAS_6261_U2", "IB@6261[CLASSHOT]",
        "FB@6261[CLASSHOT]","TEST RESULTS", "TEST RESULTS BITS", "TR_BITS", "INCOMING",
        "INCOMING BITS", "INC_BITS", "OUTGOING", "OUTGOING BITS", "OUT_BITS","DELTA","VID"]
# a single drop removes both groups at once
df_to_convert = df.drop(columns=token_family_cols + supporting_cols)
df_to_convert.head()

Unnamed: 0,SPECKLE,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCCORE_M01_V1@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,0.034216,0.042544,0.01903,0.019927,0.012756,0.052938,0.005302,0.015012,0.021843,...,-7.9e-07,-7.83e-07,-8.27e-07,-1.8e-07,-2.04e-07,-1.95e-07,-2.24e-07,-7.78e-07,-7.55e-07,-1.72e-07
1,1,0.048602,0.052342,0.017898,0.019097,0.011125,0.062062,0.003939,0.013912,0.057196,...,-1.2e-06,-1.2e-06,-1.22e-06,-2.89e-07,-2.99e-07,-3.08e-07,-3.02e-07,-1.2e-06,-1.2e-06,-3.31e-07
2,1,0.048412,0.051455,0.017952,0.018987,0.011044,0.050722,0.002593,0.009865,0.047841,...,-9.39e-07,-9.47e-07,-9.39e-07,-2.35e-07,-2.28e-07,-2.4e-07,-2.29e-07,-9.43e-07,-9.53e-07,-2.49e-07
3,1,0.048777,0.053612,0.019095,0.020753,0.012835,0.0866,0.002668,0.01301,0.091913,...,-1.11e-06,-1.21e-06,-1.24e-06,-2.83e-07,-2.98e-07,-3.09e-07,-2.9e-07,-1.21e-06,-1.13e-06,-2.99e-07
4,1,0.036453,0.043621,0.019249,0.020645,0.02237,0.079369,0.005981,0.011016,0.060855,...,-6.38e-07,-6.86e-07,-6.6e-07,-1.56e-07,-1.7e-07,-1.63e-07,-1.47e-07,-6.33e-07,-6.49e-07,-1.5e-07


In [5]:
#Convert negative columns in df_to_convert to positive
df_positive = df_to_convert.abs()
df_positive.head()

Unnamed: 0,SPECKLE,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCADLLOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAGSH_CLM_EHV_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAOPI_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132110,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCAPLLOPI_FAD_DIGICKSI0_V1@132150,TPI_SIU_STATIC::SIU_STATIC_AM_K_START_X_X_X_SNR_VNOM_VCCINF_1P05_SIUP_SICC_VCCCORE_M01_V1@132110,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,1,0.034216,0.042544,0.01903,0.019927,0.012756,0.052938,0.005302,0.015012,0.021843,...,7.9e-07,7.83e-07,8.27e-07,1.8e-07,2.04e-07,1.95e-07,2.24e-07,7.78e-07,7.55e-07,1.72e-07
1,1,0.048602,0.052342,0.017898,0.019097,0.011125,0.062062,0.003939,0.013912,0.057196,...,1.2e-06,1.2e-06,1.22e-06,2.89e-07,2.99e-07,3.08e-07,3.02e-07,1.2e-06,1.2e-06,3.31e-07
2,1,0.048412,0.051455,0.017952,0.018987,0.011044,0.050722,0.002593,0.009865,0.047841,...,9.39e-07,9.47e-07,9.39e-07,2.35e-07,2.28e-07,2.4e-07,2.29e-07,9.43e-07,9.53e-07,2.49e-07
3,1,0.048777,0.053612,0.019095,0.020753,0.012835,0.0866,0.002668,0.01301,0.091913,...,1.11e-06,1.21e-06,1.24e-06,2.83e-07,2.98e-07,3.09e-07,2.9e-07,1.21e-06,1.13e-06,2.99e-07
4,1,0.036453,0.043621,0.019249,0.020645,0.02237,0.079369,0.005981,0.011016,0.060855,...,6.38e-07,6.86e-07,6.6e-07,1.56e-07,1.7e-07,1.63e-07,1.47e-07,6.33e-07,6.49e-07,1.5e-07


In [6]:
#check whether there's any negative value left in df_positive
(df_positive < 0).any().any()

False

In [7]:
#Concatenating df_keep and df_positive along columns
df = pd.concat([df_keep, df_positive], axis=1)
df.shape

(15797, 1549)

### b) Imputing negative values

In [8]:
#Checking column for negative values
def check_col_negative(df,value):
    """
    Check each column for one specific (negative) value. Returns Feature, total negative values and % of negative values for each column

    df[DataFrame]: dataframe
    value: the value to look for, e.g. -5555

    Returns a DataFrame with the columns "Feature", "Total Negative Values"
    and "% of Negative Values" (rounded to 2 decimals), holding one row per
    column of df that contains `value` at least once, sorted by
    "% of Negative Values" in descending order.
    """
    # count the occurrences of value in all columns once
    hit_counts = df.isin([value]).sum()
    n_rows = len(df)
    # collect the rows in a list and build the frame once;
    # DataFrame.append no longer exists in pandas >= 2.0
    rows = [
        {'Feature': col, 'Total Negative Values': total_negative, '% of Negative Values': round(total_negative*100/n_rows,2)}
        for col, total_negative in hit_counts.items()
        if total_negative > 0
    ]
    col_negative_df = pd.DataFrame(rows, columns=["Feature", "Total Negative Values", "% of Negative Values"])
    col_negative_df = col_negative_df.sort_values("% of Negative Values", ascending=False)
    return col_negative_df

In [9]:
#Checking column for -5555 values
value = -5555
check_col_negative(df,value)

Unnamed: 0,Feature,Total Negative Values,% of Negative Values
5,HVQK_VMIN_POST_ARRAY_NAC_MBIST_LSACPMCPR@132110,2258,14.29
43,HVQK_VMIN_POST_SCAN_NAC_CPMC@132110,2258,14.29
67,HVQK_VMIN_PRE_SCAN_NAC_CPMC@132110,1715,10.86
6,HVQK_VMIN_POST_ARRAY_NAC_MBIST_LSACPMPKE@132110,1064,6.74
42,HVQK_VMIN_POST_SCAN_NAC_CPMB@132110,1064,6.74
...,...,...,...
70,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP11@132110,356,2.25
72,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP21@132110,326,2.06
71,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP12@132110,318,2.01
69,HVQK_VMIN_PRE_SCAN_UNCORE_LLCSFP02@132110,312,1.98


In [10]:
#Checking column for -999 values
value = -999
check_col_negative(df,value)

Unnamed: 0,Feature,Total Negative Values,% of Negative Values
0,IDV_0101_SVT3GNES12_FULLDIE_CORE_TALL_0650_MED...,870,5.51
37,IDV_0147_NOMNLK12_FULLDIE_CORE_TALL_0650_MED@1...,870,5.51
27,IDV_0137_PNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
28,IDV_0138_PNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
29,IDV_0139_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
30,IDV_0140_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
31,IDV_0141_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
32,IDV_0142_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
33,IDV_0143_NNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
34,IDV_0144_NNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51


In [11]:
#Checking column for -999 values
value = -999
check_col_negative(df,value)

Unnamed: 0,Feature,Total Negative Values,% of Negative Values
0,IDV_0101_SVT3GNES12_FULLDIE_CORE_TALL_0650_MED...,870,5.51
37,IDV_0147_NOMNLK12_FULLDIE_CORE_TALL_0650_MED@1...,870,5.51
27,IDV_0137_PNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
28,IDV_0138_PNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
29,IDV_0139_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
30,IDV_0140_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
31,IDV_0141_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
32,IDV_0142_NSVT3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
33,IDV_0143_NNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51
34,IDV_0144_NNOM3GVTO12_FULLDIE_CORE_TALL_0650_ME...,870,5.51


In [12]:
#Checking column for -9999 values
value = -9999
check_col_negative(df,value)

Unnamed: 0,Feature,Total Negative Values,% of Negative Values
0,HVQK_VMIN_PRE_ARRAY_NAC_MBIST_LSACPMCPR@132110,1037,6.56
3,HVQK_VMIN_PRE_SCAN_NAC_CPMC@132110,318,2.01
2,HVQK_VMIN_PRE_SCAN_NAC_CPMB@132110,145,0.92
1,HVQK_VMIN_PRE_ARRAY_NAC_MBIST_LSACPMPKE@132110,94,0.6


In [13]:
#Checking column for -555 values
value = -555
check_col_negative(df,value)

Unnamed: 0,Feature,Total Negative Values,% of Negative Values


In [14]:
#Negative value imputation
def Negative_value_impute(df,value,imptype):
    """
    Impute Negative value (can choose negative value to impute)

    df[DataFrame]:df
    value: the value to replace, e.g. -5555
    imptype[string]: "mean" to impute data with mean, "median" to impute data with median

    Returns a new DataFrame in which every occurrence of `value` in a
    numeric column is replaced by the mean/median of that column. The
    statistic is computed from the other entries only, so the value being
    replaced does not pull the replacement towards itself. Columns that hold
    nothing but `value` (no statistic available) and non-numeric columns are
    left as they are. Any other imptype returns df unchanged.
    """
    if imptype not in ("mean", "median"):
        return df
    # hide the value to replace before computing the statistic
    observed = df.mask(df == value)
    # numeric_only=True: the statistic of a text column is undefined and
    # pandas >= 2.0 raises a TypeError when asked to compute it
    if imptype == "mean":
        fill_values = observed.mean(numeric_only=True)
    else:
        fill_values = observed.median(numeric_only=True)
    # no usable statistic -> do not touch that column
    fill_values = fill_values.dropna()
    # scalar -> Series replacement is applied column by column
    df = df.replace(value, fill_values)
    return df

In [15]:
#Negative -5555 imputation with median
df = Negative_value_impute(df,value=-5555,imptype="median")
df

Unnamed: 0,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],FB@6261[CLASSHOT],TEST RESULTS,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,N0293010_594_1_9,N0293010,N029301,594,1,9,H0235270_643_8_13,1,101,B101000,...,7.900000e-07,7.830000e-07,8.270000e-07,1.800000e-07,2.040000e-07,1.950000e-07,2.240000e-07,7.780000e-07,7.550000e-07,1.720000e-07
1,N0293010_594_3_3,N0293010,N029301,594,3,3,H0235270_643_11_8,1,101,B000100,...,1.200000e-06,1.200000e-06,1.220000e-06,2.890000e-07,2.990000e-07,3.080000e-07,3.020000e-07,1.200000e-06,1.200000e-06,3.310000e-07
2,N0293010_594_-3_-5,N0293010,N029301,594,-3,-5,H0235270_643_11_13,1,101,B101000,...,9.390000e-07,9.470000e-07,9.390000e-07,2.350000e-07,2.280000e-07,2.400000e-07,2.290000e-07,9.430000e-07,9.530000e-07,2.490000e-07
3,N0293010_594_5_-3,N0293010,N029301,594,5,-3,H0235270_643_11_6,14,1451,B010100,...,1.110000e-06,1.210000e-06,1.240000e-06,2.830000e-07,2.980000e-07,3.090000e-07,2.900000e-07,1.210000e-06,1.130000e-06,2.990000e-07
4,N0293010_585_-6_-3,N0293010,N029301,585,-6,-3,H0235270_643_14_5,1,101,B001010,...,6.380000e-07,6.860000e-07,6.600000e-07,1.560000e-07,1.700000e-07,1.630000e-07,1.470000e-07,6.330000e-07,6.490000e-07,1.500000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15792,N0422200_050_-3_1,N0422200,N042220,50,-3,1,H0311010_568_4_-11,1,101,0,...,7.840000e-07,7.520000e-07,7.590000e-07,1.610000e-07,1.760000e-07,1.860000e-07,1.970000e-07,7.280000e-07,7.510000e-07,1.860000e-07
15793,N0422200_050_0_4,N0422200,N042220,50,0,4,H0311010_568_4_-8,1,101,0,...,8.900000e-07,8.520000e-07,8.660000e-07,2.130000e-07,2.040000e-07,2.200000e-07,2.290000e-07,8.970000e-07,9.040000e-07,2.050000e-07
15794,N0422200_050_0_5,N0422200,N042220,50,0,5,H0311010_568_4_-9,1,101,0,...,7.590000e-07,7.500000e-07,7.660000e-07,1.910000e-07,1.810000e-07,1.960000e-07,1.950000e-07,8.120000e-07,8.170000e-07,1.990000e-07
15795,N0422200_050_-3_0,N0422200,N042220,50,-3,0,H0311010_568_4_-12,1,101,0,...,8.870000e-07,8.530000e-07,8.720000e-07,2.170000e-07,1.960000e-07,2.090000e-07,2.180000e-07,8.270000e-07,8.250000e-07,2.320000e-07


In [16]:
#Negative -999 imputation with median
df = Negative_value_impute(df,value=-999,imptype="median")
df

Unnamed: 0,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],FB@6261[CLASSHOT],TEST RESULTS,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,N0293010_594_1_9,N0293010,N029301,594,1,9,H0235270_643_8_13,1,101,B101000,...,7.900000e-07,7.830000e-07,8.270000e-07,1.800000e-07,2.040000e-07,1.950000e-07,2.240000e-07,7.780000e-07,7.550000e-07,1.720000e-07
1,N0293010_594_3_3,N0293010,N029301,594,3,3,H0235270_643_11_8,1,101,B000100,...,1.200000e-06,1.200000e-06,1.220000e-06,2.890000e-07,2.990000e-07,3.080000e-07,3.020000e-07,1.200000e-06,1.200000e-06,3.310000e-07
2,N0293010_594_-3_-5,N0293010,N029301,594,-3,-5,H0235270_643_11_13,1,101,B101000,...,9.390000e-07,9.470000e-07,9.390000e-07,2.350000e-07,2.280000e-07,2.400000e-07,2.290000e-07,9.430000e-07,9.530000e-07,2.490000e-07
3,N0293010_594_5_-3,N0293010,N029301,594,5,-3,H0235270_643_11_6,14,1451,B010100,...,1.110000e-06,1.210000e-06,1.240000e-06,2.830000e-07,2.980000e-07,3.090000e-07,2.900000e-07,1.210000e-06,1.130000e-06,2.990000e-07
4,N0293010_585_-6_-3,N0293010,N029301,585,-6,-3,H0235270_643_14_5,1,101,B001010,...,6.380000e-07,6.860000e-07,6.600000e-07,1.560000e-07,1.700000e-07,1.630000e-07,1.470000e-07,6.330000e-07,6.490000e-07,1.500000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15792,N0422200_050_-3_1,N0422200,N042220,50,-3,1,H0311010_568_4_-11,1,101,0,...,7.840000e-07,7.520000e-07,7.590000e-07,1.610000e-07,1.760000e-07,1.860000e-07,1.970000e-07,7.280000e-07,7.510000e-07,1.860000e-07
15793,N0422200_050_0_4,N0422200,N042220,50,0,4,H0311010_568_4_-8,1,101,0,...,8.900000e-07,8.520000e-07,8.660000e-07,2.130000e-07,2.040000e-07,2.200000e-07,2.290000e-07,8.970000e-07,9.040000e-07,2.050000e-07
15794,N0422200_050_0_5,N0422200,N042220,50,0,5,H0311010_568_4_-9,1,101,0,...,7.590000e-07,7.500000e-07,7.660000e-07,1.910000e-07,1.810000e-07,1.960000e-07,1.950000e-07,8.120000e-07,8.170000e-07,1.990000e-07
15795,N0422200_050_-3_0,N0422200,N042220,50,-3,0,H0311010_568_4_-12,1,101,0,...,8.870000e-07,8.530000e-07,8.720000e-07,2.170000e-07,1.960000e-07,2.090000e-07,2.180000e-07,8.270000e-07,8.250000e-07,2.320000e-07


In [17]:
#Negative -9999 imputation with median
df = Negative_value_impute(df,value=-9999,imptype="median")
df

Unnamed: 0,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],FB@6261[CLASSHOT],TEST RESULTS,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,N0293010_594_1_9,N0293010,N029301,594,1,9,H0235270_643_8_13,1,101,B101000,...,7.900000e-07,7.830000e-07,8.270000e-07,1.800000e-07,2.040000e-07,1.950000e-07,2.240000e-07,7.780000e-07,7.550000e-07,1.720000e-07
1,N0293010_594_3_3,N0293010,N029301,594,3,3,H0235270_643_11_8,1,101,B000100,...,1.200000e-06,1.200000e-06,1.220000e-06,2.890000e-07,2.990000e-07,3.080000e-07,3.020000e-07,1.200000e-06,1.200000e-06,3.310000e-07
2,N0293010_594_-3_-5,N0293010,N029301,594,-3,-5,H0235270_643_11_13,1,101,B101000,...,9.390000e-07,9.470000e-07,9.390000e-07,2.350000e-07,2.280000e-07,2.400000e-07,2.290000e-07,9.430000e-07,9.530000e-07,2.490000e-07
3,N0293010_594_5_-3,N0293010,N029301,594,5,-3,H0235270_643_11_6,14,1451,B010100,...,1.110000e-06,1.210000e-06,1.240000e-06,2.830000e-07,2.980000e-07,3.090000e-07,2.900000e-07,1.210000e-06,1.130000e-06,2.990000e-07
4,N0293010_585_-6_-3,N0293010,N029301,585,-6,-3,H0235270_643_14_5,1,101,B001010,...,6.380000e-07,6.860000e-07,6.600000e-07,1.560000e-07,1.700000e-07,1.630000e-07,1.470000e-07,6.330000e-07,6.490000e-07,1.500000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15792,N0422200_050_-3_1,N0422200,N042220,50,-3,1,H0311010_568_4_-11,1,101,0,...,7.840000e-07,7.520000e-07,7.590000e-07,1.610000e-07,1.760000e-07,1.860000e-07,1.970000e-07,7.280000e-07,7.510000e-07,1.860000e-07
15793,N0422200_050_0_4,N0422200,N042220,50,0,4,H0311010_568_4_-8,1,101,0,...,8.900000e-07,8.520000e-07,8.660000e-07,2.130000e-07,2.040000e-07,2.200000e-07,2.290000e-07,8.970000e-07,9.040000e-07,2.050000e-07
15794,N0422200_050_0_5,N0422200,N042220,50,0,5,H0311010_568_4_-9,1,101,0,...,7.590000e-07,7.500000e-07,7.660000e-07,1.910000e-07,1.810000e-07,1.960000e-07,1.950000e-07,8.120000e-07,8.170000e-07,1.990000e-07
15795,N0422200_050_-3_0,N0422200,N042220,50,-3,0,H0311010_568_4_-12,1,101,0,...,8.870000e-07,8.530000e-07,8.720000e-07,2.170000e-07,1.960000e-07,2.090000e-07,2.180000e-07,8.270000e-07,8.250000e-07,2.320000e-07


In [18]:
#Negative -555 imputation with median
df = Negative_value_impute(df,value=-555,imptype="median")
df

Unnamed: 0,ULT@MIDAS_6261_U1,SORTLOT,SORTLOT7,WAFER,XLOC,YLOC,ULT@MIDAS_6261_U2,IB@6261[CLASSHOT],FB@6261[CLASSHOT],TEST RESULTS,...,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDCLK1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA0,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_SVIDDATA1,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_DIS_STRAP_N,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TAP_ODT_EN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_AGENT,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_TXT_PLTEN,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSCL,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VPPSMBUSSDA,SIO_LKG_X::LO1P05_X_LKG_K_END_X_X_VMAX_X_1MA_PC1@132110_VSENSEPMAX
0,N0293010_594_1_9,N0293010,N029301,594,1,9,H0235270_643_8_13,1,101,B101000,...,7.900000e-07,7.830000e-07,8.270000e-07,1.800000e-07,2.040000e-07,1.950000e-07,2.240000e-07,7.780000e-07,7.550000e-07,1.720000e-07
1,N0293010_594_3_3,N0293010,N029301,594,3,3,H0235270_643_11_8,1,101,B000100,...,1.200000e-06,1.200000e-06,1.220000e-06,2.890000e-07,2.990000e-07,3.080000e-07,3.020000e-07,1.200000e-06,1.200000e-06,3.310000e-07
2,N0293010_594_-3_-5,N0293010,N029301,594,-3,-5,H0235270_643_11_13,1,101,B101000,...,9.390000e-07,9.470000e-07,9.390000e-07,2.350000e-07,2.280000e-07,2.400000e-07,2.290000e-07,9.430000e-07,9.530000e-07,2.490000e-07
3,N0293010_594_5_-3,N0293010,N029301,594,5,-3,H0235270_643_11_6,14,1451,B010100,...,1.110000e-06,1.210000e-06,1.240000e-06,2.830000e-07,2.980000e-07,3.090000e-07,2.900000e-07,1.210000e-06,1.130000e-06,2.990000e-07
4,N0293010_585_-6_-3,N0293010,N029301,585,-6,-3,H0235270_643_14_5,1,101,B001010,...,6.380000e-07,6.860000e-07,6.600000e-07,1.560000e-07,1.700000e-07,1.630000e-07,1.470000e-07,6.330000e-07,6.490000e-07,1.500000e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15792,N0422200_050_-3_1,N0422200,N042220,50,-3,1,H0311010_568_4_-11,1,101,0,...,7.840000e-07,7.520000e-07,7.590000e-07,1.610000e-07,1.760000e-07,1.860000e-07,1.970000e-07,7.280000e-07,7.510000e-07,1.860000e-07
15793,N0422200_050_0_4,N0422200,N042220,50,0,4,H0311010_568_4_-8,1,101,0,...,8.900000e-07,8.520000e-07,8.660000e-07,2.130000e-07,2.040000e-07,2.200000e-07,2.290000e-07,8.970000e-07,9.040000e-07,2.050000e-07
15794,N0422200_050_0_5,N0422200,N042220,50,0,5,H0311010_568_4_-9,1,101,0,...,7.590000e-07,7.500000e-07,7.660000e-07,1.910000e-07,1.810000e-07,1.960000e-07,1.950000e-07,8.120000e-07,8.170000e-07,1.990000e-07
15795,N0422200_050_-3_0,N0422200,N042220,50,-3,0,H0311010_568_4_-12,1,101,0,...,8.870000e-07,8.530000e-07,8.720000e-07,2.170000e-07,1.960000e-07,2.090000e-07,2.180000e-07,8.270000e-07,8.250000e-07,2.320000e-07


In [19]:
value=-5555 #insert negative value to check 
check_col_negative(df,value) #all negative values imputed with median

Unnamed: 0,Feature,Total Negative Values,% of Negative Values


In [20]:
df.to_csv(path+"df_NegativeValueHandled.csv",index=False)

In [21]:
#To check if all negative values have been converted
#Filter only numeric data
df_numeric = df._get_numeric_data()
#Filter columns with negative values
df_negative = df_numeric.loc[:,(df_numeric < 0).any()]
df_negative

Unnamed: 0,XLOC,YLOC
0,1,9
1,3,3
2,-3,-5
3,5,-3
4,-6,-3
...,...,...
15792,-3,1
15793,0,4
15794,0,5
15795,-3,0


### 4) Check and remove unary(single value) columns

In [None]:
import pandas as pd
path = "C:/Users/ppirthip/OneDrive - Intel Corporation/Speckle/"
df = pd.read_csv(path+"df_NegativeValueHandled.csv")

In [None]:
def unary(df):
    """
    Checks for unary (single value) columns. If there are unary columns, they are
    printed and removed from the df.
    df[DataFrame]: input dataframe

    Returns the dataframe without its unary columns. When no unary column is found
    the input dataframe is returned as is (not None), so `df = unary(df)` is always safe.
    """
    #a column is unary when it holds exactly one distinct value (NaN counts as a value)
    unarycolumns = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
    if not unarycolumns:
        print("There are no unary columns!")
        return df

    print("The unary column are:",unarycolumns)
    df = df.drop(unarycolumns,axis=1)
    print("Unary columns dropped!")
    return df

In [None]:
unary(df)

In [None]:
# supporting_fs = ['SPECKLE','VID','ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
#        'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
#        'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
#        'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
#        'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
# sf_df = df[supporting_fs]
# df1 = df.drop(supporting_fs,axis=1) 
# df2 = df1.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)
# df_neg = df2.loc[:,((df2 == -555).any()) | ((df2 == -999).any())]
# df_neg_sf = pd.concat([sf_df,df_neg],axis=1)
# df_neg_sf

In [None]:
# df1.columns.difference(df2.columns)

In [None]:
# df1['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110']

In [None]:
# df2 = df1.drop(['PTH_POWER::POWER_X_SCREEN_E_BEGIN_X_X_X_X_CALC_PP_CDYN_INDICATOR_CDYN_DATA@132110'],axis=1)
# df_neg = df2.loc[:,((df2 == -555).any()) | ((df2 == -999).any())]

In [None]:
# df_neg_sf = pd.concat([sf_df,df_neg],axis=1)
# df_neg_sf

In [None]:
# df_neg_sf.to_csv(path+"/DataPreparation/Negative_Cols_555_999.csv",index=False)

### 5) Split data into train and validation datasets 

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter
def randomsamp(df,val_size):
    """
    Split whole dataset into train and validation using random sampling,
    stratified on the "SPECKLE" column (random_state fixed to 42)
    Returns X_train, X_val, y_train, y_val, each with a fresh 0..n-1 index
    df[DataFrame]: input dataframe, must contain the target column "SPECKLE"
    val_size[float/int]: passed to train_test_split as test_size
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split.
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size
    """
    X = df.drop(["SPECKLE"],axis=1)
    y = df["SPECKLE"]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = val_size,random_state=42,stratify= y)

    #isinstance (not type(...)==float) so numpy floats are reported as a proportion too
    if isinstance(val_size, float):
        print("Train-val split completed with",(1-val_size)*100,"-",val_size*100,"split in train-val")
    print("Shape of X_train is:", X_train.shape)
    print("Shape of X_val is:",X_val.shape)
    print("Shape of y_train is:",y_train.shape)
    print("Shape of y_val is:",y_val.shape)
    print("Distribution of y_train:",Counter(y_train))
    print("Distribution of y_val:",Counter(y_val))

    #drop the shuffled original row labels so all four outputs are indexed 0..n-1
    X_train = X_train.reset_index(drop=True)
    X_val = X_val.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)

    return X_train, X_val, y_train, y_val

def targetrandomsamp(df,speckle_test_size,nonspeckle_test_size):
    """
    Train/validation split done separately per class and then merged.
    The rows with SPECKLE == 1 (speckle) and SPECKLE == 0 (non-speckle) are each split
    with randomsamp; the matching pieces are then concatenated (speckle rows first)
    to return X_train, X_val, y_train, y_val

    df[DataFrame]: input dataframe

    speckle_test_size[float/int]: Val size for speckle
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split.
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size

    nonspeckle_test_size[float/int]: Val size for non-speckle
    -If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the val split.
    -If int, represents the absolute number of val samples.
    -Train size is complement of val size

    """
    target = df["SPECKLE"]

    #each call returns (X_train, X_val, y_train, y_val) for one class
    print("For speckle data:")
    speckle_parts = randomsamp(df.loc[target == 1],val_size=speckle_test_size)
    print("\nFor non-speckle data:")
    nonspeckle_parts = randomsamp(df.loc[target == 0],val_size=nonspeckle_test_size)

    #merge piece by piece; ignore_index=True already yields a clean 0..n-1 index
    X_train, X_val, y_train, y_val = (
        pd.concat([speckle_part, nonspeckle_part], ignore_index=True)
        for speckle_part, nonspeckle_part in zip(speckle_parts, nonspeckle_parts)
    )

    print("\nFinal dataset:")
    print("Distribution of y_train:",Counter(y_train))
    print("Distribution of y_val:",Counter(y_val))
    return X_train, X_val, y_train, y_val

In [None]:
df["SPECKLE"].value_counts()

In [None]:
#train-test split on whole data
X_train, X_val, y_train,y_val = randomsamp(df,val_size=0.3)

In [None]:
#train-test split on speckle/non-speckle then merge
X_train, X_val, y_train, y_val = targetrandomsamp(df,speckle_test_size=500,nonspeckle_test_size=2000)

### 6) Remove supporting features

In [None]:
#supporting columns that are set aside and removed from the feature matrices
supporting_fs = [
    'VID', 'ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
    'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
    'FB@6261[CLASSHOT]', 'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS',
    'INCOMING', 'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS',
    'OUT_BITS', 'DELTA',
]
#keep the supporting fs of train and validation for later reference
X_train_sf, X_val_sf = X_train[supporting_fs], X_val[supporting_fs]
#remove the supporting fs from train and validation
X_train, X_val = X_train.drop(columns=supporting_fs), X_val.drop(columns=supporting_fs)

### 7) Scaling the data (Standardization/Normalization)

In [None]:
#To scale X train and X validation (test) data using various scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

def scale_data(X_train,X_val,scaler_type):
    """
    Scaling X train and validation with standardization or normalization.
    The scaler is fitted on X_train only and then applied to X_val.
    params:
    X_train[DataFrame]: input X train
    X_val[DataFrame]: input X validation (test)
    scaler_type[string]: input scaling method
    - "Standardization" for Standard Scaler
    - "Normalization" for Min Max Scaler

    Returns X_train_scaled, X_val_scaled as DataFrames with the original column
    names and a fresh 0..n-1 index.
    Raises ValueError for any other scaler_type.
    """
    if scaler_type == "Standardization":
        scaler = StandardScaler()
    elif scaler_type == "Normalization":
        scaler = MinMaxScaler()
    else:
        #fail with a clear message instead of an unbound `scaler` further down
        raise ValueError(
            'scaler_type must be "Standardization" or "Normalization", got %r' % (scaler_type,)
        )

    #fit on train only; validation is transformed with the train statistics
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),columns= X_train.columns)
    X_val_scaled = pd.DataFrame(scaler.transform(X_val),columns= X_val.columns)

    return X_train_scaled, X_val_scaled

In [None]:
#X train-test scaled using Min max Scaler
X_train_scaled, X_val_scaled = scale_data(X_train,X_val,scaler_type = "Normalization")

### 8) Handling highly correlated features (features vs features/features vs target)

#### a) features vs features

In [None]:
import time
import numpy as np
start = time.time()

#To create correlation matrix for X_train (features)
correlation_matrix = X_train_scaled.corr().abs()

#To select the upper triangular matrix (without the diagonal, k=1) from the correlation matrix of features
#the builtin bool is used as dtype because the np.bool alias was removed from NumPy
upper_tri_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_tri = correlation_matrix.where(upper_tri_mask)
display(upper_tri)

#a feature is flagged when it correlates above 0.95 with any feature that comes before it
highly_corr_features_95 = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]

end = time.time()
print ("Time elapsed:", end - start)

In [None]:
len(highly_corr_features_95)

In [None]:
#Display correlation matrix pairs (sorted in descending order)
correlation_matrix_stack = X_train_scaled.corr().abs()

#the matrix is symmetric so we need to extract upper triangle matrix without diagonal (k = 1)
#the builtin bool is used as dtype because the np.bool alias was removed from NumPy
upper_tri_stack_mask = np.triu(np.ones(correlation_matrix_stack.shape, dtype=bool), k=1)

upper_tri_stack = (correlation_matrix_stack.where(upper_tri_stack_mask)
                  .stack()
                  .sort_values(ascending=False))

#first element of upper_tri_stack series is the pair with the biggest correlation
display(upper_tri_stack)

In [None]:
highly_corr_features_df = pd.DataFrame (highly_corr_features_95, columns = ['Feature'])
#highly_corr_features_df.to_csv(path+"/DataPreparation/highly_corr_features_95.csv",index=False)

In [None]:
#Dropping the columns with high correlation
#the same columns are dropped from validation so train and validation keep identical features
X_train_scaled = X_train_scaled.drop(highly_corr_features_95, axis=1)
X_val_scaled = X_val_scaled.drop(highly_corr_features_95, axis=1)
X_train_scaled

#### b) features vs target

In [None]:
#To create correlation matrix for X_train (features) and y_train (target)
correlation_matrix_target = X_train_scaled.corrwith(y_train, axis = 0).abs()
display(correlation_matrix_target)

In [None]:
correlation_matrix_target_df = pd.DataFrame(correlation_matrix_target)
correlation_matrix_target_df

In [None]:
# adding column name to the respective columns
correlation_matrix_target_df.columns =['Variables']
display(correlation_matrix_target_df)

### B) Sampling to address class imbalance

In [None]:
Counter(y_train)

In [None]:
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

#SMOTE
def SMOTE_sampling(X_train_scaled,y_train,over_amt,under_amt=None):
    """
    Can choose SMOTE or SMOTE + random undersampling (both with random_state=42)
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[Series] : target in train data (1 = speckle, 0 = nonspeckle)
    over_amt [int] : desired number of speckle (label 1) samples after SMOTE,
                     passed as sampling_strategy={1:over_amt}
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done

    Returns the resampled X_s, y_s
    """
    before = Counter(y_train)
    print("Class distribution before sampling:", before)
    print("Ratio of class distribution before sampling :",round(before[0]/before[1],2))

    #oversample the speckle class
    sm = SMOTE(sampling_strategy={1:over_amt},random_state=42)
    X_s, y_s = sm.fit_resample(X_train_scaled, y_train)

    if under_amt is not None: #SMOTE + random undersampling
        #the samplers are chained directly rather than through the global `Pipeline` name,
        #which this notebook later rebinds to sklearn.pipeline.Pipeline (no fit_resample)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        X_s, y_s = under.fit_resample(X_s, y_s)

    after = Counter(y_s)
    print("Class distribution after sampling:", after)
    print("Ratio of class distribution after sampling :",round(after[0]/after[1],2))

    return X_s, y_s

#BorderlineSMOTE
def BorderlineSMOTE_sampling(X_train_scaled,y_train,over_amt,under_amt=None):
    """
    Can choose BorderlineSMOTE or BorderlineSMOTE + random undersampling (both with random_state=42)
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[Series] : target in train data (1 = speckle, 0 = nonspeckle)
    over_amt [int] : desired number of speckle (label 1) samples after BorderlineSMOTE,
                     passed as sampling_strategy={1:over_amt}
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done

    Returns the resampled X_bs, y_bs
    """
    before = Counter(y_train)
    print("Class distribution before sampling:", before)
    print("Ratio of class distribution before sampling :",round(before[0]/before[1],2))

    #oversample the speckle class
    bsm = BorderlineSMOTE(sampling_strategy={1:over_amt},random_state=42)
    X_bs, y_bs = bsm.fit_resample(X_train_scaled, y_train)

    if under_amt is not None: #BorderlineSMOTE + random undersampling
        #the samplers are chained directly rather than through the global `Pipeline` name,
        #which this notebook later rebinds to sklearn.pipeline.Pipeline (no fit_resample)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        X_bs, y_bs = under.fit_resample(X_bs, y_bs)

    after = Counter(y_bs)
    print("Class distribution after sampling:", after)
    print("Ratio of class distribution after sampling :",round(after[0]/after[1],2))

    return X_bs, y_bs

#ADASYN
def ADASYN_sampling(X_train_scaled,y_train,over_amt,under_amt=None):
    """
    Can choose ADASYN or ADASYN + random undersampling (both with random_state=42)
    X_train_scaled[DataFrame]: X_train after scaling
    y_train[Series] : target in train data (1 = speckle, 0 = nonspeckle)
    over_amt [int] : desired number of speckle (label 1) samples after ADASYN,
                     passed as sampling_strategy={1:over_amt}
    under_amt[int/None]: int - amount of nonspeckle data after undersampling
                         None - No undersampling is done

    Returns the resampled X_a, y_a
    """
    before = Counter(y_train)
    print("Class distribution before sampling:", before)
    print("Ratio of class distribution before sampling :",round(before[0]/before[1],2))

    #oversample the speckle class
    ad = ADASYN(sampling_strategy={1:over_amt},random_state=42)
    X_a, y_a = ad.fit_resample(X_train_scaled, y_train)

    if under_amt is not None: #ADASYN + random undersampling
        #the samplers are chained directly rather than through the global `Pipeline` name,
        #which this notebook later rebinds to sklearn.pipeline.Pipeline (no fit_resample)
        under = RandomUnderSampler(sampling_strategy={0:under_amt},random_state=42)
        X_a, y_a = under.fit_resample(X_a, y_a)

    after = Counter(y_a)
    print("Class distribution after sampling:", after)
    print("Ratio of class distribution after sampling :",round(after[0]/after[1],2))

    return X_a, y_a

In [None]:
#SMOTE
X_s, y_s = SMOTE_sampling(X_train_scaled,y_train,over_amt=2000,under_amt=2000)      

In [None]:
#BorderlineSMOTE
X_bs, y_bs = BorderlineSMOTE_sampling(X_train_scaled,y_train,over_amt=2000,under_amt=2000)      

In [None]:
#ADASYN
X_a, y_a = ADASYN_sampling(X_train_scaled,y_train,over_amt=2000,under_amt=None)      

In [None]:
#create new column for target
# import numpy as np
# df["SPECKLE"] = np.where(df["DELTA"]==0,0,1)
# cols = df.columns.tolist()
# cols = [cols[-1]]+cols[:-1] #move speckle col to the front
# df = df.reindex(columns=cols)

In [None]:
# df.to_csv(path+"SNR_R2_ww35.2_Speckle.csv",index=False)

In [None]:
# supporting_fs = ['ULT@MIDAS_6261_U1', 'SORTLOT', 'SORTLOT7', 'WAFER',
#        'XLOC', 'YLOC', 'ULT@MIDAS_6261_U2', 'IB@6261[CLASSHOT]',
#        'FB@6261[CLASSHOT]','ARR_SPECKLE_SCREEN::PHYTON_X_USERFUNC_K_END_X_X_X_X_PBISTSPECKLE_INDICATOR::DUTCPUBAD_1@6261[CLASSHOT]',
#        'TEST RESULTS', 'TEST RESULTS BITS', 'TR_BITS', 'INCOMING',
#        'INCOMING BITS', 'INC_BITS', 'OUTGOING', 'OUTGOING BITS', 'OUT_BITS','DELTA']
# df1 = df.drop(supporting_fs,axis=1) 
# df1.head()

### 3) Feature Selection

In [None]:
#make folder for featureselection if it doesn't exist
import os
featuresel_path = path + 'FeatureSelection/' 
if not os.path.exists(featuresel_path):
    os.makedirs(featuresel_path)
    print("FeatureSelection folder created!")

#### a) ANOVA

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

def anova_fs(X_train_scaled,y_train,num_top_fs="all"):
    """
    Anova (f_classif) feature selection.
    Returns the fitted SelectKBest object and a dataframe of the selected features
    X_train_scaled[df]: X of train data
    y_train[df]: y of train data (target)
    num_top_fs[int/"all"]: int -> specify the number of top fs to be returned,
                                  returns top fs and their scores in descending order
                           "all" -> returns all fs and their scores in descending order (default)

    The returned dataframe has the columns Feature, Scores, Scores_Scaled (score divided
    by the sum of the returned scores) and Scores_Scaled_Cumulative % (running total in %).
    """
    #Select k features with top Fisher scores
    anova = SelectKBest(f_classif, k = num_top_fs)
    anova.fit(X_train_scaled, y_train)

    #boolean mask of the features kept by the selector
    selected = anova.get_support()

    #dataframe with feature name and score in descending order
    ns_df = pd.DataFrame({
        'Feature': X_train_scaled.columns.values[selected],
        'Scores': anova.scores_[selected],
    })
    ns_df = ns_df.sort_values(by=['Scores'], ascending=False)
    ns_df['Scores_Scaled'] = ns_df['Scores']/ns_df['Scores'].sum()
    ns_df['Scores_Scaled_Cumulative %'] = ns_df['Scores_Scaled'].cumsum()*100
    ns_df = ns_df.reset_index(drop=True)

    return anova, ns_df

In [None]:
anova,ns_df = anova_fs(X_train_scaled,y_train)
print(anova)
ns_df

In [None]:
ns_df.to_csv(featuresel_path+"Anova_Fs.csv",index=False)

In [None]:
#take top n important features only in train and validation
n=200
features = ns_df.head(n)[["Feature"]] #choose the top n important features
X_train_scaled = X_train_scaled[features["Feature"]] #filter X_train_scaled with top n important features
X_val_scaled = X_val_scaled[features["Feature"]] #filter X_val_scaled with top n important features

### b) Boruta

In [None]:
X_train_scaled.shape

In [None]:
y_train.value_counts()

In [None]:
%%time

# Build the grid search on random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

param_grid = {
#     'n_estimators': [100, 300], # number of trees, default is 100 -> not used as Boruta will determine
    'max_depth': [3,5,7], #max_depth of the tree is advised on the Boruta Github page to be between 3 to 7
    #'sqrt' is what 'auto' meant for a classifier; 'auto' is no longer accepted by current scikit-learn
    'max_features': ['sqrt','log2'], #number of features to consider when looking for the best split
    'min_samples_leaf': [1,100], # minimum sample leaf (decision tree end node) size, default is 1
    'class_weight': [None,'balanced'], #weights for unbalanced data
    'random_state': [42]
}

rf_model = RandomForestClassifier()
grid = GridSearchCV(estimator=rf_model, param_grid=param_grid,scoring='balanced_accuracy',n_jobs = -1,verbose=2,cv = 5)
grid_results = grid.fit(X_train_scaled,y_train)

print("Best parameters:", grid_results.best_params_)

#model with best parameters
#stored as best_rf_model / rf_fs_importance because the cells that follow
#(RandomForest.csv export, BorutaPy, Boruta importance table) read those names
best_rf_model = grid_results.best_estimator_
model = best_rf_model #previous name kept for any cell still using it
print(model)

rf_fs_importance = pd.DataFrame(list(zip(X_train_scaled.columns, best_rf_model.feature_importances_)), columns=['Feature', 'Importance']).sort_values(by=['Importance'], ascending=False)
fs_importance = rf_fs_importance #previous name kept for any cell still using it
display(fs_importance)

In [None]:
results_df = pd.DataFrame(grid_results.cv_results_)
results_df= results_df.drop(["mean_fit_time","std_fit_time","mean_score_time","std_score_time","split0_test_score","split1_test_score","split2_test_score","split3_test_score","split4_test_score"],axis=1)
results_df

In [None]:
rf_fs_importance.to_csv(featuresel_path+"RandomForest.csv",index=False)

In [None]:
# rf_fs_importance["Importance"].sum()

In [None]:
%%time
# Creating a BorutaPy object with RandomForestClassifier as the estimator and ranking the features
from boruta import BorutaPy
boruta_selector = BorutaPy(best_rf_model, n_estimators='auto', verbose=2, perc = 90,max_iter = 100, random_state=42)                                                      
boruta_selector.fit(np.array(X_train_scaled), np.array(y_train))

In [None]:
print(boruta_selector)
print("No. of significant features: ", boruta_selector.n_features_) 
selected_boruta_fs = pd.DataFrame({'Feature':list(X_train_scaled.columns),'Ranking':boruta_selector.ranking_}).sort_values(by='Ranking')
selected_boruta_fs = selected_boruta_fs[selected_boruta_fs['Ranking']==1]
selected_boruta_fs = selected_boruta_fs[["Feature"]]
selected_boruta_fs_imp = rf_fs_importance.loc[rf_fs_importance['Feature'].isin(selected_boruta_fs['Feature']),:]
selected_boruta_fs_imp

In [None]:
selected_boruta_fs_imp.to_csv(featuresel_path+"Boruta.csv",index=False)

In [None]:
#take important features only in train and validation
X_train_scaled = X_train_scaled[selected_boruta_fs_imp["Feature"]]
X_val_scaled = X_val_scaled[selected_boruta_fs_imp["Feature"]]

### c) Random Forest

In [None]:
X_train_scaled.shape

In [None]:
y_train.value_counts()

In [None]:
# Build the grid search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

start = time.time()

param_grid = {
    'n_estimators': [100, 300], # number of trees, default is 100
    #'sqrt' is what 'auto' meant for a classifier; 'auto' is no longer accepted by current scikit-learn
    'max_features': ['sqrt','log2'], #number of features to consider when looking for the best split
    'min_samples_leaf': [1,100], # minimum sample leaf (decision tree end node) size, default is 1
    'class_weight': [None,'balanced'], #weights for unbalanced data
    'random_state': [42]
}

# Grid search on random forest (5-fold CV, scored with balanced accuracy)
rf_model = RandomForestClassifier()
grid = GridSearchCV(estimator=rf_model, param_grid=param_grid,scoring='balanced_accuracy',n_jobs = -1,verbose=2,cv = 5)
grid_results = grid.fit(X_train_scaled,y_train)


# # Print the time spend and number of models ran
# print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % ((time.time() - start), len(grid_results.cv_results_['params'])))

end = time.time()
print ("Time elapsed:", end - start)

In [None]:
print("Best parameters:", grid_results.best_params_)
#model with best parameters
best_rf_model = grid_results.best_estimator_ 
best_rf_model

In [None]:
rf_fs_importance= pd.DataFrame(list(zip(X_train_scaled.columns, best_rf_model.feature_importances_)), columns=['Feature', 'Importance']).sort_values(by=['Importance'], ascending=False)  
rf_fs_importance

In [None]:
results_df = pd.DataFrame(grid_results.cv_results_)
results_df= results_df.drop(["mean_fit_time","std_fit_time","mean_score_time","std_score_time","split0_test_score","split1_test_score","split2_test_score","split3_test_score","split4_test_score"],axis=1)
results_df

### d) XGBoost

In [None]:
# Build the grid search
#time and GridSearchCV are imported here so the cell does not depend on the Random Forest cell having run
import time
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

start = time.time()

param_grid = {
    'n_estimators': [100, 600],
    'learning_rate': [0.01],
    'min_child_weight': [1, 10],
    'gamma': [0.5, 5],
    'subsample': [0.5, 1.0],
    'colsample_bytree': [0.5, 1.0],
    'max_depth': [3, 5],
    'objective': ['binary:logistic'],
    'nthread': [1],
    'verbosity': [0],
    'random_state': [42]
}

# Grid search on XGBoost
#scored with balanced accuracy (not the default accuracy) to match the Random Forest and SVM searches
xgb_model = XGBClassifier()
grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid,scoring='balanced_accuracy',n_jobs = -1,verbose=2,cv = 5)
grid_results = grid.fit(X_train_scaled,y_train)

# # Print the time spend and number of models ran
# print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % ((time.time() - start), len(grid_results.cv_results_['params'])))

end = time.time()
print ("Time elapsed:", end - start)

In [None]:
print("Best parameters:", grid_results.best_params_)
#model with best parameters
best_xgb_model = grid_results.best_estimator_ 
best_xgb_model

In [None]:
xgb_fs_importance = pd.DataFrame(list(zip(X_train_scaled.columns, best_xgb_model.feature_importances_)), columns=['Feature', 'Importance']).sort_values(by=['Importance'], ascending=False)  
xgb_fs_importance

In [None]:
results_df = pd.DataFrame(grid_results.cv_results_)
results_df= results_df.drop(["mean_fit_time","std_fit_time","mean_score_time","std_score_time","split0_test_score","split1_test_score","split2_test_score","split3_test_score","split4_test_score"],axis=1)
results_df

### e) Ensemble feature selection

In [None]:
best_rf_model

In [None]:
best_xgb_model

In [None]:
%%time
# Fit a VotingClassifier over ANOVA, XGBoost and Random Forest; the fitted
# estimators are read afterwards by compute_feature_importance.
# The fitted SelectKBest object is tagged as a classifier here; presumably so that
# VotingClassifier accepts it as an estimator - verify against the installed sklearn.
anova._estimator_type = "classifier"
from xgboost import XGBClassifier
# NOTE(review): this rebinds best_xgb_model to a default-parameter XGBClassifier,
# replacing the grid-search best estimator from the XGBoost section - confirm this is intended.
best_xgb_model = XGBClassifier(random_state = 42)

#ensemble feature selection
from sklearn.ensemble import VotingClassifier
#specify the estimators to use, specify anova first if it is used
#(compute_feature_importance treats estimators_[0] as the ANOVA selector in its fallback path)
estimators = [('ANOVA',anova),('XGB',best_xgb_model),('RF', best_rf_model)] 
voting_clf = VotingClassifier(estimators=estimators, voting='soft', weights=[1,1,1])
# only fit is called in this cell; no prediction is made with the ensemble here
voting_clf.fit(X_train_scaled, y_train)

In [None]:
def compute_feature_importance(voting_clf, weights):
    """ 
    Function to compute feature importance of ensemble feature selection 
    voting_clf = Voting Classifier object (fitted, exposes estimators_)
    weights = weights used for each estimator, in the same order as voting_clf.estimators_

    For every fitted estimator the importance vector is taken from feature_importances_
    when available, otherwise (e.g. ANOVA / SelectKBest) from scores_ divided by their sum.
    Each vector is multiplied by the weight of its own estimator and the results are summed.

    Returns a list with one combined importance per feature.
    Raises ValueError if there are no fitted estimators or fewer weights than estimators.
    """
    estimators = list(voting_clf.estimators_)
    if not estimators:
        raise ValueError("voting_clf has no fitted estimators")
    if len(weights) < len(estimators):
        raise ValueError("weights must provide one value per estimator")

    fe_scores = None
    #zip keeps every weight paired with its own estimator, whatever position ANOVA has
    for est, weight in zip(estimators, weights):
        if hasattr(est, "feature_importances_"):
            imp_score = np.asarray(est.feature_importances_, dtype=float)
        else:
            #score-based selector: normalise the scores so they sum to 1
            scores = np.asarray(est.scores_, dtype=float)
            imp_score = scores / scores.sum()

        imp_score_with_weight = imp_score * weight
        if fe_scores is None:
            fe_scores = imp_score_with_weight
        else:
            fe_scores = fe_scores + imp_score_with_weight

    return list(fe_scores)

#table of the combined (equally weighted) feature importances, most important feature first
ensemble_fs = pd.DataFrame({'Feature': X_train_scaled.columns})
ensemble_fs['Feature Importance'] = compute_feature_importance(voting_clf,weights = [1, 1, 1])
ensemble_fs = ensemble_fs.sort_values(by='Feature Importance', ascending=False)
ensemble_fs

In [None]:
#take top n important features only in train and validation
n=200
#rank by the ensemble importance table (ensemble_fs), not by the ANOVA table (ns_df)
features = ensemble_fs.head(n)[["Feature"]] #choose the top n important features
X_train_scaled = X_train_scaled[features["Feature"]] #filter X_train_scaled with top n important features
X_val_scaled = X_val_scaled[features["Feature"]] #filter X_val_scaled with top n important features

In [None]:
ensemble_fs.to_csv(featuresel_path+"Ensemble_Anova_XGB_RF.csv",index=False)

In [None]:
# df2 = pd.read_csv(path+'DataPreparation/negative_parameters.csv')

In [None]:
# df2.head()

In [None]:
# df_IDV = df2.loc[:, df2.columns.str.startswith("IDV")]
# df_IDV.head()

In [None]:
# df_IDV = df2.loc[:, df2.columns.str.startswith("IDV")]
# df_HVQK = df2.loc[:, df2.columns.str.startswith("HVQK")]
# df_IDV.min()
# df_HVQK.min()

In [None]:
# df_HVQK = df2.loc[:, df2.columns.str.startswith("HVQK")]
# df_HVQK.head()

In [None]:
# df_IDV.min()

In [None]:
# df_HVQK.min()

In [None]:
# df2.loc[:,((df2 == -5555).any()) & ((df2 == -9999).any())]

In [None]:
# import pandas as pd
# path = "C:/Users/nchong/OneDrive - Intel Corporation/Documents/ML based speckle POC/"
# df2 = pd.read_csv(path+"SNR_R5_ww51.4.csv")
# df3 = df2[df2["VID"].isin(row100na_df["VID"])]
# df3 = df3.reset_index(drop=True)
# df3.shape
# df3.to_csv(path+"/DataPreparation/NA_rows_withsf.csv",index=False)

### Model Building

### a) SVM


In [None]:
%%time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters; every combination is cross-validated.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # kernel type used by the algorithm
    'gamma': ['scale', 'auto'],  # kernel coefficient setting
    'class_weight': [None, 'balanced'],  # optional re-weighting for unbalanced data
    'random_state': [42],  # single fixed seed
}

# Exhaustive 5-fold grid search on SVM, ranked by balanced accuracy.
classifier = SVC()
grid = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    refit=True,  # retrain the winning configuration on the full training set
    n_jobs=-1,
    verbose=2,
    cv=5,
)
grid_results = grid.fit(X_train_scaled, y_train)

print("Best parameters:", grid_results.best_params_)

# The refitted estimator with the best parameters; later cells save and evaluate `model`.
model = grid_results.best_estimator_
print(model)

# fs_importance= pd.DataFrame(list(zip(X_train_scaled.columns, model.feature_importances_)), columns=['Feature', 'Importance']).sort_values(by=['Importance'], ascending=False)  
# display(fs_importance)

In [None]:
# from joblib import load
# grid_results= load(model_path+'SVMmodel.joblib') 

In [None]:
# Inspect the grid search results: one row per parameter combination.
results_df = pd.DataFrame(grid_results.cv_results_)
# Drop the timing columns and the per-fold scores by pattern instead of by a hard-coded
# split0..split4 list, so the cell keeps working when `cv` is not 5.
results_df = results_df.drop(columns=results_df.filter(regex=r"_time$|^split\d+_test_score$").columns)
results_df

In [None]:
import os

import joblib
from sklearn import metrics

# Output folder for the SVM artefacts; exist_ok makes the separate existence check unnecessary.
model_path = path + 'ModelBuilding/SVM/'
os.makedirs(model_path, exist_ok=True)

# MODEL SAVING: persist the refitted best estimator.
joblib.dump(model, model_path+"SVMmodel.joblib")

# Predict the classes for the validation set.
y_pred = model.predict(X_val_scaled)

# Overall accuracy in percent, stored as a one-row table.
overall_acc = round(metrics.accuracy_score(y_val, y_pred)*100,2)
overall_acc = pd.DataFrame([{'Overall Acc %':overall_acc}])
overall_acc.to_csv(model_path+"Overall_Accuracy.csv")

# Classification report, one row per class/average.
report = metrics.classification_report(y_val, y_pred,zero_division=0,output_dict=True)
report = pd.DataFrame(report).transpose()
report.to_csv(model_path+"Classification_Report.csv")

# Confusion matrix with the accuracy for each label.
# Iterate over the fixed labels 0/1 (matching the hard-coded row names below) so that a
# validation set containing only one class yields NaN for the missing class instead of a
# length-mismatch ValueError.
class_accuracies = []

for class_ in [0, 1]:
    class_mask = np.asarray(y_val == class_)
    if class_mask.any():
        class_acc = round(np.mean(y_pred[class_mask] == class_)*100,2)
    else:
        class_acc = np.nan
    class_accuracies.append(class_acc)

class_acc = pd.DataFrame(class_accuracies,index=['true:0', 'true:1'],columns= ["Accuracy %"])

cf_matrix = pd.DataFrame(
    metrics.confusion_matrix(y_val, y_pred, labels= [0, 1]),
    index=['true:0', 'true:1'],
    columns=['pred:0', 'pred:1']
)

ascend = None #input None/True/False to order the confusion matrix
cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
if ascend is not None:
    cf_matrix = cf_matrix.sort_values(by=['Accuracy %'], ascending=ascend)

# Keep the 'true:*' row labels in the file; without them the rows cannot be told apart,
# especially once they have been re-ordered by accuracy.
cf_matrix.to_csv(model_path+"Confusion_Matrix.csv")

# Validation results.
# pd.concat(axis=1) aligns on the index, and the prediction frame always has a fresh
# 0..n-1 index. Dropping every index first makes the join purely positional, which is the
# correspondence the metrics above already rely on.
val_results = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (X_val_sf, X_val_scaled, pd.DataFrame(y_val), pd.DataFrame(y_pred,columns = ["PRED_SPECKLE"]))
    ],
    axis=1,
)
val_results.to_csv(model_path+"Val_results.csv",index=False)


In [None]:
# Number of validation rows per class label.
y_val.value_counts()

In [None]:
# Render the confusion matrix, overall accuracy and classification report in the notebook output.
display(cf_matrix)
display(overall_acc)
display(report)

### b) Random Forest


In [None]:
import time

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300], # number of trees, default is 100
    # number of features to consider when looking for the best split.
    # 'sqrt' replaces 'auto': for classifiers the two were equivalent, and 'auto' has been
    # deprecated and later removed from scikit-learn, where it now fails parameter validation.
    'max_features': ['sqrt','log2'],
    'min_samples_leaf': [1,100], # minimum sample leaf (decision tree end node) size, default is 1
    'class_weight': [None,'balanced'], #weights for unbalanced data
    'random_state': [42]
}

# Grid search on random forest: 5-fold CV ranked by balanced accuracy.
classifier = RandomForestClassifier()
grid = GridSearchCV(estimator=classifier, param_grid=param_grid,scoring='balanced_accuracy',n_jobs = -1,verbose=2,cv = 5)
grid_results = grid.fit(X_train_scaled,y_train)

print("Best parameters:", grid_results.best_params_)

# Model with the best parameters (refit on the full training set, GridSearchCV's default).
model = grid_results.best_estimator_
print(model)

# fs_importance= pd.DataFrame(list(zip(X_train_scaled.columns, model.feature_importances_)), columns=['Feature', 'Importance']).sort_values(by=['Importance'], ascending=False)  
# display(fs_importance)

In [None]:
#from joblib import load
#grid_results= load(path+'RFmodel.joblib') 

In [None]:
# Inspect the grid search results: one row per parameter combination.
results_df = pd.DataFrame(grid_results.cv_results_)
# Drop the timing columns and the per-fold scores by pattern instead of by a hard-coded
# split0..split4 list, so the cell keeps working when `cv` is not 5.
results_df = results_df.drop(columns=results_df.filter(regex=r"_time$|^split\d+_test_score$").columns)
results_df

In [None]:
import os

import joblib
from sklearn import metrics

# Output folder for the Random Forest artefacts; exist_ok makes the separate existence check unnecessary.
model_path = path + 'ModelBuilding/RF/'
os.makedirs(model_path, exist_ok=True)

# MODEL SAVING: persist the refitted best estimator.
joblib.dump(model, model_path+"RFmodel.joblib")

# Predict the classes for the validation set.
y_pred = model.predict(X_val_scaled)

# Overall accuracy in percent, stored as a one-row table.
overall_acc = round(metrics.accuracy_score(y_val, y_pred)*100,2)
overall_acc = pd.DataFrame([{'Overall Acc %':overall_acc}])
overall_acc.to_csv(model_path+"Overall_Accuracy.csv")

# Classification report, one row per class/average.
report = metrics.classification_report(y_val, y_pred,zero_division=0,output_dict=True)
report = pd.DataFrame(report).transpose()
report.to_csv(model_path+"Classification_Report.csv")

# Confusion matrix with the accuracy for each label.
# Iterate over the fixed labels 0/1 (matching the hard-coded row names below) so that a
# validation set containing only one class yields NaN for the missing class instead of a
# length-mismatch ValueError.
class_accuracies = []

for class_ in [0, 1]:
    class_mask = np.asarray(y_val == class_)
    if class_mask.any():
        class_acc = round(np.mean(y_pred[class_mask] == class_)*100,2)
    else:
        class_acc = np.nan
    class_accuracies.append(class_acc)

class_acc = pd.DataFrame(class_accuracies,index=['true:0', 'true:1'],columns= ["Accuracy %"])

cf_matrix = pd.DataFrame(
    metrics.confusion_matrix(y_val, y_pred, labels= [0, 1]),
    index=['true:0', 'true:1'],
    columns=['pred:0', 'pred:1']
)

ascend = None #input None/True/False to order the confusion matrix
cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
if ascend is not None:
    cf_matrix = cf_matrix.sort_values(by=['Accuracy %'], ascending=ascend)

# Keep the 'true:*' row labels in the file; without them the rows cannot be told apart,
# especially once they have been re-ordered by accuracy.
cf_matrix.to_csv(model_path+"Confusion_Matrix.csv")

# Validation results.
# pd.concat(axis=1) aligns on the index, and the prediction frame always has a fresh
# 0..n-1 index. Dropping every index first makes the join purely positional, which is the
# correspondence the metrics above already rely on.
val_results = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (X_val_sf, X_val_scaled, pd.DataFrame(y_val), pd.DataFrame(y_pred,columns = ["PRED_SPECKLE"]))
    ],
    axis=1,
)
val_results.to_csv(model_path+"Val_results.csv",index=False)


In [None]:
# Number of validation rows per class label.
y_val.value_counts()

In [None]:
# Render the confusion matrix, overall accuracy and classification report in the notebook output.
display(cf_matrix)
display(overall_acc)
display(report)

### c) XGBoost


In [None]:
import time

In [None]:
%%time
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters; every combination is cross-validated.
param_grid = {
    'n_estimators': [100, 600],
    'learning_rate': [0.01],
    'min_child_weight': [1, 10],
    'gamma': [0.5, 5],
    'subsample': [0.5, 1.0],
    'colsample_bytree': [0.5, 1.0],
    'max_depth': [3, 5],
    'objective': ['binary:logistic'],
    'nthread': [1],  # one thread per model; parallelism comes from the grid search below
    'verbosity': [0],
    'random_state': [42]
}

# Grid search on XGBoost.
# Use the same selection metric and search settings as the SVM / Random Forest / MLP cells;
# without an explicit `scoring` the search falls back to plain accuracy, which makes the
# chosen XGBoost model non-comparable with the others.
classifier = XGBClassifier()
grid = GridSearchCV(estimator=classifier, param_grid=param_grid,scoring='balanced_accuracy',refit = True,n_jobs = -1,verbose=2,cv = 5)
grid_results = grid.fit(X_train_scaled,y_train)

print("Best parameters:", grid_results.best_params_)

# Model with the best parameters, refitted on the full training set.
model = grid_results.best_estimator_
print(model)

# fs_importance= pd.DataFrame(list(zip(X_train_scaled.columns, model.feature_importances_)), columns=['Feature', 'Importance']).sort_values(by=['Importance'], ascending=False)  
# display(fs_importance)

In [None]:
# from joblib import load
# grid_results= load(model_path+'XGBmodel.joblib') 

In [None]:
# Inspect the grid search results: one row per parameter combination.
results_df = pd.DataFrame(grid_results.cv_results_)
# Drop the timing columns and the per-fold scores by pattern instead of by a hard-coded
# split0..split4 list, so the cell keeps working when `cv` is not 5.
results_df = results_df.drop(columns=results_df.filter(regex=r"_time$|^split\d+_test_score$").columns)
results_df

In [None]:
import os

import joblib
from sklearn import metrics

# Output folder for the XGBoost artefacts; exist_ok makes the separate existence check unnecessary.
model_path = path + 'ModelBuilding/XGB/'
os.makedirs(model_path, exist_ok=True)

# MODEL SAVING: persist the refitted best estimator.
joblib.dump(model, model_path+"XGBmodel.joblib")

# Predict the classes for the validation set.
y_pred = model.predict(X_val_scaled)

# Overall accuracy in percent, stored as a one-row table.
overall_acc = round(metrics.accuracy_score(y_val, y_pred)*100,2)
overall_acc = pd.DataFrame([{'Overall Acc %':overall_acc}])
overall_acc.to_csv(model_path+"Overall_Accuracy.csv")

# Classification report, one row per class/average.
report = metrics.classification_report(y_val, y_pred,zero_division=0,output_dict=True)
report = pd.DataFrame(report).transpose()
report.to_csv(model_path+"Classification_Report.csv")

# Confusion matrix with the accuracy for each label.
# Iterate over the fixed labels 0/1 (matching the hard-coded row names below) so that a
# validation set containing only one class yields NaN for the missing class instead of a
# length-mismatch ValueError.
class_accuracies = []

for class_ in [0, 1]:
    class_mask = np.asarray(y_val == class_)
    if class_mask.any():
        class_acc = round(np.mean(y_pred[class_mask] == class_)*100,2)
    else:
        class_acc = np.nan
    class_accuracies.append(class_acc)

class_acc = pd.DataFrame(class_accuracies,index=['true:0', 'true:1'],columns= ["Accuracy %"])

cf_matrix = pd.DataFrame(
    metrics.confusion_matrix(y_val, y_pred, labels= [0, 1]),
    index=['true:0', 'true:1'],
    columns=['pred:0', 'pred:1']
)

ascend = None #input None/True/False to order the confusion matrix
cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
if ascend is not None:
    cf_matrix = cf_matrix.sort_values(by=['Accuracy %'], ascending=ascend)

# Keep the 'true:*' row labels in the file; without them the rows cannot be told apart,
# especially once they have been re-ordered by accuracy.
cf_matrix.to_csv(model_path+"Confusion_Matrix.csv")

# Validation results.
# pd.concat(axis=1) aligns on the index, and the prediction frame always has a fresh
# 0..n-1 index. Dropping every index first makes the join purely positional, which is the
# correspondence the metrics above already rely on.
val_results = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (X_val_sf, X_val_scaled, pd.DataFrame(y_val), pd.DataFrame(y_pred,columns = ["PRED_SPECKLE"]))
    ],
    axis=1,
)
val_results.to_csv(model_path+"Val_results.csv",index=False)


In [None]:
# Number of validation rows per class label.
y_val.value_counts()

In [None]:
# Render the confusion matrix, overall accuracy and classification report in the notebook output.
display(cf_matrix)
display(overall_acc)
display(report)

### d) Deep Learning - MLP

In [None]:
%%time
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters; every combination is cross-validated.
param_grid = {
    'hidden_layer_sizes': [(50, 50)],  # two hidden layers of 50 units each
    'activation': ['logistic', 'tanh'],  # other options: 'identity', 'relu'
    'solver': ['adam'],  # other options: 'lbfgs', 'sgd'
    # 'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'random_state': [42],  # single fixed seed
}

# Exhaustive 5-fold grid search on MLP, ranked by balanced accuracy.
classifier = MLPClassifier()
grid = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    refit=True,  # retrain the winning configuration on the full training set
    n_jobs=-1,
    verbose=2,
    cv=5,
)
grid_results = grid.fit(X_train_scaled, y_train)

print("Best parameters:", grid_results.best_params_)

# The refitted estimator with the best parameters; later cells save and evaluate `model`.
model = grid_results.best_estimator_
print(model)


In [None]:
# Inspect the grid search results: one row per parameter combination.
results_df = pd.DataFrame(grid_results.cv_results_)
# Drop the timing columns and the per-fold scores by pattern instead of by a hard-coded
# split0..split4 list, so the cell keeps working when `cv` is not 5.
results_df = results_df.drop(columns=results_df.filter(regex=r"_time$|^split\d+_test_score$").columns)
results_df

In [None]:
import os

import joblib
from sklearn import metrics

# Output folder for the MLP artefacts; exist_ok makes the separate existence check unnecessary.
model_path = path + 'ModelBuilding/MLP/'
os.makedirs(model_path, exist_ok=True)

# MODEL SAVING: persist the refitted best estimator.
joblib.dump(model, model_path+"MLPmodel.joblib")

# Predict the classes for the validation set.
y_pred = model.predict(X_val_scaled)

# Overall accuracy in percent, stored as a one-row table.
overall_acc = round(metrics.accuracy_score(y_val, y_pred)*100,2)
overall_acc = pd.DataFrame([{'Overall Acc %':overall_acc}])
overall_acc.to_csv(model_path+"Overall_Accuracy.csv")

# Classification report, one row per class/average.
report = metrics.classification_report(y_val, y_pred,zero_division=0,output_dict=True)
report = pd.DataFrame(report).transpose()
report.to_csv(model_path+"Classification_Report.csv")

# Confusion matrix with the accuracy for each label.
# Iterate over the fixed labels 0/1 (matching the hard-coded row names below) so that a
# validation set containing only one class yields NaN for the missing class instead of a
# length-mismatch ValueError.
class_accuracies = []

for class_ in [0, 1]:
    class_mask = np.asarray(y_val == class_)
    if class_mask.any():
        class_acc = round(np.mean(y_pred[class_mask] == class_)*100,2)
    else:
        class_acc = np.nan
    class_accuracies.append(class_acc)

class_acc = pd.DataFrame(class_accuracies,index=['true:0', 'true:1'],columns= ["Accuracy %"])

cf_matrix = pd.DataFrame(
    metrics.confusion_matrix(y_val, y_pred, labels= [0, 1]),
    index=['true:0', 'true:1'],
    columns=['pred:0', 'pred:1']
)

ascend = None #input None/True/False to order the confusion matrix
cf_matrix = pd.concat([cf_matrix,class_acc],axis=1)
if ascend is not None:
    cf_matrix = cf_matrix.sort_values(by=['Accuracy %'], ascending=ascend)

# Keep the 'true:*' row labels in the file; without them the rows cannot be told apart,
# especially once they have been re-ordered by accuracy.
cf_matrix.to_csv(model_path+"Confusion_Matrix.csv")

# Validation results.
# pd.concat(axis=1) aligns on the index, and the prediction frame always has a fresh
# 0..n-1 index. Dropping every index first makes the join purely positional, which is the
# correspondence the metrics above already rely on.
val_results = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (X_val_sf, X_val_scaled, pd.DataFrame(y_val), pd.DataFrame(y_pred,columns = ["PRED_SPECKLE"]))
    ],
    axis=1,
)
val_results.to_csv(model_path+"Val_results.csv",index=False)


### e) Deep Learning - TabNet

In [None]:
%%time
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.model_selection import GridSearchCV

# TabNet constructor hyperparameters.
# The previous grid was a copy of the MLPClassifier grid ('hidden_layer_sizes', 'activation',
# 'solver', 'random_state'); none of those are TabNetClassifier parameters, so the search
# could not configure the estimator. TabNet names its random seed `seed`.
param_grid = {
    'mask_type': ['entmax', 'sparsemax'],
    'seed': [42]
}

# Grid search on TabNet
# NOTE(review): GridSearchCV clones the estimator for every candidate; confirm that the
# installed pytorch-tabnet version supports sklearn cloning before relying on this cell.
classifier = TabNetClassifier()
grid = GridSearchCV(estimator=classifier, param_grid=param_grid,scoring='balanced_accuracy',refit = True,n_jobs = -1,verbose=2,cv = 5)
# pytorch-tabnet does not accept pandas objects, so pass plain NumPy arrays.
grid_results = grid.fit(X_train_scaled.to_numpy(),y_train.to_numpy())

print("Best parameters:", grid_results.best_params_)

#model with best parameters
model = grid_results.best_estimator_
print(model)


In [None]:
%%time
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.model_selection import GridSearchCV



In [None]:
%%time
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.model_selection import GridSearchCV

# TabNet constructor hyperparameters. Each value must be a list of candidates, so the
# scheduler settings are a one-element list holding a dict (the previous literal was a
# syntax error, as was the missing comma after 'optimizer_fn').
param_grid = {
    'optimizer_fn': [torch.optim.Adam],
    'optimizer_params': [dict(lr=2e-2)],
    'scheduler_params': [{'gamma': 0.95, 'step_size': 10}],
    'scheduler_fn': [torch.optim.lr_scheduler.StepLR],
    'mask_type': ['entmax'], #['entmax', 'sparsemax']
    'seed': [42]  # TabNet's name for the random seed (it has no `random_state`)
}

# pytorch-tabnet does not accept pandas objects, so convert once up front.
X_train_np, y_train_np = X_train_scaled.to_numpy(), y_train.to_numpy()
X_val_np, y_val_np = X_val_scaled.to_numpy(), y_val.to_numpy()

# Grid search on TabNet
# An explicit `scoring` is required because TabNetClassifier provides no `score` method;
# balanced accuracy matches the other model sections.
# NOTE(review): GridSearchCV clones the estimator for every candidate; confirm that the
# installed pytorch-tabnet version supports sklearn cloning before relying on this cell.
classifier = TabNetClassifier()
grid = GridSearchCV(estimator=classifier, param_grid=param_grid, scoring='balanced_accuracy')
# NOTE(review): eval_set is forwarded unchanged to every CV fit, so each fold is monitored on
# the full training set (including its own held-out part) and on the validation set.
grid_results = grid.fit(X_train_np, y_train_np, eval_set=[(X_train_np, y_train_np), (X_val_np, y_val_np)],
                    eval_name=['train', 'valid'], max_epochs=1000, patience=50, batch_size=1580, virtual_batch_size=790,
                    num_workers=0, weights=1, drop_last=False)
print("Best parameters:", grid_results.best_params_)

#model with best parameters
model = grid_results.best_estimator_
print(model)

In [None]:
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.metrics import roc_auc_score, accuracy_score

# define the model
classifier = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":10, # how to use learning rate scheduler
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='entmax' # "sparsemax"
                      )

# pytorch-tabnet does not accept pandas DataFrame/Series inputs (its input check asks for
# `X.values`), so convert the training and validation data to NumPy arrays once.
X_train_np, y_train_np = X_train_scaled.to_numpy(), y_train.to_numpy()
X_val_np, y_val_np = X_val_scaled.to_numpy(), y_val.to_numpy()

# fit the model; training is monitored on both sets and stops early after `patience`
# epochs without improvement
classifier.fit(
    X_train_np, y_train_np,
    eval_set=[(X_train_np, y_train_np), (X_val_np, y_val_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc','accuracy'],
    max_epochs=1000 , patience=50,
    batch_size=256, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)



#Notes:
#batch_size=15797*0.1=1580
#virtual_batch_size=15797*0.05=790
#X_train, X_val, y_train, y_val

In [None]:
# import numpy as np


# def plot_confusion_matrix(cm,
#                           target_names,
#                           title='Confusion matrix',
#                           cmap=None,
#                           normalize=True):
#     """
#     given a sklearn confusion matrix (cm), make a nice plot

#     Arguments
#     ---------
#     cm:           confusion matrix from sklearn.metrics.confusion_matrix

#     target_names: given classification classes such as [0, 1, 2]
#                   the class names, for example: ['high', 'medium', 'low']

#     title:        the text to display at the top of the matrix

#     cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
#                   see http://matplotlib.org/examples/color/colormaps_reference.html
#                   plt.get_cmap('jet') or plt.cm.Blues

#     normalize:    If False, plot the raw numbers
#                   If True, plot the proportions

#     Usage
#     -----
#     plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
#                                                               # sklearn.metrics.confusion_matrix
#                           normalize    = True,                # show proportions
#                           target_names = y_labels_vals,       # list of names of the classes
#                           title        = best_estimator_name) # title of graph

#     Citation
#     ---------
#     http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

#     """
#     import matplotlib.pyplot as plt
#     import numpy as np
#     import itertools

#     accuracy = np.trace(cm) / np.sum(cm).astype('float')
#     misclass = 1 - accuracy

#     if cmap is None:
#         cmap = plt.get_cmap('Blues')

#     plt.figure(figsize=(8, 6))
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()

#     if target_names is not None:
#         tick_marks = np.arange(len(target_names))
#         plt.xticks(tick_marks, target_names, rotation=45)
#         plt.yticks(tick_marks, target_names)

#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


#     thresh = cm.max() / 1.5 if normalize else cm.max() / 2
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         if normalize:
#             plt.text(j, i, "{:0.4f}".format(cm[i, j]),
#                      horizontalalignment="center",
#                      color="white" if cm[i, j] > thresh else "black")
#         else:
#             plt.text(j, i, "{:,}".format(cm[i, j]),
#                      horizontalalignment="center",
#                      color="white" if cm[i, j] > thresh else "black")


#     plt.tight_layout()
#     plt.ylabel('Actual')
#     plt.xlabel('Predicted \naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
#     plt.show()

In [None]:
# cm = metrics.confusion_matrix(y_val, y_pred)
# plot_confusion_matrix(cm=cm,target_names=[0,1],title='Confusion matrix', cmap=None,normalize=True)
                          