# Explainable-AI model on iDetect dataset:

In [284]:
import os

import pandas as pd

# Notebook configuration - edit these before running the cells below.
CLASSIFICATION_TYPE = "Binary"  # labelling scheme to load: 'Binary' or 'Multi_Class'
SAVE_DIR = "../data/iDetect_refine/"  # folder the refined CSV files are written to

In [285]:

# Suffix shared by every input and output CSV of the selected classification type.
file_part = CLASSIFICATION_TYPE + '.csv'

if CLASSIFICATION_TYPE not in ('Binary', 'Multi_Class'):
    # Fail fast: without a valid type no dataframe is loaded, and every later
    # cell would otherwise die with a confusing NameError.
    raise ValueError(
        "Invalid classification type selected, select either 'Binary' or 'Multi_Class', "
        f"but not - {CLASSIFICATION_TYPE}"
    )

CNN_data = '../data/iDetect/CNN-Al_Boghdady_' + file_part
RNN_data = '../data/iDetect/RNN-Al_Boghdady_' + file_part
RF_data = '../data/iDetect/RF-Al_Boghdady_' + file_part

df_CNN = pd.read_csv(CNN_data, encoding='unicode_escape')
df_RNN = pd.read_csv(RNN_data, encoding='unicode_escape')
df_RF = pd.read_csv(RF_data, encoding='unicode_escape')

# All three frames must contain rows (the original check tested df_RF twice
# and never looked at df_RNN).
if not df_CNN.empty and not df_RNN.empty and not df_RF.empty:
    # Correcting the labeling of RF data file: its textual labels are mapped to
    # 1/0; any label other than these two strings becomes NaN.
    df_RF['isMalicious'] = df_RF['isMalicious'].map({'Vulnerable Code': 1, 'Benign Code': 0})
    print('df_CNN shape: ', df_CNN.shape)
    print(df_CNN.head(5))

df_CNN shape:  (5117, 2)
                                                code  isMalicious
0  void printfUART_buf  ( char *buf ,   int  len ...            1
1  check_opt_size  ( cp_opt_t *opt ,  unsigned ch...            1
2  cp_ft  ( cp_queue_t *queue ,  cp_tid_t id )   ...            1
3  start  ( cp_pdu_t *pdu )   {  if   ( pdu && pd...            1
4  cp_clone_pdu  ( cp_pdu_t *pdu )   {  cp_pdu_t ...            1


## Finding differences in _iDetect/CNN/*.csv_ and _iDetect/RNN/*.csv_ data files. 
- **Guru:** I suspect these are duplicate files, because the author of iDetect appears to have placed the same files in multiple locations.
Let's therefore check whether they are actually identical.
- Follow - https://stackoverflow.com/questions/20225110/comparing-two-dataframes-and-getting-the-differences

In [286]:
def drop_ambiguous(df):
    """
    Remove code samples that carry conflicting 'isMalicious' labels.

    Rows are grouped on the 'code' column. A code is ambiguous when its group
    holds more than one distinct label; all rows of such a code are discarded.
    Every remaining code is returned exactly once with its single label.

    # <need attention> should we pick malicious sample if it also labeled as non-malicious?

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'code' and 'isMalicious'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'code' and 'isMalicious', one row per unambiguous code,
        ordered by 'code', with a fresh 0..n-1 index.
    """
    labels_by_code = df.groupby('code', sort=True)['isMalicious']

    # dropna=False: a missing label counts as a distinct value, so a code that
    # is labelled in one row and unlabelled in another is still ambiguous.
    n_labels = labels_by_code.nunique(dropna=False)

    # Within an unambiguous group all labels are equal, so the first one is
    # the label. Reducing to a scalar here (instead of returning the group
    # itself from agg) also works when the same (code, label) row is repeated.
    single_label = labels_by_code.first()

    return single_label[n_labels == 1].reset_index()
            
        
def filter_dup_rows(df):
    """
    Clean a dataset of labelled code samples.

    Steps, in order:
      0. collapse every run of whitespace in 'code' to a single space,
      1. drop exact duplicate rows,
      2. drop rows whose 'isMalicious' label is missing,
      3. drop very short code samples (5 characters or fewer),
      4. drop ambiguous samples, i.e. codes that occur with more than one label.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Must provide the columns 'code' and 'isMalicious'. The frame passed in
        is left untouched; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame with one row per code, or None (after printing a
        notice) when `df` is None or has no rows.

    Raises
    ------
    AssertionError
        If a code still appears more than once after cleaning.
    """
    if df is None or len(df) == 0:
        print('Filtering is not possible with empty dataframes.')
        return None

    # assign() returns a new frame, so the caller's dataframe is not mutated
    # (the original wrote the normalized column back into the input).
    cleaned = df.assign(code=df.code.replace(r'\s+', ' ', regex=True))

    # Step 1 - drop duplicates
    cleaned = cleaned.drop_duplicates(keep='first').reset_index(drop=True)

    # Step 2 - drop nan values
    cleaned = cleaned[cleaned.isMalicious.notna()]

    # Step 3 - drop very short code samples
    cleaned = cleaned[cleaned.code.str.len() > 5]

    # Step 4 - drop ambiguous samples: codes that appear with conflicting labels
    cleaned = drop_ambiguous(cleaned).reset_index(drop=True)

    assert cleaned.code.nunique()==len(cleaned), f"Same code(s) has multiple labels (ambiguous samples) \n \
            - (unique rows on code):{cleaned.code.nunique()} Vs (total rows):{len(cleaned)}"
    return cleaned


def diff_dfs(df1, df2):
    """
    Clean two dataframes and return their combined, unambiguous union.

    Each input is first cleaned with filter_dup_rows. The cleaned frames are
    concatenated, exact duplicate rows are removed, and codes that end up with
    conflicting labels across the two inputs are dropped via drop_ambiguous.
    A notice is printed when both cleaned inputs hold exactly the same
    (code, label) pairs.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame or None
        Frames with the columns 'code' and 'isMalicious'.

    Returns
    -------
    pandas.DataFrame or None
        union(df1, df2) after cleaning, or None (after printing a notice) when
        either input is None or empty.
    """
    if df1 is None or df2 is None:
        print('Filtering is not possible with empty dataframe.')
        return None

    df1 = filter_dup_rows(df1)
    df2 = filter_dup_rows(df2)
    if df1 is None or df2 is None:
        # filter_dup_rows hands back None for an empty frame; stop here rather
        # than crash inside concat()/len() below.
        print('Filtering is not possible with empty dataframe.')
        return None

    df = pd.concat([df1, df2])
    df = df.drop_duplicates(keep='first').reset_index(drop=True)
    df = drop_ambiguous(df)

    # Compare the actual (code, label) pairs. Equal row counts alone are not
    # proof of identity: dropping ambiguous codes can shrink a union of two
    # different frames back to the size of its inputs.
    pairs1 = set(zip(df1.code, df1.isMalicious))
    pairs2 = set(zip(df2.code, df2.isMalicious))
    if pairs1 == pairs2:
        print("Both dataframes are identical.")
    return df
 

def _report_union(heading, left, right):
    """Print `heading`, build the union of both frames with diff_dfs, print its shape and return it."""
    print(heading)
    union = diff_dfs(left, right)
    print('Shape of the union dataset:', union.shape)
    return union


# CNN vs RNN - both of them are identical
df_CNN_RNN = _report_union('\nDataframe of difference between df_CNN and df_RNN:', df_CNN, df_RNN)

# RNN vs RF
df_RNN_RF = _report_union('\nDataframe of difference between df_RNN and df_RF:', df_RNN, df_RF)

# CNN vs RF
df_CNN_RF = _report_union('\nDataframe of difference between df_CNN and df_RF: ', df_CNN, df_RF)

# difference of two diffs
df_CNN_RNN_RF = _report_union('\nDataframe of difference between df_CNN_RF and df_RNN_RF:', df_CNN_RF, df_RNN_RF)


Dataframe of difference between df_CNN and df_RNN:
Both dataframes are identical.
Shape of the union dataset: (4720, 2)

Dataframe of difference between df_RNN and df_RF:
Shape of the union dataset: (6245, 2)

Dataframe of difference between df_CNN and df_RF: 
Shape of the union dataset: (6245, 2)

Dataframe of difference between df_CNN_RF and df_RNN_RF:
Both dataframes are identical.
Shape of the union dataset: (6245, 2)


## Observation: _df_CNN_ and _df_RNN_ are identical, while _df_RF_ differs from them.
#### Therefore, let's save only two variants of the processed dataset (derived from the original iDetect dataset): the cleaned df_RNN, and the union df_RNN_RF (which is identical to df_CNN_RF).

In [287]:
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a no-op when it is already there).
os.makedirs(SAVE_DIR, exist_ok=True)

# Variant 1: the cleaned RNN data.
filter_dup_rows(df_RNN).to_csv(SAVE_DIR + 'DNN_' + file_part, index=False)
# Variant 2: the cleaned union of the RNN and RF data.
filter_dup_rows(df_RNN_RF).to_csv(SAVE_DIR + 'RF_' + file_part, index=False)