In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import datetime as dt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve

In [2]:
def df_key(x,y,z):
    """Build the lookup key 'o<x>t<y>f<z>' from three values cast to int.

    Each argument is passed through int() before formatting, so floats are
    truncated and numeric strings are accepted.
    """
    first, second, third = int(x), int(y), int(z)
    return f"o{first}t{second}f{third}"

In [3]:
# Import observation data

observation_file = "/Users/eormacstudio/Documents/winsorized_statistics/python_script/general_eor1_update_241010.csv"

# Columns retained from the observation table.
observation_columns = [
    'obs_id', 'groupid', 'starttime_utc', 'local_sidereal_time_deg', 'duration',
    'int_time', 'freq_res', 'dataqualityname', 'bad_tiles', 'calibration',
    'calibration_delays', 'center_frequency_mhz', 'channel_center_frequencies_mhz_csv',
    'ra', 'ra_pointing', 'ra_phase_center', 'dec', 'dec_pointing', 'dec_phase_center',
    'deleted_flag', 'good_tiles', 'mode', 'sky_temp', 'stoptime_utc', 'total_tiles',
    'gridpoint_name', 'gridpoint_number',
]

df = pd.read_csv(observation_file, header=0, engine='python')
df = df[observation_columns]

# Parse the start time once; the calendar date is derived from the parsed value.
start_times = df['starttime_utc'].apply(
    lambda stamp: dt.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S.000Z")
)
df['date'] = start_times.apply(lambda moment: moment.date())
df['date_time'] = start_times

# Number the distinct dates 1, 2, ... in order of first appearance and keep
# only the first one. reset_index() (without drop) keeps the old row labels
# in an 'index' column.
date_codes, _ = pd.factorize(df['date'])
df['partition'] = date_codes + 1
df = df[df['partition'] == 1].reset_index()

obs_list = df['obs_id'].to_list()

# Unique partition numbers and grid point numbers among the kept rows.
number_of_days = np.unique(df['partition'])
gn = np.unique(df['gridpoint_number'])

In [4]:
# Locations of the z-score results and of the injected-RFI tables.
results_directory = "/Volumes/eor_hdd_2/exponential_weighting/exponential_weighting_results/model/"
added_rfi_directory = "/Users/eormacstudio/Documents/winsorized_statistics/winsorizing_model_visibility/higher_thermal_noise/same_level_noise/"

# Values the later cells iterate over.
polar = ['XX', 'YY']
sn = ["sn1", "sn2", "sn3", "sn4", "sn5"]
data_type = ["real", "amplitude", "imaginary"]
threshold_type = ['negative', 'positive', 'combine']
integration = [2, 8]

file_name = ["all_z_scores_data_day_1_grid_0_integration_2_amplitude.parquet"]

# Array dimensions.
number_of_tile = 128
number_of_frequency = 768

# Lookup from the 56 fine timeblocks to 14 coarse ones (four fine per coarse).
time_convert = np.arange(14).repeat(4)
tile_list = np.arange(128)
# z-score cuts 3, 4, ..., 8.
z_score = np.arange(3, 9)

In [5]:
# Pair each entry of `sn` with the `level` string at the same position.
level = ['901.59', '1298.45', '1620.04', '1906.17', '2176.54']
df_rfi_level = pd.DataFrame(
    data={'sn': sn, 'level': level},
    columns=['sn', 'level'],
)

---

#### Combine all

In [None]:
# Score the z-score flags against the injected ("added") RFI for every
# combination of data type, noise level, polarization, integration time,
# z-score cut and threshold sign. One row of metrics is collected per
# combination; combinations with no flagged sample are skipped.

# Timeblocks per observation at each integration time (56 for 2, 14 for 8).
timeblocks_by_integration = {2: 56, 8: 14}

stats_columns = ['data_type', 'sn', 'polarization', 'total_rfi', 'integration',
                 'z_score', 'threshold_type', 'accuracy', 'precision', 'recall',
                 'f1', 'tp', 'fp', 'tn', 'fn']


def make_rfi_keys(obs_ids, timeblocks, channels):
    """Return a list with one df_key string per (obs_id, timeblock, channel) triple."""
    return [df_key(o, t, c) for o, t, c in zip(obs_ids, timeblocks, channels)]


def build_grid_keys(n_timeblocks):
    """Return a Series of rfi_keys for the full obs_list x timeblock x channel grid.

    obs_id varies slowest and the frequency channel fastest.
    """
    timeblocks = np.arange(0, n_timeblocks)
    channels = np.arange(0, number_of_frequency)
    grid_obs = np.repeat(obs_list, len(timeblocks) * len(channels))
    grid_time = np.tile(np.repeat(timeblocks, len(channels)), len(obs_list))
    grid_freq = np.tile(channels, len(obs_list) * len(timeblocks))
    return pd.Series(make_rfi_keys(grid_obs, grid_time, grid_freq), dtype=object)


def load_added_rfi_keys(sn_name, pol_index, integ):
    """Return the set of rfi_keys injected for one noise level and polarization.

    For the 8 integration the timeblock is mapped through `time_convert`
    so the keys line up with the 14-timeblock grid.
    """
    added = pd.read_csv(added_rfi_directory + "%s/add_rfi_all_obs_location_multiple_rfi_fix_thermal_%s.txt" % (sn_name, sn_name))
    # pol_1 is stripped of '[' and ']' before the integer cast.
    pol = added['pol_1'].apply(lambda x: str(x).replace('[', '').replace(']', '')).astype(int)
    added = added[pol == pol_index]
    timeblocks = added['timeblock']
    if integ == 8:
        timeblocks = timeblocks.apply(lambda x: time_convert[x])
    return set(make_rfi_keys(added['obs_id'], timeblocks, added['channel']))


# The grid depends only on the integration time, so build its keys once per
# integration instead of once per z-score / threshold combination.
grid_keys_by_integration = {
    integ: build_grid_keys(timeblocks_by_integration[integ]) for integ in integration
}

stats_records = []

for dtype_name in data_type:
    for sn_name in sn:
        for pol_index, pol_name in enumerate(polar):
            for integ in integration:
                print(dtype_name, sn_name, pol_name, integ)

                n_timeblocks = timeblocks_by_integration[integ]
                grid_keys = grid_keys_by_integration[integ]

                # Read the injection table of the noise level being scored.
                # NOTE(review): this previously always read the sn[0] file while
                # the z-scores came from sn[j]; confirm one file exists per level.
                added_keys = load_added_rfi_keys(sn_name, pol_index, integ)
                added_flag = grid_keys.isin(added_keys).astype(int)

                # Gather the z-scores of every grid point for this combination.
                frames = []
                for grid_number in gn:
                    aa = pd.read_parquet(results_directory + "%s/%ss/all_z_scores_data_day_%s_grid_%s_integration_%s_%s.parquet" % (sn_name, integ, number_of_days[0], grid_number, integ, dtype_name))
                    aa = aa.explode(["data", "z_score", "obs_id"]).reset_index(drop=True)
                    aa = aa[aa['polarization'].isin([pol_index])].reset_index(drop=True)

                    # Label each exploded row with its timeblock; the tile length
                    # must equal the row count or the assignment raises.
                    n_obs = len(df[(df['partition'] == number_of_days[0]) & (df['gridpoint_number'] == grid_number)])
                    aa['timeblock'] = np.tile(np.arange(0, n_timeblocks, 1), number_of_tile * number_of_frequency * n_obs)

                    aa = aa[~aa['z_score'].isna()].reset_index(drop=True)
                    aa['z_score'] = aa['z_score'].astype(float)
                    frames.append(aa)
                df_so = pd.concat(frames).reset_index(drop=True)

                for z_cut in z_score:
                    print('running for z-score: ', z_cut)
                    for thr in threshold_type:
                        print(thr)

                        # Select the samples beyond the cut on the requested side(s).
                        if thr == 'positive':
                            flagged = df_so['z_score'] > z_cut
                        elif thr == 'negative':
                            flagged = df_so['z_score'] < -z_cut
                        elif thr == 'combine':
                            flagged = (df_so['z_score'] < -z_cut) | (df_so['z_score'] > z_cut)
                        else:
                            raise ValueError("unknown threshold_type: %s" % thr)

                        ss = df_so[flagged]
                        if len(ss) == 0:
                            # Nothing flagged: no row is recorded for this combination.
                            continue

                        detected_keys = set(make_rfi_keys(ss['obs_id'], ss['timeblock'], ss['frequency']))
                        detected_flag = grid_keys.isin(detected_keys).astype(int)

                        # labels=[0, 1] keeps the matrix 2x2 even if one class is
                        # absent, so the unpacking below cannot fail.
                        cm = confusion_matrix(added_flag, detected_flag, labels=[0, 1])
                        TN, FP, FN, TP = cm.ravel()

                        stats_records.append({
                            'data_type': dtype_name,
                            'sn': sn_name,
                            'polarization': pol_name,
                            'total_rfi': len(added_keys),
                            'integration': integ,
                            'z_score': z_cut,
                            'threshold_type': thr,
                            'accuracy': accuracy_score(added_flag, detected_flag),
                            'precision': precision_score(added_flag, detected_flag),
                            'recall': recall_score(added_flag, detected_flag),
                            'f1': f1_score(added_flag, detected_flag),
                            'tp': TP,
                            'fp': FP,
                            'tn': TN,
                            'fn': FN,
                        })

# Build the frame once; passing `columns` keeps the schema even with no rows.
df_stats = pd.DataFrame(stats_records, columns=stats_columns)

df_stats['tp_rate'] = df_stats['tp']/df_stats['total_rfi']
df_stats['fp_rate'] = df_stats['fp']/(df_stats['fp'] + df_stats['tn'])

real sn1 XX 2
running for z-score:  3
negative
positive
combine
running for z-score:  4
negative
positive
combine
running for z-score:  5
negative
positive
combine
running for z-score:  6
negative
positive
combine
running for z-score:  7
negative
positive
combine
running for z-score:  8
negative
positive
combine
real sn1 XX 8
running for z-score:  3
negative
positive
combine
running for z-score:  4
negative
positive
combine
running for z-score:  5
negative
positive
combine
running for z-score:  6
negative
positive
combine
running for z-score:  7
negative
positive
combine
running for z-score:  8
negative
positive
combine
real sn1 YY 2
running for z-score:  3
negative
positive
combine
running for z-score:  4
negative
positive
combine
running for z-score:  5
negative
positive
combine
running for z-score:  6
negative
positive
combine
running for z-score:  7
negative
positive
combine
running for z-score:  8
negative
positive
combine
real sn1 YY 8
running for z-score:  3
negative
positive
co

In [None]:
# Show the rows for the 'real' data type with the 'negative' threshold.
is_real = df_stats['data_type'] == 'real'
is_negative = df_stats['threshold_type'] == 'negative'
df_stats[is_real & is_negative]

---

#### Z-score data

In [7]:
# Load one z-score file: first noise level (sn[0]), second integration
# (integration[1]), third data type (data_type[2]), first day and grid point.
z_score_path = results_directory + "%s/%ss/all_z_scores_data_day_%s_grid_%s_integration_%s_%s.parquet" %(sn[0], integration[1], number_of_days[0], gn[0], integration[1], data_type[2])
df_z_score = pd.read_parquet(z_score_path)

# One row per element of the list-valued columns.
df_z_score = df_z_score.explode(["data", "z_score", "obs_id"]).reset_index(drop=True)

keep_polarization = df_z_score['polarization'].isin([0,1])
df_z_score = df_z_score[keep_polarization].reset_index(drop=True)

# Label every row with a timeblock 0..13; the tiled length must equal the
# number of rows or the assignment raises.
obs_in_grid = len(df[(df['partition'] == number_of_days[0]) & (df['gridpoint_number'] == gn[0])])
tile_repeats = number_of_tile*number_of_frequency*len(polar)*obs_in_grid
df_z_score['timeblock'] = np.tile(np.arange(0,14,1), tile_repeats)

In [11]:
# Keep the samples whose z-score lies outside [-5, 5].
beyond_five = (df_z_score['z_score'] < -5) | (df_z_score['z_score'] > 5)
df_z_5 = df_z_score[beyond_five].reset_index(drop=True)

# Key each sample by (obs_id, timeblock, frequency) and keep the last row per key.
df_z_5['rfi_key'] = df_z_5.apply(
    lambda row: df_key(row['obs_id'], row['timeblock'], row['frequency']),
    axis=1,
)
df_z_5 = df_z_5.drop_duplicates(subset=['rfi_key'], keep='last').reset_index(drop=True)

# Constant marker column, used later as the mapped value for detected keys.
df_z_5['alias'] = 1

In [None]:
# Show the retained detections with a positive z-score.
has_positive_z = df_z_5['z_score'] > 0
df_z_5[has_positive_z]

In [None]:
# Display the de-duplicated detections with z-score outside [-5, 5].
df_z_5

---

#### Added RFI data

In [15]:
# Load the injected-RFI table of the first noise level (sn[0]).
added_rfi_path = added_rfi_directory + "%s/add_rfi_all_obs_location_multiple_rfi_fix_thermal_%s.txt" %(sn[0], sn[0])
df_added_rfi = pd.read_csv(added_rfi_path)

# Remove any '[' / ']' characters from pol_1, then cast it to int.
square_brackets = str.maketrans('', '', '[]')
df_added_rfi['pol_1'] = (
    df_added_rfi['pol_1']
    .apply(lambda value: str(value).translate(square_brackets))
    .astype(int)
)

# Key each entry by (obs_id, timeblock, channel) and keep the last row per key.
df_added_rfi['rfi_key'] = df_added_rfi.apply(
    lambda row: df_key(row['obs_id'], row['timeblock'], row['channel']),
    axis=1,
)
df_added_rfi = df_added_rfi.drop_duplicates(subset=['rfi_key'], keep='last').reset_index(drop=True)

# Constant marker column, used later as the mapped value for injected keys.
df_added_rfi['alias'] = 1

In [None]:
# Display the injected-RFI table, including its rfi_key and alias columns.
df_added_rfi

---

#### Default dataframe

In [21]:
# Build the full (obs_id, timeblock, frequency) grid and mark which cells
# carry injected RFI and which were detected.
#
# NOTE(review): this rebinds `df`, which until here held the observation
# table; cells that read df['partition'] or df['gridpoint_number'] fail if
# re-run after this one.
# NOTE(review): the grid uses 56 timeblocks while df_z_score was labelled
# with np.arange(0, 14); confirm the detected keys are meant to be matched
# at this resolution.

data_A = np.arange(0, 56)    # timeblock indices
data_B = np.arange(0, 768)   # frequency indices

n_obs, n_time, n_freq = len(obs_list), len(data_A), len(data_B)

# Cartesian product: obs_id varies slowest, frequency fastest.
df = pd.DataFrame({
    'obs_id': np.repeat(obs_list, n_time * n_freq),
    'timeblocks': np.tile(np.repeat(data_A, n_freq), n_obs),
    'freqs': np.tile(data_B, n_obs * n_time),
})

df['rfi_key'] = df.apply(
    lambda row: df_key(row['obs_id'], row['timeblocks'], row['freqs']),
    axis=1,
)

# Keys found in a lookup map to its alias (1); all others become NaN and
# are then filled with 0.
added_lookup = df_added_rfi.set_index('rfi_key')['alias']
detected_lookup = df_z_5.set_index('rfi_key')['alias']
df['added_rfi'] = df['rfi_key'].map(added_lookup)
df['detected_rfi'] = df['rfi_key'].map(detected_lookup)
df = df.fillna(0)

In [23]:
 # Calculate all metrics
# The flags hold 0.0 / 1.0 after fillna; cast them to plain integer labels.
y_true = df['added_rfi'].astype(int)
y_pred = df['detected_rfi'].astype(int)

# labels=[0, 1] keeps the matrix 2x2 even when one class is absent from both
# columns; without it the matrix shrinks and the lookups below raise IndexError.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Rows are the true class and columns the predicted class, so the flattened
# order is TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

In [None]:
# Print each metric on its own line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy ", accuracy),
    ("precision ", precision),
    ("recall ", recall),
    ("f1 ", f1),
    ("TN ", TN),
    ("FP ", FP),
    ("FN ", FN),
    ("TP ", TP),
):
    print(metric_label, metric_value)

In [None]:
# Print each metric on its own line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy ", accuracy),
    ("precision ", precision),
    ("recall ", recall),
    ("f1 ", f1),
    ("TN ", TN),
    ("FP ", FP),
    ("FN ", FN),
    ("TP ", TP),
):
    print(metric_label, metric_value)

In [None]:
# Print each metric on its own line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy ", accuracy),
    ("precision ", precision),
    ("recall ", recall),
    ("f1 ", f1),
    ("TN ", TN),
    ("FP ", FP),
    ("FN ", FN),
    ("TP ", TP),
):
    print(metric_label, metric_value)