# Build dataset from GOES flares & raw HEK event files

Import relevant modules

In [85]:
import pandas as pd
import numpy as np
from shapely import wkt
from sunpy.time import parse_time
import datetime
from shapely.geometry import Polygon, Point 
# from sunpy.physics.differential_rotation import rot_hpc
from astropy import units as u
from shapely import wkt
import os
import math
import scipy.stats as ss
import scipy
import matplotlib.pyplot as plt


Find Better Peak and Start Times

In [None]:
# Flag every GOES event for which a light-curve file exists on disk.
full = pd.read_csv('prepped_GOES4.csv', delimiter=',', header=0)
sols = full['SOL_standard']
# Light curves are identified by their file name minus the 4-character
# extension; hidden files (leading '.') are ignored.
lcs = []
for root, dirs, files in os.walk('goes_event_lcs2'):
    for f in files:
        if not f.startswith('.'):
            lcs.append(f[:-4])
#create a column to indicate via binary whether event has an associated lightcurve
# (one membership pass instead of comparing every event with every file, and
# no writes through a chained `.values` selection)
full['associated_lc'] = sols.isin(lcs).astype(int)
full.to_csv('prepped_GOES4_determinelc.csv', index=False)

In [None]:
def eliminate_noise_graph(t, x, nu):
    """Smooth ``x`` by discarding FFT coefficients whose index exceeds ``nu``.

    ``t`` does not take part in the computation (only the disabled
    diagnostic plot below refers to it).

    Returns a tuple ``(spectrum, smoothed)``: the truncated FFT of ``x`` and
    its inverse transform, both as returned by ``np.fft``.
    """
    spectrum = np.fft.fft(np.array(x))
    # Keep coefficients 0..nu, zero everything after them.
    spectrum[np.arange(len(spectrum)) > nu] = 0
    smoothed = np.fft.ifft(spectrum)
#     plt.figure(figsize = (10, 7))
#     plt.plot(t, smoothed, color = 'blue', marker = 'o')
#     plt.plot(t, x, 'r', marker = 'o')
#     plt.show()
    return spectrum, smoothed

In [78]:
def adjust_times(inputF, outputF):
    """Re-derive flare peak and start times from the stored GOES light curves.

    Reads the event table ``inputF``, keeps the rows with
    ``associated_lc == 1`` and, for each of them, loads
    ``goes_event_lcs2/<SOL_standard>.csv``.  The peak is the maximum of the
    ``xrsb`` column up to one minute after ``event_endtime``; the start time
    is taken where the smoothed second derivative of the early rise is
    largest.  Results go into the new columns ``adjusted_peak``,
    ``adjusted_peaktime`` and ``adjusted_starttime``; rows that cannot be
    processed keep the placeholder values set below.  The table is written
    to ``outputF``; nothing is returned.

    NOTE(review): ``pd.datetime`` and ``.ix`` only exist in older pandas
    releases, and the indexing of ``map(...)`` results below relies on
    ``map`` returning a list (Python 2 semantics).
    """
    # Parsers for the event table ('T' separator) and the light-curve files (space separator).
    dateparseT = lambda x: pd.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

    full = pd.read_csv(inputF, delimiter=',', header=0, 
                         parse_dates=['event_starttime', 'event_endtime'],date_parser=dateparseT)
    # Only events that have a light-curve file are processed.
    full = full.ix[full['associated_lc']==1]
#     full = full.ix[full['duration_sec']>=0]
    length = full.shape[0]
    # Placeholders: the literal format string marks "no adjusted time found",
    # 0 marks "no adjusted peak found" (correct_for_bad_lcs tests for both).
    full.loc[:,'adjusted_starttime'] = ['%Y-%m-%dT%H:%M:%S' for x in range(length)]
    full.loc[:,'adjusted_peaktime'] = ['%Y-%m-%dT%H:%M:%S' for x in range(length)]
    full.loc[:,'adjusted_peak'] = [0 for x in range(length)]
    # j is the positional row index matching the current SOL identifier.
    j=0
    for sol in full['SOL_standard']:
        fname = 'goes_event_lcs2/'+str(sol)+'.csv'
        lc = pd.read_csv(fname, delimiter = ',', header = 0, parse_dates=['date'],
                         date_parser=dateparse)
        # Ignore samples later than one minute after the catalogued end time.
        lc = lc.ix[lc['date']<=(full['event_endtime'].values[j]+np.timedelta64(1, 'm'))]
        # Rescale the flux by a constant factor of 10**11.
        lc['xrsb'] = lc['xrsb']*10**11
        # NOTE(review): this row already has associated_lc == 1 (see the filter
        # above), so the assignment changes nothing — presumably 0 was meant; confirm.
        if lc.shape[0]==0: full['associated_lc'].values[j] = 1
        else:
            # Peak = largest flux sample in the retained window.
            peak = np.argmax(np.array(lc['xrsb']))
            adjusted_peak = lc['xrsb'].values[peak]
            adjusted_peaktime = lc['date'].values[peak]
            full['adjusted_peaktime'].values[j] = adjusted_peaktime
            full['adjusted_peak'].values[j] = adjusted_peak
            # From here on only the rise (samples up to the peak time) is used.
            lc = lc.ix[lc['date']<=adjusted_peaktime]
            # Elapsed time since the first sample as plain numbers.
            # NOTE(review): timedelta * 1e-9 divided by 1 ns looks like seconds,
            # despite the "minutes" name — confirm.
            minutesX = ((lc['date'] - lc['date'].values[0])*10**(-9))
            minutesX = map(lambda x: x/np.timedelta64(1, 'ns'), minutesX)
            # minutesXog and duration are not read again in this function.
            minutesXog = minutesX
            duration = minutesX[-1]
            maxi = np.argmax(np.array(lc['xrsb'])) 
            softx = lc['xrsb'].values[0:maxi+1]
            # Too few samples for the 5-sample differences below; the row keeps
            # its placeholders.  NOTE(review): same no-op assignment as above.
            if len(softx)<=7:full['associated_lc'].values[j] = 1
            else:
                minutesX = minutesX[0:maxi+1]
                # Find the first sample whose rise above the first sample exceeds
                # the fraction 0.31053736899 of the total rise to the last
                # retained sample; the search for the start is limited to that region.
                searchzone=True
                endsearchidx = maxi
                for k, x in enumerate(softx):
                    if searchzone:
                        if (x-softx[0])/(softx[-1]-softx[0]) > 0.31053736899:
                            searchzone=False
                            endsearchidx = k
                # Keep a few samples beyond the search limit — presumably so the
                # 5-sample-wide differences below stay defined; confirm.
                softx = softx[:endsearchidx+6]           
                minutesX = minutesX[0:endsearchidx+6]
                # First derivative: flux difference over a 5-sample stride.
                i=0
                derivatives = []
                for time in minutesX[:-6]:
                    delta_t = minutesX[i+5]-time
                    delta_flux = softx[i+5]-softx[i]
                    derivative = delta_flux/delta_t
                    derivatives.append(derivative)
                    i+=1
                # Second derivative: difference of consecutive first derivatives,
                # divided by the same 5-sample time span.
                der2s = []
                i=0
                for time in minutesX[:-7]:
                    delta_t = minutesX[i+5]-time
                    delta_der = derivatives[i+1]-derivatives[i]
                    der2 = delta_der/delta_t
                    der2s.append(der2)
                    i+=1 
                # Smooth the second derivative (FFT coefficients above index 5 are
                # dropped) and take the position of its maximum as the flare start.
                # NOTE(review): ifoTran4 comes straight from np.fft.ifft, so argmax
                # runs on a complex array — confirm that is intended.
                foTran4, ifoTran4 = eliminate_noise_graph(minutesX[:-7], der2s, 5)
                maxidx4 = np.argmax(np.array(ifoTran4)) 
                full['adjusted_starttime'].values[j] = lc['date'].values[maxidx4]

        j+=1
#     full['adjusted_starttime'] = map(parse_time, full['adjusted_starttime'])
#     full['adjusted_peaktime'] = map(parse_time, full['adjusted_peaktime'])
    full.to_csv(outputF, index=False, date_format = '%Y-%m-%dT%X')

In [32]:
#correct for events without lightcurves 
def correct_for_bad_lcs(inputf,outputf):
    """Fill in catalogue values where no adjusted value could be derived.

    Reads ``inputf`` and looks for the markers of a failed adjustment: the
    literal string ``'%Y-%m-%dT%H:%M:%S'`` in ``adjusted_starttime`` /
    ``adjusted_peaktime`` and ``0.0`` in ``adjusted_peak``.  Marked cells are
    replaced by ``event_starttime`` / ``event_peaktime`` (with
    ``'.000000000'`` appended) and ``fl_peakflux_goes`` respectively, and
    ``associated_lc`` is reset to 0 for every row that needed any of these
    replacements.  The result is written to ``outputf``; nothing is returned.
    """
    placeholder = '%Y-%m-%dT%H:%M:%S'
    # Appended so the substituted times have the same textual form as the
    # '...%S.000000000' format that associate_AIA parses.
    suffix = '.000000000'

    f = pd.read_csv(inputf, header=0, delimiter=',')

    # Each mask is taken from the untouched input, exactly what the original
    # row-by-row checks looked at.
    no_start = f['adjusted_starttime'] == placeholder
    no_peaktime = f['adjusted_peaktime'] == placeholder
    no_peak = f['adjusted_peak'] == 0.0

    # Assign through .loc so the writes reach the frame itself instead of
    # depending on `.values` of a chained selection being a writable view.
    f.loc[no_start, 'adjusted_starttime'] = f.loc[no_start, 'event_starttime'] + suffix
    f.loc[no_peaktime, 'adjusted_peaktime'] = f.loc[no_peaktime, 'event_peaktime'] + suffix
    f.loc[no_peak, 'adjusted_peak'] = f.loc[no_peak, 'fl_peakflux_goes']
    f.loc[no_start | no_peaktime | no_peak, 'associated_lc'] = 0

    f.to_csv(outputf, index=False)


Associate AIA events

In [94]:
def associate_AIA(inputFile_aia, inputFile_goes, temp_sep, spatial_sep, output2file=False, out_file = None):
    """Attach matching AIA flare detections to every GOES flare.

    An AIA event is a candidate for a GOES flare when its start time lies
    within ``temp_sep`` minutes of the flare's ``adjusted_starttime`` and its
    distance from the flare position is at most ``spatial_sep`` (same units
    as the ``hpc_x`` / ``hpc_y`` columns).  Every candidate is handed to
    ``determine_wavelengths``, which records it in the per-channel columns.

    Parameters
    ----------
    inputFile_aia : path of the AIA event CSV; times formatted as
        ``%Y-%m-%dT%H:%M:%S``.  Events starting before 2011-03-01 are dropped.
    inputFile_goes : path of the GOES event CSV; ``adjusted_starttime`` and
        ``adjusted_peaktime`` formatted as ``%Y-%m-%dT%H:%M:%S.000000000``.
    temp_sep : half-width of the start-time window, in minutes.
    spatial_sep : largest accepted separation between the two positions.
    output2file : write the result to CSV when true.
    out_file : output path; when omitted a name is derived from
        ``inputFile_goes``.

    Returns
    -------
    The GOES DataFrame with the association columns added.
    """
    #set solar radius
    r = 966
    #channels whose peak fluxes are summed, in the order the sum is accumulated
    channels = ('131', '171', '193', '211', '304', '335', '94')

    #import a record of flare events as a DataFrame
    aia = pd.read_csv(inputFile_aia, delimiter=',', header=0)
    for col in ('event_starttime', 'event_endtime', 'event_peaktime'):
        aia[col] = pd.to_datetime(aia[col], format='%Y-%m-%dT%H:%M:%S')
    aia = aia.loc[aia['event_starttime'] >= datetime.datetime(2011, 3, 1)]
    aia = aia.rename(columns={'SOL_standard':'AIA_SOL_standard','event_starttime':'starttime_aia',
                              'event_endtime':'endtime_aia', 'event_peaktime':'peaktime_aia',
                              'hpc_x':'hpc_x_aia', 'hpc_y':'hpc_y_aia'})

    #import a record of GOES events as a DataFrame
    goes = pd.read_csv(inputFile_goes, delimiter=',', header=0)
    for col in ('adjusted_starttime', 'adjusted_peaktime'):
        goes[col] = pd.to_datetime(goes[col], format='%Y-%m-%dT%H:%M:%S.000000000')
    #how many x-ray flare events working with
    length = goes.shape[0]

    #create the tracking columns: 0 for the keywords listed in keywords2_AIA.csv,
    #a 19-character zero string for the associated_* identifiers, None for the
    #keywords listed in keywords2_aia_none.csv
    aia_keywords = np.genfromtxt('keywords2_AIA.csv', delimiter=',', dtype=str)
    aia_keywords_none = np.genfromtxt('keywords2_aia_none.csv', delimiter=',', dtype=str)
    aia_keywords_L = ['associated_131','associated_171','associated_193','associated_211','associated_304',
                      'associated_335','associated_94']
    for elem in aia_keywords:
        goes.loc[:, elem] = [0] * length
    for elem in aia_keywords_L:
        goes.loc[:, elem] = ['0000000000000000000'] * length
    for elem in aia_keywords_none:
        goes.loc[:, elem] = [None] * length

    #bounds of the start-time search window of every GOES event
    window = datetime.timedelta(minutes=temp_sep)
    start_set = goes['adjusted_starttime'] - window
    end_set = goes['adjusted_starttime'] + window

    for i in range(length):
        #print which flare event function is currently processing, so the user has an idea of how much longer
        #program will need to run
        if (i+1)%10 == 0:
            print('%d / %d events' % (i+1, length))
        #begin eliminating AIA events based on temporal parameters
        in_window = (aia['starttime_aia'] >= start_set.iloc[i]) & (aia['starttime_aia'] <= end_set.iloc[i])
        aia_search = aia.loc[in_window]
        #as long as the temporal search does not eliminate all possible related AIA events, proceed
        if aia_search.empty:
            continue
        goes_point = Point((goes['hpc_x'].values[i], goes['hpc_y'].values[i]))
        for j in range(aia_search.shape[0]):
            #create a point object from AIA flare's location
            aia_point = Point((aia_search['hpc_x_aia'].values[j], aia_search['hpc_y_aia'].values[j]))
            #straight-line distance between the AIA flare's and GOES flare's coordinates
            chord = goes_point.distance(aia_point)
            #corresponding distance along the sun's curved surface, same radius for all events
            #(arcsin gives nan once chord > r, so such pairs never pass the test below)
            s = r*np.arcsin(chord/r)
            #determine whether the spatial distance between AIA and GOES flares meets the set parameter
            if s <= spatial_sep:
                #have found an associated AIA flare, record relevant info
                goes = determine_wavelengths(goes, aia_search, i, j)
        #only events with at least one temporal candidate receive a sum; the
        #other rows are left as pandas fills them in
        goes.loc[i, 'sum_peakflux'] = sum(goes.loc[i, 'fl_peakflux_' + channel] for channel in channels)

    #write dataframe to a csv file depending on initial parameters
    if output2file:
        if out_file is None:
            #create a generic name for file based on the GOES input if no file name specified
            #(this used to reference the undefined name inputFile_fl)
            out_file = inputFile_goes[0:-4]+'_with_AIA.csv'
        #write to csv
        goes.to_csv(path_or_buf=out_file, index = False, date_format = '%Y-%m-%dT%X')

    return goes

In [58]:
def determine_wavelengths(df, df2, index, index2):
    """Record AIA event ``index2`` of ``df2`` on GOES event ``index`` of ``df``.

    Both indices are positional.  The AIA event's ``obs_channelid`` selects
    the per-channel columns (``is_<ch>``, ``associated_<ch>``,
    ``fl_peakflux_<ch>``, ``starttime_<ch>``, ``peaktime_<ch>``,
    ``endtime_<ch>``, ``hpc_x_<ch>``, ``hpc_y_<ch>``).  The event is stored
    when the channel has no association yet (``is_<ch> == 0``) or when its
    ``fl_peakflux`` is larger than the one already stored; otherwise nothing
    changes.  Events from channels other than 131, 171, 193, 211, 304, 335
    and 94 are ignored.

    ``df`` is modified in place and also returned.
    """
    supported_channels = (131, 171, 193, 211, 304, 335, 94)
    # target column prefix in df -> source column in df2
    copied_fields = (
        ('associated', 'AIA_SOL_standard'),
        ('fl_peakflux', 'fl_peakflux'),
        ('starttime', 'starttime_aia'),
        ('peaktime', 'peaktime_aia'),
        ('endtime', 'endtime_aia'),
        ('hpc_x', 'hpc_x_aia'),
        ('hpc_y', 'hpc_y_aia'),
    )

    channel = df2['obs_channelid'].values[index2]
    if channel not in supported_channels:
        return df
    tag = '%d' % channel

    def store(column, value):
        # Write straight into the column's array, as the original code did.
        # When pandas hands out a read-only (copy-on-write) or non-NumPy
        # array instead, fall back to positional assignment on the frame.
        cells = df[column].values
        flags = getattr(cells, 'flags', None)
        if flags is not None and flags.writeable:
            cells[index] = value
        else:
            df.iloc[index, df.columns.get_loc(column)] = value

    candidate_flux = df2['fl_peakflux'].values[index2]
    first_match = df['is_' + tag].values[index] == 0
    if first_match or df['fl_peakflux_' + tag].values[index] < candidate_flux:
        if first_match:
            store('is_' + tag, 1)
        for prefix, source in copied_fields:
            store(prefix + '_' + tag, df2[source].values[index2])

    return df

In [110]:
# Match the AIA detections in raw_detective.csv against the GOES events in
# prepped_GOES7_adjustedtimes.csv using temp_sep=7 (minutes) and
# spatial_sep=200, and write the combined table to goes_aia7.csv.
associate_AIA('raw_detective.csv', 'prepped_GOES7_adjustedtimes.csv', 7, 200, output2file=True, 
              out_file = 'goes_aia7.csv')

10 / 7493 events
20 / 7493 events
30 / 7493 events
40 / 7493 events
50 / 7493 events
60 / 7493 events
70 / 7493 events
80 / 7493 events
90 / 7493 events
100 / 7493 events
110 / 7493 events
120 / 7493 events
130 / 7493 events
140 / 7493 events
150 / 7493 events
160 / 7493 events
170 / 7493 events
180 / 7493 events
190 / 7493 events
200 / 7493 events
210 / 7493 events
220 / 7493 events
230 / 7493 events
240 / 7493 events
250 / 7493 events
260 / 7493 events
270 / 7493 events
280 / 7493 events
290 / 7493 events
300 / 7493 events
310 / 7493 events
320 / 7493 events
330 / 7493 events
340 / 7493 events
350 / 7493 events
360 / 7493 events
370 / 7493 events
380 / 7493 events
390 / 7493 events
400 / 7493 events
410 / 7493 events
420 / 7493 events
430 / 7493 events
440 / 7493 events
450 / 7493 events
460 / 7493 events
470 / 7493 events
480 / 7493 events
490 / 7493 events
500 / 7493 events
510 / 7493 events
520 / 7493 events
530 / 7493 events
540 / 7493 events
550 / 7493 events
560 / 7493 events
5

Unnamed: 0,SOL_standard,event_starttime,event_endtime,event_peaktime,fl_goescls,hpc_bbox,hpc_radius,hpc_x,hpc_y,ar_noaanum,...,endtime_94,hpc_x_94,hpc_y_94,associated_131,associated_171,associated_193,associated_211,associated_304,associated_335,associated_94
0,SOL2011-03-01T14:35:00L160C064,2011-03-01T14:35:00,2011-03-01T14:40:00,2011-03-01T14:38:00,C1.1,"POLYGON((16.96746 138.9924,16.96746 138.9924,1...",676.817633,-436.41540,517.323600,1164,...,,,,0000000000000000000,SOL2011-03-01T14:35:13L161C066,SOL2011-03-01T14:35:20L161C066,SOL2011-03-01T14:35:26L161C066,SOL2011-03-01T14:35:21L161C066,SOL2011-03-01T14:35:29L161C066,0000000000000000000
1,SOL2011-03-01T17:42:00L167C066,2011-03-01T17:42:00,2011-03-01T17:54:00,2011-03-01T17:47:00,C1.3,"POLYGON((16.96692 139.0128,16.96692 139.0128,1...",589.308377,-318.03480,496.123200,1164,...,,,,0000000000000000000,SOL2011-03-01T17:43:49L160C066,SOL2011-03-01T17:43:56L165C066,SOL2011-03-01T17:43:50L160C066,SOL2011-03-01T17:43:45L165C066,0000000000000000000,0000000000000000000
2,SOL2011-03-01T18:06:00L154C062,2011-03-01T18:06:00,2011-03-01T18:26:00,2011-03-01T18:19:00,C1.0,"POLYGON((16.96686 139.0158,16.96686 139.0158,1...",723.181677,-479.34480,541.498200,0,...,,,,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000
3,SOL2011-03-01T18:51:00L165C066,2011-03-01T18:51:00,2011-03-01T18:58:00,2011-03-01T18:55:00,B9.8,"POLYGON((16.96674 139.0206,16.96674 139.0206,1...",604.061049,-346.73160,494.638200,1164,...,,,,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000
4,SOL2011-03-02T10:02:00L164C066,2011-03-02T10:02:00,2011-03-02T10:10:00,2011-03-02T10:06:00,B8.8,"POLYGON((16.96404 139.1112,16.96404 139.1112,1...",550.064395,-229.68240,499.816800,1164,...,,,,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000
5,SOL2011-03-02T13:11:00L163C066,2011-03-02T13:11:00,2011-03-02T13:21:00,2011-03-02T13:18:00,C1.4,"POLYGON((16.96344 139.128,16.96344 139.128,16....",544.430717,-214.68420,500.315400,1164,...,,,,0000000000000000000,0000000000000000000,SOL2011-03-02T13:15:32L165C067,SOL2011-03-02T13:15:26L165C067,SOL2011-03-02T13:13:45L165C067,SOL2011-03-02T13:15:29L165C067,0000000000000000000
6,SOL2011-03-03T11:29:00L086C083,2011-03-03T11:29:00,2011-03-03T11:43:00,2011-03-03T11:34:00,C1.2,"POLYGON((16.95948 139.2294,16.95948 139.2294,1...",953.656541,-943.30200,140.150400,1166,...,,,,SOL2011-03-03T11:29:35L079C084,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000
7,SOL2011-03-03T14:12:00L163C068,2011-03-03T14:12:00,2011-03-03T14:20:00,2011-03-03T14:16:00,C5.4,"POLYGON((16.959 139.2396,16.959 139.2396,16.95...",474.685097,-15.71784,474.424800,1164,...,2011-03-03T14:34:03.000000000,-38.4,499.2,SOL2011-03-03T14:11:11L161C067,0000000000000000000,SOL2011-03-03T14:13:20L161C067,SOL2011-03-03T14:13:26L161C067,SOL2011-03-03T14:14:57L161C067,SOL2011-03-03T14:15:05L161C067,SOL2011-03-03T14:17:51L161C067
8,SOL2011-03-03T19:24:00L162C068,2011-03-03T19:24:00,2011-03-03T19:44:00,2011-03-03T19:36:00,C1.2,"POLYGON((16.9581 139.2576,16.9581 139.2576,16....",474.681471,15.71700,474.421200,1164,...,,,,SOL2011-03-03T19:28:47L163C067,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000,0000000000000000000
9,SOL2011-03-03T23:24:00L165C069,2011-03-03T23:24:00,2011-03-03T23:46:00,2011-03-03T23:34:00,C2.0,"POLYGON((16.95738 139.2702,16.95738 139.2702,1...",468.638565,94.78140,458.953800,1164,...,,,,0000000000000000000,SOL2011-03-03T23:26:49L166C067,SOL2011-03-03T23:26:56L166C067,SOL2011-03-03T23:24:26L161C067,0000000000000000000,0000000000000000000,0000000000000000000


In [29]:
#choose most accurate location for flare (assumes AIA is more accurate)
#NOTE: the location-selection logic below is disabled; only the distance column is added.
goes = pd.read_csv('GOES_AIA4.csv', delimiter=',', header=0)
# goes = goes.rename(columns={'hpc_x':'hpc_x_goes', 'hpc_y':'hpc_y_goes'})
# length = goes.shape[0]
# nones = [None for i in range(length)]
# zeroes = [0 for i in range(length)]
# goes.loc[:, 'hpc_x'] = zeroes
# goes.loc[:, 'hpc_y'] = zeroes
# for i in range(length):
#     dic = {'wavelength': [94, 131, 171, 193, 211, 304, 335], 'peakflux':[goes['fl_peakflux_94'].values[i],
#                                                                          goes['fl_peakflux_131'].values[i],
#                                                                          goes['fl_peakflux_171'].values[i],
#                                                                          goes['fl_peakflux_193'].values[i],
#                                                                          goes['fl_peakflux_211'].values[i],
#                                                                          goes['fl_peakflux_304'].values[i],
#                                                                          goes['fl_peakflux_335'].values[i]]}
#     fluxes = pd.DataFrame(dic)
#     fluxes.sort_values(by='peakflux', ascending=False, inplace=True)
#     if fluxes['peakflux'].values[0]== 0:
#         goes['hpc_x'].values[i] = goes['hpc_x_goes'].values[i]
#         goes['hpc_y'].values[i] = goes['hpc_y_goes'].values[i]
#     else:
#         lmda = str(fluxes['wavelength'].values[0])
#         colX = 'hpc_x_' + lmda
#         colY = 'hpc_y_' + lmda
#         goes['hpc_x'].values[i] = goes[colX].values[i]
#         goes['hpc_y'].values[i] = goes[colY].values[i]

#add distance to center column
#(built as a real list, so the assignment does not depend on map() returning one)
dist_frm_center = [find_dist_frm_center(x, y) for x, y in zip(goes['hpc_x'], goes['hpc_y'])]
goes.loc[:, 'dist_frm_center'] = dist_frm_center

goes.to_csv('goes_aia4_moreinfo.csv', index = False, date_format = '%Y-%m-%dT%X')


In [8]:
#find the distance from center of disk from x, y coordinates (unit = arcsec)
def find_dist_frm_center(x, y, r=966):
    """Return the straight-line distance from disk centre to the point (x, y).

    The point is lifted onto a sphere of radius ``r`` with
    ``z = sqrt(|r**2 - x**2 - y**2|)``; the absolute value keeps the square
    root real for points outside the disk.  The distance is then measured
    to the disk centre at ``(0, 0, r)``.

    ``x`` and ``y`` are in the same units as ``r``.  ``r`` defaults to the
    solar radius used throughout this file (966).  The expression contains
    no branching, so NumPy arrays or pandas Series are accepted as well as
    scalars.
    """
    z = abs(r**2 - x**2 - y**2) ** 0.5
    return (x**2 + y**2 + (z - r)**2) ** 0.5

Associate AR

In [9]:
#rotate polygon given for AR to flare event time
def rotate_ar2(polygon, start_time, end_time):
    """Return ``polygon`` with its vertices rotated from start_time to end_time.

    The exterior ring of ``polygon`` is read as (x, y) pairs in arcseconds,
    each vertex is passed through ``rot_hpc(x, y, start_time, end_time)`` and
    a new polygon is built from all rotated vertices (the original rebuilt
    it from exactly four, which only suits rectangular boxes).

    NOTE(review): ``rot_hpc`` is not imported in this file — the
    ``sunpy.physics.differential_rotation`` import at the top is commented
    out — so the name has to be provided by the surrounding session.
    """
    #the last exterior coordinate repeats the first one, so leave it out
    poly_coords = np.asarray(list(polygon.exterior.coords)[0:-1]) * u.arcsec
    rotated_x, rotated_y = rot_hpc(poly_coords[:, 0], poly_coords[:, 1], start_time, end_time)
    return Polygon(list(zip(rotated_x.value, rotated_y.value)))

In [27]:
def associate_ar(inputFile_fl, inputFile_ar, ar_folder, output2file=False, out_file = None):
    """Associate each flare with the active region (AR) whose bounding box contains it.

    For every flare, the ARs whose lifetime (``start_time``..``end_time`` in
    the AR list) covers the flare's start and end are looked up. For each such
    AR, the first record of its per-AR file whose time window brackets the
    flare start is taken, its bounding box is rotated with ``rotate_ar2``, and
    the flare is associated with the AR if the flare position falls inside the
    rotated box. If several ARs match, the last one processed wins.

    Parameters
    ----------
    inputFile_fl : str
        CSV of flare events; must provide ``event_starttime``,
        ``event_endtime``, ``hpc_x`` and ``hpc_y``.
    inputFile_ar : str
        CSV listing ARs; must provide ``file_name`` and the
        ``start_/end_`` ``year``/``month``/``day`` columns.
    ar_folder : str
        Folder containing the per-AR CSV files named in ``file_name``.
    output2file : bool, optional
        When true, the resulting table is written to ``out_file``.
    out_file : str or None, optional
        Output path; defaults to ``inputFile_fl`` with its last four
        characters replaced by ``'_AR.csv'``.

    Returns
    -------
    pandas.DataFrame
        The flare table with ``associated_ar``, one column per keyword listed
        in ``keywords_ar_append_fl.csv`` (read from the working directory),
        and the 0/1 flag ``is_ar``.

    Notes
    -----
    Rows are addressed by label with ``.loc[i, ...]``, which relies on the
    default 0..n-1 index produced by ``read_csv``.
    """
    #placeholder stored in 'associated_ar' while no AR has been matched
    no_ar = '0000000000000000000'

    #import the flare events and the list of ar events as DataFrames
    flare_set = pd.read_csv(inputFile_fl, delimiter=',', header=0)
    ar_list = pd.read_csv(inputFile_ar, delimiter=',', header=0)
    #how many flare events working with
    length = flare_set.shape[0]

    #create the tracking column and one zero-filled column per ar keyword
    flare_set.loc[:, 'associated_ar'] = [no_ar for _ in range(length)]
    ar_keywords = list(np.genfromtxt('keywords_ar_append_fl.csv', delimiter=',', dtype=str))
    for keyword in ar_keywords:
        flare_set.loc[:, keyword] = [0 for _ in range(length)]

    #convert start and end times to datetime objects (explicit lists, so this
    #does not depend on map() returning a list)
    flare_set['event_starttime'] = [parse_time(t) for t in flare_set['event_starttime']]
    flare_set['event_endtime'] = [parse_time(t) for t in flare_set['event_endtime']]
    ar_list['start_time'] = [datetime.datetime(y, m, d) for y, m, d in
                             zip(ar_list['start_year'], ar_list['start_month'], ar_list['start_day'])]
    ar_list['end_time'] = [datetime.datetime(y, m, d) for y, m, d in
                           zip(ar_list['end_year'], ar_list['end_month'], ar_list['end_day'])]

    #parsed per-AR files, keyed by file name, so each file is read only once
    #instead of once per flare
    ar_tracks = {}

    #iterating the column yields timestamp objects (handed to rotate_ar2), while
    #.values[i] yields the raw numpy datetimes used for the comparisons below
    for i, flare_start in enumerate(flare_set['event_starttime']):
        #print progress so the user has an idea of how much longer the program will run
        if (i + 1) % 25 == 0:
            print('%d / %d events' % (i + 1, length))
        start = flare_set['event_starttime'].values[i]
        end = flare_set['event_endtime'].values[i]

        #keep only the ARs whose lifetime covers the whole flare
        alive = (ar_list['start_time'] <= start) & (ar_list['end_time'] >= end)
        for f in ar_list.loc[alive, 'file_name']:
            if f not in ar_tracks:
                track = pd.read_csv(os.path.join(ar_folder, f), delimiter=',', header=0)
                track = track.rename(columns={'hpc_bbox': 'ar_hpc_bbox'})
                track['event_starttime'] = [parse_time(t) for t in track['event_starttime']]
                track['event_endtime'] = [parse_time(t) for t in track['event_endtime']]
                ar_tracks[f] = track
            specific_ar = ar_tracks[f]

            rec_starts = specific_ar['event_starttime'].values
            rec_ends = specific_ar['event_endtime'].values
            for j in range(len(specific_ar)):
                if not (rec_starts[j] <= start and rec_ends[j] >= start):
                    continue
                #first record bracketing the flare start: test it, then stop
                #scanning this AR (the original wrote `found_time_match == True`,
                #a no-op comparison, so the scan never stopped)
                fl_point = Point((flare_set['hpc_x'].values[i], flare_set['hpc_y'].values[i]))
                ar_poly_og = wkt.loads(specific_ar['ar_hpc_bbox'].values[j])
                #skip boxes with any vertex at |x| > 800 (treated as on the limb)
                on_limb = any(abs(pt[0]) > 800 for pt in ar_poly_og.exterior.coords)
                if not on_limb:
                    #NOTE(review): the box is rotated over the interval from the
                    #flare start to this AR record's end time, exactly as before;
                    #confirm that this is the intended rotation interval
                    ar_end = specific_ar.loc[j, 'event_endtime']
                    ar_poly = rotate_ar2(ar_poly_og, flare_start, ar_end)
                    if fl_point.intersects(ar_poly):
                        flare_set.loc[i, 'associated_ar'] = str(specific_ar.loc[j, 'SOL_standard'])
                        for keyword in ar_keywords:
                            flare_set.loc[i, keyword] = specific_ar[keyword].values[j]
                break

    #create boolean var to easily determine whether flare associated with an AR
    flare_set.loc[:, 'is_ar'] = [int(tag != no_ar) for tag in flare_set['associated_ar']]

    #write dataframe to a csv file depending on initial parameters
    if output2file:
        if out_file is None:
            #create a generic name for file if no file name specified
            out_file = inputFile_fl[0:-4] + '_AR.csv'
        flare_set.to_csv(path_or_buf=out_file, index=False, date_format='%Y-%m-%dT%X')

    return flare_set

In [30]:
#run the AR association on the flare table; output2file=True with out_file=None
#makes associate_ar save the result as 'goes_aia4_moreinfo_AR.csv'
associate_ar('goes_aia4_moreinfo.csv', 'ar4_list.csv', 'AR4', output2file=True, out_file = None)

25 / 7311 events
50 / 7311 events
75 / 7311 events
100 / 7311 events
125 / 7311 events
150 / 7311 events
175 / 7311 events
200 / 7311 events
225 / 7311 events
250 / 7311 events
275 / 7311 events
300 / 7311 events
325 / 7311 events
350 / 7311 events
375 / 7311 events
400 / 7311 events
425 / 7311 events
450 / 7311 events
475 / 7311 events
500 / 7311 events
525 / 7311 events
550 / 7311 events
575 / 7311 events
600 / 7311 events
625 / 7311 events
650 / 7311 events
675 / 7311 events
700 / 7311 events
725 / 7311 events
750 / 7311 events
775 / 7311 events
800 / 7311 events
825 / 7311 events
850 / 7311 events
875 / 7311 events
900 / 7311 events
925 / 7311 events
950 / 7311 events
975 / 7311 events
1000 / 7311 events
1025 / 7311 events
1050 / 7311 events
1075 / 7311 events
1100 / 7311 events
1125 / 7311 events
1150 / 7311 events
1175 / 7311 events
1200 / 7311 events
1225 / 7311 events
1250 / 7311 events
1275 / 7311 events
1300 / 7311 events
1325 / 7311 events
1350 / 7311 events
1375 / 7311 even

Unnamed: 0,SOL_standard,event_starttime,event_endtime,event_peaktime,fl_goescls,hpc_bbox,hpc_radius,hpc_x,hpc_y,ar_noaanum,...,meanenergydensityunit,totalenergydensityunit,totalphotoenergy,totalphotoenergyunit,unsignedflux,magfluxunit,highsheararea,highshearareaunit,ar_hpc_bbox,is_ar
0,SOL2011-03-01T14:35:00L160C064,2011-03-01 14:35:00,2011-03-01 14:40:00,2011-03-01T14:38:00,C1.1,"POLYGON((16.96746 138.9924,16.96746 138.9924,1...",676.817633,-436.415400,517.323600,11164.0,...,ergs per cubic centimeter,ergs/cm,2.472987e+31,ergs,39021.238281,emx,3.096081e+11,km2,"POLYGON((-615.09 406.281,-297.186 430.29,-259....",1
1,SOL2011-03-01T17:42:00L167C066,2011-03-01 17:42:00,2011-03-01 17:54:00,2011-03-01T17:47:00,C1.3,"POLYGON((16.96692 139.0128,16.96692 139.0128,1...",589.308377,-318.034800,496.123200,11164.0,...,ergs per cubic centimeter,ergs/cm,2.639407e+31,ergs,41377.000000,emx,3.347602e+11,km2,"POLYGON((-590.6868 407.2458,-256.6296 430.1472...",1
2,SOL2011-03-01T18:06:00L154C062,2011-03-01 18:06:00,2011-03-01 18:26:00,2011-03-01T18:19:00,C1.0,"POLYGON((16.96686 139.0158,16.96686 139.0158,1...",723.181677,-479.344800,541.498200,11164.0,...,ergs per cubic centimeter,ergs/cm,2.639407e+31,ergs,41377.000000,emx,3.347602e+11,km2,"POLYGON((-590.6868 407.2458,-256.6296 430.1472...",1
3,SOL2011-03-01T18:51:00L165C066,2011-03-01 18:51:00,2011-03-01 18:58:00,2011-03-01T18:55:00,B9.8,"POLYGON((16.96674 139.0206,16.96674 139.0206,1...",604.061049,-346.731600,494.638200,11164.0,...,ergs per cubic centimeter,ergs/cm,2.639407e+31,ergs,41377.000000,emx,3.347602e+11,km2,"POLYGON((-590.6868 407.2458,-256.6296 430.1472...",1
4,SOL2011-03-02T10:02:00L164C066,2011-03-02 10:02:00,2011-03-02 10:10:00,2011-03-02T10:06:00,B8.8,"POLYGON((16.96404 139.1112,16.96404 139.1112,1...",550.064395,-229.682400,499.816800,11164.0,...,ergs per cubic centimeter,ergs/cm,2.711475e+31,ergs,38818.859375,emx,2.955192e+11,km2,"POLYGON((-488.3472 408.9054,-110.9496 426.0066...",1
5,SOL2011-03-02T13:11:00L163C066,2011-03-02 13:11:00,2011-03-02 13:21:00,2011-03-02T13:18:00,C1.4,"POLYGON((16.96344 139.128,16.96344 139.128,16....",544.430717,-214.684200,500.315400,11164.0,...,ergs per cubic centimeter,ergs/cm,2.714137e+31,ergs,40020.449219,emx,3.519970e+11,km2,"POLYGON((-459.198 412.0074,-75.0042 427.3422,-...",1
6,SOL2011-03-03T11:29:00L086C083,2011-03-03 11:29:00,2011-03-03 11:43:00,2011-03-03T11:34:00,C1.2,"POLYGON((16.95948 139.2294,16.95948 139.2294,1...",953.656541,-943.302000,140.150400,0.0,...,0,0,0.000000e+00,0,0.000000,0,0.000000e+00,0,0,0
7,SOL2011-03-03T14:12:00L163C068,2011-03-03 14:12:00,2011-03-03 14:20:00,2011-03-03T14:16:00,C5.4,"POLYGON((16.959 139.2396,16.959 139.2396,16.95...",474.685097,-15.717840,474.424800,11164.0,...,ergs per cubic centimeter,ergs/cm,2.685431e+31,ergs,38691.890625,emx,3.773270e+11,km2,"POLYGON((-274.224 405.1992,140.8044 409.1406,1...",1
8,SOL2011-03-03T19:24:00L162C068,2011-03-03 19:24:00,2011-03-03 19:44:00,2011-03-03T19:36:00,C1.2,"POLYGON((16.9581 139.2576,16.9581 139.2576,16....",474.681471,15.717000,474.421200,11164.0,...,ergs per cubic centimeter,ergs/cm,2.641952e+31,ergs,37997.250000,emx,3.658070e+11,km2,"POLYGON((-241.8912 406.1856,175.455 408.1554,1...",1
9,SOL2011-03-03T23:24:00L165C069,2011-03-03 23:24:00,2011-03-03 23:46:00,2011-03-03T23:34:00,C2.0,"POLYGON((16.95738 139.2702,16.95738 139.2702,1...",468.638565,94.781400,458.953800,11164.0,...,ergs per cubic centimeter,ergs/cm,2.713040e+31,ergs,40701.308594,emx,3.789133e+11,km2,"POLYGON((-208.968 402.474,209.805 402.4488,176...",1


Associate Sigmoid

In [111]:
def associate_sigmoid(inputFile_fl, inputFile_sigmoid, spatial_sep, output2file=False, out_file = None):
    """Associate each flare with the closest sigmoid that is active during it.

    A sigmoid is a candidate for a flare when it starts no later than the
    flare starts and ends no earlier than the flare ends. Among the candidates
    lying within ``spatial_sep`` of the flare, the closest one is recorded.

    Parameters
    ----------
    inputFile_fl : str
        CSV of flare events; must provide ``event_starttime``,
        ``event_endtime``, ``hpc_x`` and ``hpc_y``.
    inputFile_sigmoid : str
        CSV of sigmoid events; must provide ``event_starttime``,
        ``event_endtime``, ``hpc_bbox``, ``ar_noaanum``, ``search_channelid``
        and ``SOL_standard``.
    spatial_sep : number
        Largest allowed separation between flare and sigmoid, in the same unit
        as the coordinates and the radius 966 used below.
    output2file : bool, optional
        When true, the resulting table is written to ``out_file``.
    out_file : str or None, optional
        Output path; defaults to ``inputFile_fl`` with its last four
        characters replaced by ``'_sigmoid.csv'``.

    Returns
    -------
    pandas.DataFrame
        The flare table with ``associated_sigmoid``,
        ``num_associated_sigmoid``, ``dist_sigmoid``, the renamed sigmoid
        keyword columns and the 0/1 flag ``is_sigmoid``.

    Notes
    -----
    Rows are addressed by label with ``.loc[i, ...]``, which relies on the
    default 0..n-1 index produced by ``read_csv``.
    """
    #import the flare and sigmoid events as DataFrames
    flare_set = pd.read_csv(inputFile_fl, delimiter=',', header=0)
    sigmoid_set = pd.read_csv(inputFile_sigmoid, delimiter=',', header=0)
    #set solar radius
    r = 966
    #how many flare events working with
    length = flare_set.shape[0]

    #create columns for tracking associated events; dist_sigmoid is a float
    #column so that the stored separation is not truncated to an integer
    flare_set.loc[:, 'associated_sigmoid'] = [None for _ in range(length)]
    flare_set.loc[:, 'num_associated_sigmoid'] = [0 for _ in range(length)]
    flare_set.loc[:, 'dist_sigmoid'] = [0.0 for _ in range(length)]

    #rename sigmoid columns whose names are already used by the flare table
    sigmoid_set = sigmoid_set.rename(columns={'event_starttime': 'starttime_sigmoid',
                                              'event_endtime': 'endtime_sigmoid',
                                              'hpc_bbox': 'hpc_bbox_sigmoid',
                                              'ar_noaanum': 'sigmoid_noaanum',
                                              'search_channelid': 'search_channelid_sigmoid'})
    sigmoid_keywords = ['starttime_sigmoid', 'endtime_sigmoid', 'hpc_bbox_sigmoid', 'sigmoid_noaanum',
                        'search_channelid_sigmoid']

    #create columns filled with Nones for each sigmoid keyword
    for keyword in sigmoid_keywords:
        flare_set.loc[:, keyword] = [None for _ in range(length)]

    #convert start and end times to datetime objects (explicit lists, so this
    #does not depend on map() returning a list)
    flare_set['event_starttime'] = [parse_time(t) for t in flare_set['event_starttime']]
    flare_set['event_endtime'] = [parse_time(t) for t in flare_set['event_endtime']]
    sigmoid_set['starttime_sigmoid'] = [parse_time(t) for t in sigmoid_set['starttime_sigmoid']]
    sigmoid_set['endtime_sigmoid'] = [parse_time(t) for t in sigmoid_set['endtime_sigmoid']]

    #0/1 flag per flare, set as soon as a sigmoid is recorded for it
    is_sigmoid = [0 for _ in range(length)]

    for i in range(length):
        #print progress so the user has an idea of how much longer the program will run
        if (i + 1) % 100 == 0:
            print('%d / %d events' % (i + 1, length))
        start = flare_set['event_starttime'].values[i]
        end = flare_set['event_endtime'].values[i]

        #keep only the sigmoids that span the whole flare
        spans_flare = ((sigmoid_set['starttime_sigmoid'] <= start) &
                       (sigmoid_set['endtime_sigmoid'] >= end))
        sigmoid_search = sigmoid_set.loc[spans_flare]
        if sigmoid_search.empty:
            continue

        fl_point = Point((flare_set['hpc_x'].values[i], flare_set['hpc_y'].values[i]))
        min_s = spatial_sep
        event_index = None
        num_associated_sigmoid = 0
        for j, bbox in enumerate(sigmoid_search['hpc_bbox_sigmoid'].values):
            #polygon of the j-th candidate (the original always loaded the first
            #candidate's polygon, so every candidate got the same distance)
            sigmoid_poly = wkt.loads(bbox)
            #minimum 2D distance between the sigmoid's polygon and the flare position
            chord = fl_point.distance(sigmoid_poly)
            #corresponding distance along the curved surface; assumes the same
            #radius for all events (arcsin yields nan when chord > r, which
            #fails the comparison below)
            s = r*np.arcsin(chord/r)
            if s <= spatial_sep:
                num_associated_sigmoid += 1
                #min_s starts at spatial_sep, so the first hit always sets event_index
                if s <= min_s:
                    min_s = s
                    event_index = j

        if event_index is not None:
            #record the closest sigmoid and how many were within range
            flare_set.loc[i, 'associated_sigmoid'] = sigmoid_search['SOL_standard'].values[event_index]
            flare_set.loc[i, 'num_associated_sigmoid'] = num_associated_sigmoid
            flare_set.loc[i, 'dist_sigmoid'] = min_s
            for keyword in sigmoid_keywords:
                flare_set.loc[i, keyword] = sigmoid_search[keyword].values[event_index]
            is_sigmoid[i] = 1

    #boolean var to easily determine whether flare associated with a sigmoid
    flare_set.loc[:, 'is_sigmoid'] = is_sigmoid

    #write dataframe to a csv file depending on initial parameters
    if output2file:
        if out_file is None:
            #create a generic name for file if no file name specified
            out_file = inputFile_fl[0:-4] + '_sigmoid.csv'
        flare_set.to_csv(path_or_buf=out_file, index=False, date_format='%Y-%m-%dT%X')

    return flare_set
    

In [112]:
#associate sigmoids using spatial_sep=100; output2file=True with out_file=None
#makes associate_sigmoid save the result as 'goes_aia7_AR_sigmoid.csv'
associate_sigmoid('goes_aia7_AR.csv', 'raw_sg.csv', 100, output2file=True, out_file = None)

100 / 7493 events
200 / 7493 events
300 / 7493 events
400 / 7493 events
500 / 7493 events
600 / 7493 events
700 / 7493 events
800 / 7493 events
900 / 7493 events
1000 / 7493 events
1100 / 7493 events
1200 / 7493 events
1300 / 7493 events
1400 / 7493 events
1500 / 7493 events
1600 / 7493 events
1700 / 7493 events
1800 / 7493 events
1900 / 7493 events
2000 / 7493 events
2100 / 7493 events
2200 / 7493 events
2300 / 7493 events
2400 / 7493 events
2500 / 7493 events
2600 / 7493 events
2700 / 7493 events
2800 / 7493 events
2900 / 7493 events
3000 / 7493 events
3100 / 7493 events
3200 / 7493 events
3300 / 7493 events
3400 / 7493 events
3500 / 7493 events
3600 / 7493 events
3700 / 7493 events
3800 / 7493 events
3900 / 7493 events
4000 / 7493 events
4100 / 7493 events
4200 / 7493 events
4300 / 7493 events
4400 / 7493 events
4500 / 7493 events
4600 / 7493 events
4700 / 7493 events
4800 / 7493 events
4900 / 7493 events
5000 / 7493 events
5100 / 7493 events
5200 / 7493 events
5300 / 7493 events
54

Unnamed: 0,SOL_standard,event_starttime,event_endtime,event_peaktime,fl_goescls,hpc_bbox,hpc_radius,hpc_x,hpc_y,ar_noaanum,...,is_ar,associated_sigmoid,num_associated_sigmoid,dist_sigmoid,starttime_sigmoid,endtime_sigmoid,hpc_bbox_sigmoid,sigmoid_noaanum,search_channelid_sigmoid,is_sigmoid
0,SOL2011-03-01T14:35:00L160C064,2011-03-01 14:35:00,2011-03-01 14:40:00,2011-03-01T14:38:00,C1.1,"POLYGON((16.96746 138.9924,16.96746 138.9924,1...",676.817633,-436.41540,517.323600,11164.0,...,1,,0,0,,,,,,0
1,SOL2011-03-01T17:42:00L167C066,2011-03-01 17:42:00,2011-03-01 17:54:00,2011-03-01T17:47:00,C1.3,"POLYGON((16.96692 139.0128,16.96692 139.0128,1...",589.308377,-318.03480,496.123200,11164.0,...,1,,0,0,,,,,,0
2,SOL2011-03-01T18:06:00L154C062,2011-03-01 18:06:00,2011-03-01 18:26:00,2011-03-01T18:19:00,C1.0,"POLYGON((16.96686 139.0158,16.96686 139.0158,1...",723.181677,-479.34480,541.498200,11164.0,...,1,,0,0,,,,,,0
3,SOL2011-03-01T18:51:00L165C066,2011-03-01 18:51:00,2011-03-01 18:58:00,2011-03-01T18:55:00,B9.8,"POLYGON((16.96674 139.0206,16.96674 139.0206,1...",604.061049,-346.73160,494.638200,11164.0,...,1,,0,0,,,,,,0
4,SOL2011-03-02T10:02:00L164C066,2011-03-02 10:02:00,2011-03-02 10:10:00,2011-03-02T10:06:00,B8.8,"POLYGON((16.96404 139.1112,16.96404 139.1112,1...",550.064395,-229.68240,499.816800,11164.0,...,1,,0,0,,,,,,0
5,SOL2011-03-02T13:11:00L163C066,2011-03-02 13:11:00,2011-03-02 13:21:00,2011-03-02T13:18:00,C1.4,"POLYGON((16.96344 139.128,16.96344 139.128,16....",544.430717,-214.68420,500.315400,11164.0,...,1,,0,0,,,,,,0
6,SOL2011-03-03T11:29:00L086C083,2011-03-03 11:29:00,2011-03-03 11:43:00,2011-03-03T11:34:00,C1.2,"POLYGON((16.95948 139.2294,16.95948 139.2294,1...",953.656541,-943.30200,140.150400,0.0,...,0,,0,0,,,,,,0
7,SOL2011-03-03T14:12:00L163C068,2011-03-03 14:12:00,2011-03-03 14:20:00,2011-03-03T14:16:00,C5.4,"POLYGON((16.959 139.2396,16.959 139.2396,16.95...",474.685097,-15.71784,474.424800,11164.0,...,1,,0,0,,,,,,0
8,SOL2011-03-03T19:24:00L162C068,2011-03-03 19:24:00,2011-03-03 19:44:00,2011-03-03T19:36:00,C1.2,"POLYGON((16.9581 139.2576,16.9581 139.2576,16....",474.681471,15.71700,474.421200,11164.0,...,1,,0,0,,,,,,0
9,SOL2011-03-03T23:24:00L165C069,2011-03-03 23:24:00,2011-03-03 23:46:00,2011-03-03T23:34:00,C2.0,"POLYGON((16.95738 139.2702,16.95738 139.2702,1...",468.638565,94.78140,458.953800,11164.0,...,1,,0,0,,,,,,0


Associate Eruption

In [87]:
def associate_er(inputFile_fl, inputFile_er, temporal_sep_hr, spatial_sep, output2file=False, out_file = None):
    """Associate each flare with the closest eruption that starts shortly after it.

    An eruption is a candidate for a flare when it starts at or after the
    flare's ``adjusted_starttime`` and no later than ``temporal_sep_hr`` hours
    after it. Among the candidates lying within ``spatial_sep`` of the flare,
    the closest one is recorded.

    Parameters
    ----------
    inputFile_fl : str
        CSV of flare events; must provide ``adjusted_starttime``, ``hpc_x``
        and ``hpc_y``.
    inputFile_er : str
        CSV of eruption events; must provide ``event_starttime``,
        ``hpc_bbox``, ``SOL_standard`` and, after the renaming done below,
        every column listed in ``keywords_eruption.csv``.
    temporal_sep_hr : number
        Length of the time window after the flare start, in hours.
    spatial_sep : number
        Largest allowed separation between flare and eruption, in the same
        unit as the coordinates and the radius 966 used below.
    output2file : bool, optional
        When true, the resulting table is written to ``out_file``.
    out_file : str or None, optional
        Output path; defaults to ``inputFile_fl`` with its last four
        characters replaced by ``'_er.csv'``.

    Returns
    -------
    pandas.DataFrame
        The flare table with ``associated_er``, ``num_associated_er``,
        ``dist_er``, one column per keyword listed in
        ``keywords_eruption.csv`` (read from the working directory) and the
        0/1 flag ``is_er``.

    Notes
    -----
    Rows are addressed by label with ``.loc[i, ...]``, which relies on the
    default 0..n-1 index produced by ``read_csv``.
    """
    #import the flare and eruption events as DataFrames
    flare_set = pd.read_csv(inputFile_fl, delimiter=',', header=0)
    er_set = pd.read_csv(inputFile_er, delimiter=',', header=0)

    #set solar radius
    r = 966

    #how many flare events working with
    length = flare_set.shape[0]

    #create columns for tracking associated events; dist_er is a float column
    #so that the stored separation is not truncated to an integer
    flare_set.loc[:, 'associated_er'] = [None for _ in range(length)]
    flare_set.loc[:, 'num_associated_er'] = [0 for _ in range(length)]
    flare_set.loc[:, 'dist_er'] = [0.0 for _ in range(length)]

    #get a list of eruption keywords of relevance to flare events
    er_keywords = list(np.genfromtxt('keywords_eruption.csv', delimiter=',', dtype=str))
    #rename eruption keywords already employed by the flare database
    er_set = er_set.rename(columns={'event_starttime': 'starttime_er', 'event_endtime': 'endtime_er',
                                    'hpc_bbox': 'hpc_bbox_er', 'area_raw': 'area_raw_er',
                                    'area_unit': 'area_unit_er', 'event_description': 'event_description_er',
                                    'frm_name': 'frm_name_er', 'obs_channelid': 'obs_channelid_er',
                                    'hpc_x': 'hpc_x_er', 'hpc_y': 'hpc_y_er'})

    #create columns filled with Nones for each eruption keyword
    for keyword in er_keywords:
        flare_set.loc[:, keyword] = [None for _ in range(length)]

    #convert start times to datetime objects (explicit lists, so this does not
    #depend on map() returning a list) and the window length to a timedelta
    flare_set['adjusted_starttime'] = [parse_time(t) for t in flare_set['adjusted_starttime']]
    er_set['starttime_er'] = [parse_time(t) for t in er_set['starttime_er']]
    delta_t = datetime.timedelta(hours=temporal_sep_hr)

    #0/1 flag per flare, set as soon as an eruption is recorded for it
    is_er = [0 for _ in range(length)]

    #iterate through flare start times
    for i, flare_start in enumerate(flare_set['adjusted_starttime']):
        #print progress every 100 flare events so as to not overwhelm the kernel
        if (i + 1) % 100 == 0:
            print('%d / %d events' % (i + 1, length))

        #latest time at which an associated eruption may start
        latest_start = flare_start + delta_t
        #eruptions must begin after the beginning of the flare and before the cutoff
        in_window = ((er_set['starttime_er'] >= flare_start) &
                     (er_set['starttime_er'] <= latest_start))
        er_search = er_set.loc[in_window]
        if er_search.empty:
            continue

        #create a shapely point object from the flare's coordinates
        fl_point = Point((flare_set['hpc_x'].values[i], flare_set['hpc_y'].values[i]))
        min_s = spatial_sep
        event_index = None
        num_associated_er = 0
        #check which of the properly timed eruptions are spatially relevant
        for j, bbox in enumerate(er_search['hpc_bbox_er'].values):
            er_poly = wkt.loads(bbox)
            #minimum 2D distance between the eruption's polygon and the flare position
            chord = fl_point.distance(er_poly)
            #corresponding distance along the curved surface; assumes the same
            #radius for all events (arcsin yields nan when chord > r, which
            #fails the comparison below)
            s = r*np.arcsin(chord/r)
            if s <= spatial_sep:
                num_associated_er += 1
                #only the closest of the temporally relevant eruptions is kept;
                #min_s starts at spatial_sep, so the first hit always sets event_index
                if s <= min_s:
                    min_s = s
                    event_index = j

        #record the result once, after all candidates have been examined
        if event_index is not None:
            flare_set.loc[i, 'associated_er'] = er_search['SOL_standard'].values[event_index]
            flare_set.loc[i, 'num_associated_er'] = num_associated_er
            flare_set.loc[i, 'dist_er'] = min_s
            for keyword in er_keywords:
                flare_set.loc[i, keyword] = er_search[keyword].values[event_index]
            is_er[i] = 1

    #boolean var to easily determine whether flare associated with an eruption
    flare_set.loc[:, 'is_er'] = is_er

    #write dataframe to a csv file depending on initial parameters
    if output2file:
        if out_file is None:
            #create a generic name for file if no file name specified
            out_file = inputFile_fl[0:-4] + '_er.csv'
        flare_set.to_csv(path_or_buf=out_file, index=False, date_format='%Y-%m-%dT%X')

    return flare_set

In [88]:
#associate eruptions that start within 1 hour after each flare's adjusted start,
#using spatial_sep=100; the result is saved under the explicitly given file name
associate_er('goes_aia7_AR_sigmoid.csv', 'raw_er.csv', 1, 100, output2file=True, out_file = 'goes_aia8_AR_sigmoid_er.csv')

  if self.run_code(code, result):


100 / 7493 events
200 / 7493 events
300 / 7493 events
400 / 7493 events
500 / 7493 events
600 / 7493 events
700 / 7493 events
800 / 7493 events
900 / 7493 events
1000 / 7493 events
1100 / 7493 events
1200 / 7493 events
1300 / 7493 events
1400 / 7493 events
1500 / 7493 events
1600 / 7493 events
1700 / 7493 events
1800 / 7493 events
1900 / 7493 events
2000 / 7493 events
2100 / 7493 events
2200 / 7493 events
2300 / 7493 events
2400 / 7493 events
2500 / 7493 events
2600 / 7493 events
2700 / 7493 events
2800 / 7493 events
2900 / 7493 events
3000 / 7493 events
3100 / 7493 events
3200 / 7493 events
3300 / 7493 events
3400 / 7493 events
3500 / 7493 events
3600 / 7493 events
3700 / 7493 events
3800 / 7493 events
3900 / 7493 events
4000 / 7493 events
4100 / 7493 events
4200 / 7493 events
4300 / 7493 events
4400 / 7493 events
4500 / 7493 events
4600 / 7493 events
4700 / 7493 events
4800 / 7493 events
4900 / 7493 events
5000 / 7493 events
5100 / 7493 events
5200 / 7493 events
5300 / 7493 events
54

Unnamed: 0,SOL_standard,event_starttime,event_endtime,event_peaktime,fl_goescls,hpc_bbox,hpc_radius,hpc_x,hpc_y,ar_noaanum,...,intensunit,obs_channelid_er,outflow_length,outflow_lengthunit,outflow_speed,outflow_speedunit,outflow_transspeed,outflow_width,outflow_widthunit,is_er
0,SOL2011-03-01T14:35:00L160C064,2011-03-01T14:35:00,2011-03-01T14:40:00,2011-03-01T14:38:00,C1.1,"POLYGON((16.96746 138.9924,16.96746 138.9924,1...",676.817633,-436.41540,517.323600,11164.0,...,,,,,,,,,,0
1,SOL2011-03-01T17:42:00L167C066,2011-03-01T17:42:00,2011-03-01T17:54:00,2011-03-01T17:47:00,C1.3,"POLYGON((16.96692 139.0128,16.96692 139.0128,1...",589.308377,-318.03480,496.123200,11164.0,...,,,,,,,,,,0
2,SOL2011-03-01T18:06:00L154C062,2011-03-01T18:06:00,2011-03-01T18:26:00,2011-03-01T18:19:00,C1.0,"POLYGON((16.96686 139.0158,16.96686 139.0158,1...",723.181677,-479.34480,541.498200,11164.0,...,,,,,,,,,,0
3,SOL2011-03-01T18:51:00L165C066,2011-03-01T18:51:00,2011-03-01T18:58:00,2011-03-01T18:55:00,B9.8,"POLYGON((16.96674 139.0206,16.96674 139.0206,1...",604.061049,-346.73160,494.638200,11164.0,...,,,,,,,,,,0
4,SOL2011-03-02T10:02:00L164C066,2011-03-02T10:02:00,2011-03-02T10:10:00,2011-03-02T10:06:00,B8.8,"POLYGON((16.96404 139.1112,16.96404 139.1112,1...",550.064395,-229.68240,499.816800,11164.0,...,km/sec,304,,,,,,,,1
5,SOL2011-03-02T13:11:00L163C066,2011-03-02T13:11:00,2011-03-02T13:21:00,2011-03-02T13:18:00,C1.4,"POLYGON((16.96344 139.128,16.96344 139.128,16....",544.430717,-214.68420,500.315400,11164.0,...,,,,,,,,,,0
6,SOL2011-03-03T11:29:00L086C083,2011-03-03T11:29:00,2011-03-03T11:43:00,2011-03-03T11:34:00,C1.2,"POLYGON((16.95948 139.2294,16.95948 139.2294,1...",953.656541,-943.30200,140.150400,0.0,...,,,,,,,,,,0
7,SOL2011-03-03T14:12:00L163C068,2011-03-03T14:12:00,2011-03-03T14:20:00,2011-03-03T14:16:00,C5.4,"POLYGON((16.959 139.2396,16.959 139.2396,16.95...",474.685097,-15.71784,474.424800,11164.0,...,km/s,304,4.61821e+09,cm,3.47143,km/s,5.43724,6.4131e+08,cm,1
8,SOL2011-03-03T19:24:00L162C068,2011-03-03T19:24:00,2011-03-03T19:44:00,2011-03-03T19:36:00,C1.2,"POLYGON((16.9581 139.2576,16.9581 139.2576,16....",474.681471,15.71700,474.421200,11164.0,...,km/s,304,1.05057e+10,cm,2.09741,km/s,1.42757,2.86124e+09,cm,1
9,SOL2011-03-03T23:24:00L165C069,2011-03-03T23:24:00,2011-03-03T23:46:00,2011-03-03T23:34:00,C2.0,"POLYGON((16.95738 139.2702,16.95738 139.2702,1...",468.638565,94.78140,458.953800,11164.0,...,,,,,,,,,,0


Associate EF

In [89]:
def associate_ef_goes(inputFile_fl, inputFile_ef, begin_year, begin_month, begin_day, end_year, end_month, end_day,
                      max_dist_frm_center, min_max_b_strength, min_ef_association_strength, temporal_sep_hr,
                      spatial_sep, output2file=False, out_file=None):
    """Associate each GOES flare with the preceding EF event of highest association strength.

    Flares are loaded with ``pull_sample_flare_2`` and EF events with
    ``pull_sample_ef1``. For every flare, EF events that ended within
    ``temporal_sep_hr`` hours before the flare start and whose bounding
    polygon lies within ``spatial_sep`` of the flare position are candidates.
    Each candidate is scored as::

        maxmagfieldstrength ** 2 / (distance * minutes_before_flare)

    and the best candidate is recorded on the flare row. The EF keyword
    columns listed in ``keywords_ef_append_fl.csv`` and ``associated_ef`` are
    only filled when the best score reaches ``min_ef_association_strength``.

    Parameters
    ----------
    inputFile_fl, inputFile_ef : str
        CSV files holding the flare and EF events.
    begin_year, begin_month, begin_day, end_year, end_month, end_day : int
        Calendar bounds of the search window.
    max_dist_frm_center : number
        Passed to ``pull_sample_flare_2`` (used there as a percentage).
    min_max_b_strength : number
        Minimum ``maxmagfieldstrength`` an EF event must have.
    min_ef_association_strength : number
        Score threshold for accepting an association.
    temporal_sep_hr : number
        Look-back window before each flare start, in hours.
    spatial_sep : number
        Maximum flare-to-EF distance, in the same units as ``r`` (arcsec).
    output2file : bool
        When true, the result is also written to CSV.
    out_file : str or None
        Output name; a name built from the search parameters is used if None.

    Returns
    -------
    pandas.DataFrame
        The flare table with the association columns and ``is_ef`` added.

    NOTE(review): when the time window leaves exactly one EF candidate and its
    score is below the threshold, ``num_associated_ef`` stays 0, whereas the
    multi-candidate path sets it to 1 in the equivalent situation. Left as is;
    confirm which is intended.
    """
    # convert times to datetime objects
    delta_t = datetime.timedelta(hours=temporal_sep_hr)
    begin_time = datetime.datetime(begin_year, begin_month, begin_day)
    end_time = datetime.datetime(end_year, end_month, end_day)

    # pull appropriate flare events and potentially related EF events
    flare_set = pull_sample_flare_2(inputFile_fl, begin_time, end_time, max_dist_frm_center)
    ef_set = pull_sample_ef1(inputFile_ef, begin_time, end_time, delta_t, min_max_b_strength)
    # radius of the sun in arcsec
    r = 966
    # number of flares left after the initial spatial and temporal cuts
    length = flare_set.shape[0]

    # columns tracking the associated event and the association statistics
    zeroes = [0 for _ in range(length)]
    float_zeroes = [0.0 for _ in range(length)]
    nones = [None for _ in range(length)]
    flare_set.loc[:, 'associated_ef'] = nones
    flare_set.loc[:, 'num_associated_ef'] = zeroes
    flare_set.loc[:, 'ef_association_strength'] = float_zeroes
    flare_set.loc[:, 'num_assoc_ef_over_assoc_strength'] = zeroes
    flare_set.loc[:, 'dist_ef'] = float_zeroes
    flare_set.loc[:, 'temporal_sep_ef'] = float_zeroes
    flare_set.loc[:, 'duration_ef'] = zeroes
    # EF keywords to copy onto the flare rows, one empty column each
    ef_keywords = list(np.genfromtxt('keywords_ef_append_fl.csv', delimiter=',', dtype=str))
    for keyword in ef_keywords:
        flare_set.loc[:, keyword] = nones

    # row labels, so that positional index i can be mapped to a .loc label
    idx = flare_set.index.tolist()
    # convert start times to datetime objects (list, not map, so this also works on Python 3)
    flare_set['adjusted_starttime'] = [parse_time(t) for t in flare_set['adjusted_starttime']]

    for i, flare_start in enumerate(flare_set['adjusted_starttime']):
        # progress report so the user has an idea of how much longer the run will take
        if (i + 1) % 25 == 0:
            print('%d / %d events' % (i + 1, length))
        row = idx[i]
        # EF events must end inside [flare start - delta_t, flare start]
        ef_ends = ef_set['ef_endtime']
        ef_search = ef_set.loc[(ef_ends >= flare_start - delta_t) & (ef_ends <= flare_start)].copy()
        if ef_search.empty:
            continue

        # flare position as a shapely point (HPC units)
        fl_point = Point((flare_set['hpc_x'].values[i], flare_set['hpc_y'].values[i]))

        if ef_search.shape[0] == 1:
            # single temporal candidate: test its distance directly
            ef_poly = wkt.loads(ef_search['ef_hpc_bbox'].values[0])
            # minimum 2D distance between the EF polygon and the flare position
            chord = fl_point.distance(ef_poly)
            # corresponding arc length on a sphere of radius r (same radius assumed for all events)
            s = r * np.arcsin(chord / r)
            if s <= spatial_sep:
                one_associated_ef(i, 0, flare_set, ef_search, s, ef_keywords, idx, min_ef_association_strength)
            continue

        # several temporal candidates: keep those that are close enough
        ef_search['ef_hpc_bbox'] = unicode2polygon(ef_search['ef_hpc_bbox'])
        near_positions = []
        near_dists = []
        for position, ef_poly in enumerate(ef_search['ef_hpc_bbox'].values):
            chord = fl_point.distance(ef_poly)
            s = r * np.arcsin(chord / r)
            if s <= spatial_sep:
                near_positions.append(position)
                near_dists.append(s)
        num_associated_ef = len(near_positions)
        if num_associated_ef == 0:
            continue

        if num_associated_ef == 1:
            # pass the distance of the matched candidate, not of the last one tested
            one_associated_ef(i, near_positions[0], flare_set, ef_search, near_dists[0], ef_keywords, idx,
                              min_ef_association_strength)
            flare_set.loc[row, 'num_associated_ef'] = num_associated_ef
            continue

        flare_set.loc[row, 'num_associated_ef'] = num_associated_ef
        num_over_threshold = 0
        best = None  # (position in ef_search, strength, distance, separation in minutes)
        highest_association_strength = 0
        for position, dist in zip(near_positions, near_dists):
            # a flare inside the EF polygon has zero distance; use 1 to avoid dividing by zero
            if dist == 0:
                dist = 1
            temporal_sep = pd.to_timedelta(flare_set['adjusted_starttime'].values[i]
                                           - ef_search['ef_endtime'].values[position])
            temporal_sep = temporal_sep.total_seconds() / 60
            ef_association_strength = ((ef_search['maxmagfieldstrength'].values[position] ** 2)
                                       / (dist * temporal_sep))
            if ef_association_strength >= min_ef_association_strength:
                num_over_threshold += 1
            # ">=" keeps the original tie-break: the later candidate wins
            if ef_association_strength >= highest_association_strength:
                highest_association_strength = ef_association_strength
                best = (position, ef_association_strength, dist, temporal_sep)
        flare_set.loc[row, 'num_assoc_ef_over_assoc_strength'] = num_over_threshold
        if best is None:
            # every strength was NaN; nothing meaningful to record
            continue

        best_position, best_strength, best_dist, best_sep_min = best
        flare_set.loc[row, 'dist_ef'] = best_dist
        flare_set.loc[row, 'temporal_sep_ef'] = best_sep_min * 60
        flare_set.loc[row, 'ef_association_strength'] = best_strength
        if best_strength >= min_ef_association_strength:
            duration = pd.to_timedelta(ef_search['ef_endtime'].values[best_position]
                                       - ef_search['ef_starttime'].values[best_position])
            flare_set.loc[row, 'associated_ef'] = ef_search['ef_SOL_standard'].values[best_position]
            flare_set.loc[row, 'duration_ef'] = duration.total_seconds()
            for keyword in ef_keywords:
                flare_set.loc[row, keyword] = ef_search[keyword].values[best_position]
        else:
            # best_position is a position within ef_search, so read from ef_search (not ef_set)
            flare_set.loc[row, 'maxmagfieldstrength'] = ef_search['maxmagfieldstrength'].values[best_position]

    # flag (0/1) telling whether the flare was associated with an EF event
    flare_set.loc[:, 'is_ef'] = [1 if pd.notnull(name) else 0 for name in flare_set['associated_ef']]

    # write dataframe to a csv file depending on initial parameters
    if output2file:
        if out_file is None:
            # generic name based on the search parameters
            out_file = ('flare_search_starting_w_goes_'+str(begin_time)[0:10]+'_'+ str(end_time)[0:10]+'_'+str(temporal_sep_hr)+'_'+
                        str(max_dist_frm_center)+'_'+str(min_max_b_strength)+'.csv')
        flare_set.to_csv(out_file, index=False, date_format='%Y-%m-%dT%X')

    return flare_set

In [90]:
def pull_sample_flare_2(inputFile, begin_time, end_time, max_dist_frm_center):
    """Load flares from ``inputFile`` and keep those inside the time and distance limits.

    ``adjusted_starttime`` is (re)built from ``event_starttime``. A flare is
    kept when that time lies in ``[begin_time, end_time]`` and when
    ``dist_frm_center`` divided by a quarter of the circumference
    (``r * pi / 2`` with ``r = 966``) is at most ``max_dist_frm_center`` percent.

    The number of flares kept is printed, and the filtered DataFrame returned.

    NOTE(review): ``end_time`` used to be accepted but never applied; the upper
    bound is now enforced. Confirm no caller relied on it being ignored.
    """
    r = 966
    flare_set = pd.read_csv(inputFile, delimiter=',', header=0)
    # list comprehension instead of map() so the column is a real list on Python 3 too
    flare_set['adjusted_starttime'] = [parse_time(t) for t in flare_set['event_starttime']]
    starts = flare_set['adjusted_starttime']
    flare_set = flare_set.loc[(starts >= begin_time) & (starts <= end_time)]
    # arcsin(1) written explicitly rather than relying on r/r being integer division
    s = r * np.arcsin(1.0)
    flare_set = flare_set.loc[(flare_set['dist_frm_center'] / s) <= (0.01 * max_dist_frm_center)]
    print(flare_set.shape[0])
    return flare_set

In [91]:
def pull_sample_ef(inputFile, begin_time, end_time, delta_t, min_max_b_strength):
    """Load EF events from ``inputFile``, filter them, and prefix their columns with ``ef_``.

    An event is kept when its ``maxmagfieldstrength`` is at least
    ``min_max_b_strength``, it starts no earlier than ``begin_time - delta_t``
    and it ends no later than ``end_time - delta_t``.
    """
    ef_column_names = {
        'event_starttime': 'ef_starttime', 'event_endtime': 'ef_endtime',
        'hpc_bbox': 'ef_hpc_bbox', 'hpc_coord': 'ef_hpc_coord',
        'hpc_radius': 'ef_hpc_radius', 'hpc_x': 'ef_hpc_x', 'hpc_y': 'ef_hpc_y',
        'SOL_standard': 'ef_SOL_standard',
    }
    events = pd.read_csv(inputFile, delimiter=',', header=0)
    # parse both time columns into datetime objects
    for time_col in ('event_starttime', 'event_endtime'):
        events[time_col] = [parse_time(stamp) for stamp in events[time_col]]
    strong_enough = events['maxmagfieldstrength'] >= min_max_b_strength
    events = events.loc[strong_enough]
    earliest_start = begin_time - delta_t
    events = events.loc[events['event_starttime'] >= earliest_start]
    latest_end = end_time - delta_t
    events = events.loc[events['event_endtime'] <= latest_end]
    return events.rename(columns=ef_column_names)

In [96]:
def pull_sample_ef1(inputFile, begin_time, end_time, delta_t, min_max_b_strength):
    """Read the EF event CSV, apply the strength and time cuts, and rename columns to ``ef_*``.

    Rows survive when ``maxmagfieldstrength >= min_max_b_strength``,
    ``event_starttime >= begin_time - delta_t`` and
    ``event_endtime <= end_time - delta_t``.
    """
    ef_table = pd.read_csv(inputFile, delimiter=',', header=0)
    ef_table['event_starttime'] = [parse_time(value) for value in ef_table['event_starttime']]
    ef_table['event_endtime'] = [parse_time(value) for value in ef_table['event_endtime']]

    window_open = begin_time - delta_t
    window_close = end_time - delta_t
    ef_table = ef_table.loc[ef_table['maxmagfieldstrength'] >= min_max_b_strength]
    ef_table = ef_table.loc[ef_table['event_starttime'] >= window_open]
    ef_table = ef_table.loc[ef_table['event_endtime'] <= window_close]
    # optional extra cut, currently disabled:
    #     ef_table = ef_table.loc[ef_table['event_probability']==1]

    new_names = {}
    new_names['event_starttime'] = 'ef_starttime'
    new_names['event_endtime'] = 'ef_endtime'
    new_names['hpc_bbox'] = 'ef_hpc_bbox'
    new_names['hpc_coord'] = 'ef_hpc_coord'
    new_names['hpc_radius'] = 'ef_hpc_radius'
    new_names['hpc_x'] = 'ef_hpc_x'
    new_names['hpc_y'] = 'ef_hpc_y'
    new_names['SOL_standard'] = 'ef_SOL_standard'
    ef_table = ef_table.rename(columns=new_names)
    return ef_table

In [93]:
def one_associated_ef(flare_index, ef_index, fl_set, ef_set, s, ef_keywords, idx, min_ef_association_strength):
    """Record one candidate EF event against one flare, modifying ``fl_set`` in place.

    The association strength is
    ``maxmagfieldstrength ** 2 / (s * minutes between EF end and flare start)``.
    Strength, separation time (seconds) and distance are always stored. When
    the strength reaches ``min_ef_association_strength`` the EF identifier, its
    duration (seconds), the EF keyword values and the two association counters
    are stored too; otherwise only the EF ``maxmagfieldstrength`` is copied.

    Parameters
    ----------
    flare_index : int
        Position of the flare row in ``fl_set``.
    ef_index : int
        Position of the EF row in ``ef_set``.
    fl_set, ef_set : pandas.DataFrame
        Flare table (written to) and EF table (read from).
    s : number
        Flare-to-EF distance; 0 is replaced by 1 to avoid dividing by zero.
    ef_keywords : iterable of str
        Column names copied from the EF row to the flare row.
    idx : sequence
        Row labels of ``fl_set``; ``idx[flare_index]`` is the flare's label.
    min_ef_association_strength : number
        Acceptance threshold for the association.

    Returns
    -------
    None
    """
    # if the flare is contained in the EF polygon, use a separation of 1 so as not to divide by 0
    if s == 0:
        s = 1
    # write through .loc on the row label; chained ``df[col].values[i] = x`` can hit a copy
    row = idx[flare_index]

    temporal_sep = pd.to_timedelta(fl_set['adjusted_starttime'].values[flare_index]
                                   - ef_set['ef_endtime'].values[ef_index])
    temporal_sep = temporal_sep.total_seconds() / 60  # minutes
    field_strength = ef_set['maxmagfieldstrength'].values[ef_index]
    ef_association_strength = (field_strength ** 2) / (s * temporal_sep)

    # information stored whether or not the association is accepted
    fl_set.loc[row, 'ef_association_strength'] = ef_association_strength
    fl_set.loc[row, 'temporal_sep_ef'] = temporal_sep * 60
    fl_set.loc[row, 'dist_ef'] = s

    if ef_association_strength >= min_ef_association_strength:
        duration = pd.to_timedelta(ef_set['ef_endtime'].values[ef_index]
                                   - ef_set['ef_starttime'].values[ef_index])
        fl_set.loc[row, 'associated_ef'] = ef_set['ef_SOL_standard'].values[ef_index]
        # previously computed but never stored, unlike the multi-candidate path of the caller
        fl_set.loc[row, 'duration_ef'] = duration.total_seconds()
        for keyword in ef_keywords:
            fl_set.loc[row, keyword] = ef_set[keyword].values[ef_index]
        # exactly one associated EF event, and it is over the threshold
        fl_set.loc[row, 'num_associated_ef'] = 1
        fl_set.loc[row, 'num_assoc_ef_over_assoc_strength'] = 1
    else:
        fl_set.loc[row, 'maxmagfieldstrength'] = ef_set['maxmagfieldstrength'].values[ef_index]
    

In [94]:
def unicode2polygon(bbox_array):
    """Parse WKT text into shapely geometry.

    A single string is parsed into one geometry; any other input is treated
    as an iterable of WKT strings and a list of geometries is returned.
    """
    if not isinstance(bbox_array, basestring):
        return [wkt.loads(text) for text in bbox_array]
    return wkt.loads(bbox_array)

In [97]:
# Associate EF events with the GOES flares between 2011-03-01 and 2016-06-01
# and write the result to full_dataset8a.csv.
associate_ef_goes(
    inputFile_fl='goes_aia8_AR_sigmoid_er.csv',
    inputFile_ef='raw_ef1.csv',
    begin_year=2011, begin_month=3, begin_day=1,
    end_year=2016, end_month=6, end_day=1,
    max_dist_frm_center=60,
    min_max_b_strength=50,
    min_ef_association_strength=100,
    temporal_sep_hr=12,
    spatial_sep=100,
    output2file=True,
    out_file='full_dataset8a.csv',
)

4228
25 / 4228 events
50 / 4228 events
75 / 4228 events
100 / 4228 events
125 / 4228 events
150 / 4228 events
175 / 4228 events
200 / 4228 events
225 / 4228 events
250 / 4228 events
275 / 4228 events
300 / 4228 events
325 / 4228 events
350 / 4228 events
375 / 4228 events
400 / 4228 events
425 / 4228 events
450 / 4228 events
475 / 4228 events
500 / 4228 events
525 / 4228 events
550 / 4228 events
575 / 4228 events
600 / 4228 events
625 / 4228 events
650 / 4228 events
675 / 4228 events
700 / 4228 events
725 / 4228 events
750 / 4228 events
775 / 4228 events
800 / 4228 events
825 / 4228 events
850 / 4228 events
875 / 4228 events
900 / 4228 events
925 / 4228 events
950 / 4228 events
975 / 4228 events
1000 / 4228 events
1025 / 4228 events
1050 / 4228 events
1075 / 4228 events
1100 / 4228 events
1125 / 4228 events
1150 / 4228 events
1175 / 4228 events
1200 / 4228 events
1225 / 4228 events
1250 / 4228 events
1275 / 4228 events
1300 / 4228 events
1325 / 4228 events
1350 / 4228 events
1375 / 4228

Unnamed: 0,SOL_standard,event_starttime,event_endtime,event_peaktime,fl_goescls,hpc_bbox,hpc_radius,hpc_x,hpc_y,ar_noaanum,...,ef_hpc_coord,ef_hpc_radius,ef_hpc_x,ef_hpc_y,event_probability,area_atdiskcenteruncert,area_raw,area_uncert,area_unit,is_ef
0,SOL2011-03-01T14:35:00L160C064,2011-03-01T14:35:00,2011-03-01T14:40:00,2011-03-01T14:38:00,C1.1,"POLYGON((16.96746 138.9924,16.96746 138.9924,1...",676.817633,-436.41540,517.32360,11164.0,...,,,,,,,,,,0
1,SOL2011-03-01T17:42:00L167C066,2011-03-01T17:42:00,2011-03-01T17:54:00,2011-03-01T17:47:00,C1.3,"POLYGON((16.96692 139.0128,16.96692 139.0128,1...",589.308377,-318.03480,496.12320,11164.0,...,,,,,,,,,,0
2,SOL2011-03-01T18:06:00L154C062,2011-03-01T18:06:00,2011-03-01T18:26:00,2011-03-01T18:19:00,C1.0,"POLYGON((16.96686 139.0158,16.96686 139.0158,1...",723.181677,-479.34480,541.49820,11164.0,...,,,,,,,,,,0
3,SOL2011-03-01T18:51:00L165C066,2011-03-01T18:51:00,2011-03-01T18:58:00,2011-03-01T18:55:00,B9.8,"POLYGON((16.96674 139.0206,16.96674 139.0206,1...",604.061049,-346.73160,494.63820,11164.0,...,,,,,,,,,,0
4,SOL2011-03-02T10:02:00L164C066,2011-03-02T10:02:00,2011-03-02T10:10:00,2011-03-02T10:06:00,B8.8,"POLYGON((16.96404 139.1112,16.96404 139.1112,1...",550.064395,-229.68240,499.81680,11164.0,...,,,,,,,,,,0
5,SOL2011-03-02T13:11:00L163C066,2011-03-02T13:11:00,2011-03-02T13:21:00,2011-03-02T13:18:00,C1.4,"POLYGON((16.96344 139.128,16.96344 139.128,16....",544.430717,-214.68420,500.31540,11164.0,...,,,,,,,,,,0
7,SOL2011-03-03T14:12:00L163C068,2011-03-03T14:12:00,2011-03-03T14:20:00,2011-03-03T14:16:00,C5.4,"POLYGON((16.959 139.2396,16.959 139.2396,16.95...",474.685097,-15.71784,474.42480,11164.0,...,,,,,,,,,,0
8,SOL2011-03-03T19:24:00L162C068,2011-03-03T19:24:00,2011-03-03T19:44:00,2011-03-03T19:36:00,C1.2,"POLYGON((16.9581 139.2576,16.9581 139.2576,16....",474.681471,15.71700,474.42120,11164.0,...,,,,,,,,,,0
9,SOL2011-03-03T23:24:00L165C069,2011-03-03T23:24:00,2011-03-03T23:46:00,2011-03-03T23:34:00,C2.0,"POLYGON((16.95738 139.2702,16.95738 139.2702,1...",468.638565,94.78140,458.95380,11164.0,...,,,,,,,,,,0
10,SOL2011-03-03T23:53:00L163C068,2011-03-03T23:53:00,2011-03-04T00:04:00,2011-03-03T23:58:00,C2.1,"POLYGON((16.95726 139.272,16.95726 139.272,16....",480.448977,78.48420,473.99520,11164.0,...,,,,,,,,,,0


Find gaps in the periods when the event-finding algorithms were run

In [98]:
def find_gaps(fname):
    """Locate long gaps between consecutive ``event_starttime`` values in a CSV file.

    A gap of 48 hours or more between one event start and the next is a
    "big" gap. The mean and standard deviation (in hours) of all the other
    gaps are printed, and each big gap is shrunk by three standard deviations
    at both ends before being returned.

    Rows are compared in file order, so the file is presumably sorted by
    ``event_starttime`` -- verify against the data.

    Parameters
    ----------
    fname : str
        CSV file with an ``event_starttime`` column of ISO-8601 timestamps.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``datetime64`` arrays with the adjusted start and end of each big gap
        (both empty when there is none).
    """
    events = pd.read_csv(fname, delimiter=',', header=0)
    starts = pd.to_datetime(events['event_starttime']).values

    gapmax = np.timedelta64(48, 'h')
    if len(starts) > 1:
        print(starts[0])
        print('------------------------')

    # gap between every event and its successor
    deltas = np.diff(starts)
    is_big = deltas >= gapmax
    biggapstart = starts[:-1][is_big]
    biggapend = starts[1:][is_big]
    for gap_start, gap_end, gap in zip(biggapstart, biggapend, deltas[is_big]):
        print(gap_start)
        print(gap_end)
        print(int(gap / np.timedelta64(1, 'D')))  # whole days
        print('------------------------')

    # statistics of the ordinary (small) gaps, in hours
    small_hours = deltas[~is_big] / np.timedelta64(1, 'h')
    if small_hours.size:
        mean = np.mean(small_hours)
        sigma = np.std(small_hours)
    else:
        # no ordinary gaps to measure: report NaN and apply no margin
        mean = float('nan')
        sigma = 0.0
    print(mean)
    print(sigma)

    # Build the 3-sigma margin in seconds: multiplying a float by
    # np.timedelta64(1, 'h') would truncate it to whole hours.
    margin = np.timedelta64(int(round(3 * sigma * 3600)), 's')
    return biggapstart + margin, biggapend - margin

In [79]:
def dateparseT(x):
    """Parse a 'YYYY-MM-DDTHH:MM:SS' string into a ``datetime.datetime``."""
    return datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')


def remove_flares_during_EF_data_gaps(fname, ef_fname='raw_ef1.csv'):
    """Drop flares whose start time falls inside a gap of the EF event record.

    Gaps come from ``find_gaps(ef_fname)``. Every flare gets a ``no_EF`` flag
    (1 when its ``adjusted_starttime`` lies inside any gap, bounds included);
    flagged rows are removed and the remainder is written next to the input
    as ``<fname without extension>_EF_friendly.csv``. The row counts before
    and after the cut are printed.

    Parameters
    ----------
    fname : str
        Flare CSV file with an ``adjusted_starttime`` column.
    ef_fname : str
        EF event CSV file searched for gaps; defaults to the file that was
        previously hard-coded.
    """
    f = pd.read_csv(fname, header=0, delimiter=',')
    f['adjusted_starttime'] = pd.to_datetime(f['adjusted_starttime'])
    gstart, gend = find_gaps(ef_fname)

    # one vectorised comparison per gap instead of a Python loop over every flare
    flare_starts = f['adjusted_starttime'].values
    in_gap = np.zeros(f.shape[0], dtype=bool)
    for gap_start, gap_end in zip(gstart, gend):
        in_gap |= (flare_starts >= gap_start) & (flare_starts <= gap_end)
    f['no_EF'] = in_gap.astype(int)

    print(f.shape[0])
    f = f.loc[f['no_EF'] == 0]
    print(f.shape[0])
    outname = os.path.splitext(fname)[0] + '_EF_friendly.csv'
    f.to_csv(outname, index=False, date_format='%Y-%m-%dT%X')
    
    
    

In [109]:
# Remove flares that start inside gaps of the EF record; the function writes
# its result to 'full_dataset8_EF_friendly.csv'.
# NOTE(review): associate_ef_goes above wrote 'full_dataset8a.csv', not
# 'full_dataset8.csv' -- confirm this is the intended input file.
remove_flares_during_EF_data_gaps('full_dataset8.csv')

2011-03-01T01:12:45.000000000
------------------------
2011-03-12T15:36:45.000000000
2011-03-16T00:48:45.000000000
3
------------------------
2011-04-11T14:18:00.000000000
2011-08-19T00:36:00.000000000
129
------------------------
2011-10-12T16:49:30.000000000
2011-10-14T23:15:45.000000000
2
------------------------
2013-10-10T16:05:15.000000000
2013-10-12T16:39:00.000000000
2
------------------------
2015-01-26T18:48:45.000000000
2015-01-30T22:18:00.000000000
4
------------------------
2015-03-06T15:53:15.000000000
2015-03-17T21:18:00.000000000
11
------------------------
1.04771061932
1.32905521914
68
bad ef
69
bad ef
70
bad ef
71
bad ef
72
bad ef
73
bad ef
74
bad ef
75
bad ef
76
bad ef
77
bad ef
78
bad ef
79
bad ef
80
bad ef
81
bad ef
133
bad ef
134
bad ef
135
bad ef
136
bad ef
137
bad ef
138
bad ef
139
bad ef
140
bad ef
141
bad ef
142
bad ef
143
bad ef
144
bad ef
145
bad ef
146
bad ef
147
bad ef
148
bad ef
149
bad ef
150
bad ef
151
bad ef
152
bad ef
153
bad ef
154
bad ef
155
bad ef

In [1]:
#determine goes class from revised peak flux
def get_gclass(fname):
    """Add an ``adjusted_goesclass`` column derived from ``adjusted_peak`` and rewrite the file.

    Class letters are assigned by decade of ``adjusted_peak``:

    * ``A``: below 10**4
    * ``B``: 10**4 up to (not including) 10**5
    * ``C``: 10**5 up to (not including) 10**6
    * ``M``: 10**6 up to (not including) 10**7
    * ``X``: 10**7 and above

    Rows matching none of these (e.g. a missing peak) keep the placeholder
    ``'K'``. The CSV file ``fname`` is overwritten in place.
    """
    fl = pd.read_csv(fname, delimiter=',', header=0)
    peak = fl['adjusted_peak']
    conditions = [
        peak < 10**4,
        (peak >= 10**4) & (peak < 10**5),
        (peak >= 10**5) & (peak < 10**6),
        (peak >= 10**6) & (peak < 10**7),
        peak >= 10**7,
    ]
    # Assign the whole column at once; writing element-wise through
    # ``fl[col].values[j]`` can silently modify a copy instead of the frame.
    fl['adjusted_goesclass'] = np.select(conditions, ['A', 'B', 'C', 'M', 'X'], default='K')
    fl.to_csv(fname, index=False)

In [12]:
#determine flare rise time from revised start time and peak time
def get_revised_duration(fname):
    """Store the flare rise time, in seconds, in ``duration_sec`` and rewrite the file.

    ``duration_sec`` is ``adjusted_peaktime - adjusted_starttime``. The CSV
    file ``fname`` is overwritten in place, with datetime columns written as
    ``YYYY-MM-DDTHH:MM:SS``.
    """
    f = pd.read_csv(fname, delimiter=',', header=0)
    # parse here rather than via read_csv(date_parser=...), which newer pandas no longer accepts
    for column in ('adjusted_starttime', 'adjusted_peaktime'):
        f[column] = pd.to_datetime(f[column])
    rise = f['adjusted_peaktime'] - f['adjusted_starttime']
    # read the seconds off directly instead of scaling the timedelta by 1e-9 and dividing by 1 ns
    f['duration_sec'] = rise.dt.total_seconds()
    f.to_csv(fname, index=False, date_format='%Y-%m-%dT%X')

In [14]:
# Recompute duration_sec (peak time minus start time) in place for this file.
get_revised_duration('full_dataset7a.csv')

In [107]:
# Add the adjusted_goesclass column (derived from adjusted_peak) to this file in place.
get_gclass('full_dataset8a.csv')

In [110]:
# Flares with an associated EF event that lie outside the EF data gaps -> EF_8.csv
fl = pd.read_csv('full_dataset8_EF_friendly.csv', delimiter=',', header=0)
has_ef = fl['is_ef'] == 1
outside_gap = fl['no_EF'] == 0
fl = fl[has_ef & outside_gap]
fl.to_csv('EF_8.csv', index=False)

In [111]:
# Flares without an associated EF event that lie outside the EF data gaps -> noEF_8.csv
fl = pd.read_csv('full_dataset8a_EF_friendly.csv', delimiter=',', header=0)
lacks_ef = fl['is_ef'] == 0
outside_gap = fl['no_EF'] == 0
fl = fl[lacks_ef & outside_gap]
fl.to_csv('noEF_8.csv', index=False)

In [None]:
# Flares with no associated EF event at all -> noEF75_4.csv
fl75 = pd.read_csv('full_dataset4a_EF_friendly.csv', delimiter = ',', header = 0)
fl75 = fl75.loc[fl75['num_associated_ef'] == 0]

# where the adjusted peak is 0, fall back to the catalogued GOES peak flux
peaks = fl75['adjusted_peak'].values
catalogued = fl75['fl_peakflux_goes'].values
missing = peaks == 0
peaks[missing] = catalogued[missing]

fl75.to_csv('noEF75_4.csv', index=False)

In [117]:
# Flares flagged as associated with an EF event -> EF100_4.csv
fl100 = pd.read_csv('full_dataset4_EF_friendly.csv', delimiter = ',', header = 0)
fl100 = fl100.loc[fl100['is_ef'] == 1]

# where the adjusted peak is 0, fall back to the catalogued GOES peak flux
peaks = fl100['adjusted_peak'].values
catalogued = fl100['fl_peakflux_goes'].values
missing = peaks == 0
peaks[missing] = catalogued[missing]

fl100.to_csv('EF100_4.csv', index=False)


In [29]:
def adjust_times2(inputF, outputF):
    """Re-derive flare peak and start times from per-event GOES light curves.

    For every row of ``inputF`` the light curve
    ``goes_event_lcs2/<SOL_standard>.csv`` is read and cut at one minute after
    the row's ``adjusted_peaktime``. The largest ``xrsb`` sample (scaled by
    10**11) gives ``adjusted_peak2`` / ``adjusted_peaktime2``. The start time
    ``adjusted_starttime2`` is the light-curve sample at the index where the
    low-pass-filtered second derivative of the early rise is largest.

    ``associated_lc`` is set to 0 when the cut light curve is empty, when the
    peak is 0, or when the rise has 7 samples or fewer; in the first and last
    case the corresponding output columns keep their placeholder values.
    The table is written to ``outputF``; nothing is returned.

    NOTE(review): ``associated_lc`` is written but never created here, so the
    input file presumably already has that column -- confirm.
    """
    # parsers for 'T'-separated (flare table) and space-separated (light curve) timestamps
    dateparseT = lambda x: pd.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

    full = pd.read_csv(inputF, delimiter=',', header=0, 
                         parse_dates=['adjusted_starttime', 'adjusted_peaktime'],date_parser=dateparseT)
#     full = full.ix[full['associated_lc']==1]
#     full = full.ix[full['duration_sec']>=0]
    length = full.shape[0]
    # output columns start out as placeholders: the literal format string for
    # the two times, 0 for the peak
    full.loc[:,'adjusted_starttime2'] = ['%Y-%m-%dT%H:%M:%S' for x in range(length)]
    full.loc[:,'adjusted_peaktime2'] = ['%Y-%m-%dT%H:%M:%S' for x in range(length)]
    # NOTE(review): this column is created from integer zeros; float peaks
    # written through .values below may be truncated to integers -- confirm.
    full.loc[:,'adjusted_peak2'] = [0 for x in range(length)]
    # j is the positional index of the current row
    j=0
    for sol in full['SOL_standard']:
        fname = 'goes_event_lcs2/'+str(sol)+'.csv'
        lc = pd.read_csv(fname, delimiter = ',', header = 0, parse_dates=['date'],
                         date_parser=dateparse)
        # keep only samples up to one minute after the previously adjusted peak time
        lc = lc.ix[lc['date']<=(full['adjusted_peaktime'].values[j]+np.timedelta64(1, 'm'))]
        lc['xrsb'] = lc['xrsb']*10**11
        if lc.shape[0]==0: full['associated_lc'].values[j] = 0
        else:
            # new peak: the largest remaining xrsb sample
            peak = np.argmax(np.array(lc['xrsb']))
            adjusted_peak = lc['xrsb'].values[peak]
            adjusted_peaktime = lc['date'].values[peak]
            full['adjusted_peaktime2'].values[j] = adjusted_peaktime
            full['adjusted_peak2'].values[j] = adjusted_peak
            if adjusted_peak==0: full['associated_lc'].values[j] = 0
            # drop everything after the new peak
            lc = lc.ix[lc['date']<=adjusted_peaktime]
            # elapsed time since the first sample; despite the name the values
            # are in seconds (timedelta scaled by 1e-9, then divided by 1 ns)
            minutesX = ((lc['date'] - lc['date'].values[0])*10**(-9))
            minutesX = map(lambda x: x/np.timedelta64(1, 'ns'), minutesX)
            # minutesXog and duration are not used again in this function
            minutesXog = minutesX
            duration = minutesX[-1]
            maxi = np.argmax(np.array(lc['xrsb'])) 
            # rising part of the light curve, first sample through the peak
            softx = lc['xrsb'].values[0:maxi+1]
            if len(softx)<=7:full['associated_lc'].values[j] = 0
            else:
                minutesX = minutesX[0:maxi+1]
                # find the first sample whose rise above the first sample
                # exceeds ~31% of the total first-sample-to-peak rise
                searchzone=True
                endsearchidx = maxi
                for k, x in enumerate(softx):
                    if searchzone:
                        if (x-softx[0])/(softx[-1]-softx[0]) > 0.31053736899:
                            searchzone=False
                            endsearchidx = k
                # restrict the analysis to that early rise plus a few extra samples
                softx = softx[:endsearchidx+6]           
                minutesX = minutesX[0:endsearchidx+6]
                # first derivative: finite difference over a 5-sample span
                i=0
                derivatives = []
                for time in minutesX[:-6]:
                    delta_t = minutesX[i+5]-time
                    delta_flux = softx[i+5]-softx[i]
                    derivative = delta_flux/delta_t
                    derivatives.append(derivative)
                    i+=1
                # second derivative: difference of consecutive first derivatives,
                # divided by the same 5-sample time span
                der2s = []
                i=0
                for time in minutesX[:-7]:
                    delta_t = minutesX[i+5]-time
                    delta_der = derivatives[i+1]-derivatives[i]
                    der2 = delta_der/delta_t
                    der2s.append(der2)
                    i+=1 
                # low-pass filter the second derivative (FFT coefficients with
                # index above 5 are zeroed by eliminate_noise_graph)
                foTran4, ifoTran4 = eliminate_noise_graph(minutesX[:-7], der2s, 5)
                # NOTE(review): ifoTran4 comes from np.fft.ifft and is complex;
                # confirm argmax on it selects the intended sample.
                maxidx4 = np.argmax(np.array(ifoTran4)) 
                full['adjusted_starttime2'].values[j] = lc['date'].values[maxidx4]

        j+=1
#     full['adjusted_starttime'] = map(parse_time, full['adjusted_starttime'])
#     full['adjusted_peaktime'] = map(parse_time, full['adjusted_peaktime'])
    full.to_csv(outputF, index=False, date_format = '%Y-%m-%dT%X')

In [30]:
# Re-derive peak/start times for the EF-associated flares; result goes to EF_7_times.csv.
adjust_times2('EF_7.csv', 'EF_7_times.csv')

In [31]:
# Re-derive peak/start times for the flares without EF; result goes to noEF_7_times.csv.
adjust_times2('noEF_7.csv', 'noEF_7_times.csv')

In [51]:
def correct_for_bad_lcs2(inputf, outputf):
    """Replace placeholder light-curve results with the catalogued event values.

    Rows whose ``adjusted_starttime2`` or ``adjusted_peaktime2`` still holds
    the placeholder ``'%Y-%m-%dT%H:%M:%S'`` get ``event_starttime`` /
    ``event_peaktime`` with ``'.000000000'`` appended instead. Rows whose
    ``adjusted_peak2`` is 0 get ``fl_peakflux_goes``. Every row changed this
    way has ``associated_lc`` set to 0. The table is written to ``outputf``.

    ``adjusted_peak2`` takes the common numeric type of itself and
    ``fl_peakflux_goes``, so a replaced peak is not truncated to an integer.
    """
    placeholder = '%Y-%m-%dT%H:%M:%S'
    f = pd.read_csv(inputf, header=0, delimiter=',')

    # boolean masks replace the index-counter loops that wrote through ``.values[j]``
    for adjusted_col, catalogued_col in (('adjusted_starttime2', 'event_starttime'),
                                         ('adjusted_peaktime2', 'event_peaktime')):
        f[adjusted_col] = f[adjusted_col].astype(str)
        unset = f[adjusted_col] == placeholder
        f.loc[unset, 'associated_lc'] = 0
        f.loc[unset, adjusted_col] = f.loc[unset, catalogued_col] + '.000000000'

    no_peak = f['adjusted_peak2'] == 0.0
    f.loc[no_peak, 'associated_lc'] = 0
    f['adjusted_peak2'] = np.where(no_peak, f['fl_peakflux_goes'], f['adjusted_peak2'])

    f.to_csv(outputf, index=False)


In [52]:
# Fill placeholder times/peaks in EF_7_times.csv from the catalogued values; result goes to EF_7_times2.csv.
correct_for_bad_lcs2('EF_7_times.csv','EF_7_times2.csv')

In [61]:
def get_revised_duration(fname):
    """Store the revised flare rise time, in seconds, in ``duration_sec`` and rewrite the file.

    ``duration_sec`` is ``adjusted_peaktime2 - adjusted_starttime2``. The CSV
    file ``fname`` is overwritten in place, with datetime columns written as
    ``YYYY-MM-DDTHH:MM:SS``.
    """
    f = pd.read_csv(fname, delimiter=',', header=0)
    # Parse without a fixed format: these columns can carry a '.000000000'
    # fractional-second suffix, which a strict '%H:%M:%S' format rejects.
    for column in ('adjusted_starttime2', 'adjusted_peaktime2'):
        f[column] = pd.to_datetime(f[column])
    rise = f['adjusted_peaktime2'] - f['adjusted_starttime2']
    # read the seconds off directly instead of scaling the timedelta by 1e-9 and dividing by 1 ns
    f['duration_sec'] = rise.dt.total_seconds()
    f.to_csv(fname, index=False, date_format='%Y-%m-%dT%X')
    