In [1]:
import pandas

In [2]:
# Read every yearly Iowa precipitation file (1980 through 2010) and stack
# them into a single data frame.
# The first four columns of each file are dropped; whatever columns follow
# them are kept.
years = range(1980, 2011)
yearly_frames = []
for year in years:
    # create a path based on year
    path = './raw_data/iowa_%d.csv' % year
    # read one file from raw data
    frame = pandas.read_csv(path)
    # drop the first four columns; drop() returns a new frame, so the frame
    # that was just read is not mutated in place
    frame = frame.drop(frame.columns[[0, 1, 2, 3]], axis=1)
    yearly_frames.append(frame)
# Concatenate once at the end. Growing the frame with DataFrame.append inside
# the loop re-copies all accumulated rows on every iteration, and
# DataFrame.append was removed in pandas 2.0. Each file's own row index is
# kept, as before.
iowa = pandas.concat(yearly_frames)

In [3]:
# check the (rows, columns) shape of the combined data frame that we got
iowa.shape

(11323, 1)

In [4]:
# Create another frame holding 15-day window sums.
# For every start row from 15 up to (but not including) iowa.shape[0] - 15,
# the 15 consecutive rows beginning at that start row are summed column-wise,
# so the result has iowa.shape[0] - 30 rows.
window_sums = []
for row_num in range(15, iowa.shape[0] - 15):
    # .iloc makes it explicit that the slice is positional
    single_sum = iowa.iloc[row_num:row_num + 15].sum(axis=0)
    window_sums.append(single_sum)
# Build the frame once from the collected per-window Series. Appending to a
# DataFrame row by row is quadratic and DataFrame.append was removed in
# pandas 2.0. The index is renumbered 0..n-1, matching ignore_index=True.
sums = pandas.DataFrame(window_sums, columns=iowa.columns).reset_index(drop=True)

In [5]:
# check the shape of the sums: one row per window start row is expected,
# i.e. iowa.shape[0] - 30 rows
sums.shape

(11293, 1)

In [6]:
# look at the first few window sums to check that the values make sense
sums.head()

Unnamed: 0,Unnamed: 4
0,0.959486
1,0.447076
2,0.232786
3,0.232786
4,0.188976


In [7]:
# read a dummy csv file filled with zeros; its rows are used to pad the
# bottom of the sums data frame
some_zeros = pandas.read_csv('./raw_data/zeros.csv')

In [8]:
# check how many zero rows (and columns) are available for padding
some_zeros.shape

(21, 1)

In [12]:
# Add the zeros data frame to the bottom of sums to bring it to the
# dimensionality of the features data frame (not defined in this part of
# the notebook). The index of the result is renumbered 0..n-1.
# pandas.concat is used because DataFrame.append was removed in pandas 2.0.
res = pandas.concat([sums, some_zeros], ignore_index=True)

In [13]:
# the zero-padded frame should be 11314 x 1
res.shape

(11314, 1)

In [35]:
# Order the window sums (already padded with zeros) from largest to smallest;
# the original row labels are kept so each value can be traced back to res.
res_sorted = res.sort_values(by='Unnamed: 4', ascending=False)

In [36]:
# check if sorting worked: the largest sums should now come first
res_sorted.head()

Unnamed: 0,Unnamed: 4
10362,8.202748
10361,8.069108
4914,7.644392
4913,7.146769
4918,7.049467


In [1]:
# To get the top 5% here is what we need to do:
# the size of the data frame is 11314 rows -> 100%,
# so 5% is one twentieth of that: 11314 // 20 = 565.
# Floor division keeps the count an integer on both Python 2 and Python 3;
# a float here would be rejected when it is later used as a positional index.
num_of_1s = 11314 // 20

In [2]:
# display the number of rows that make up the top 5%
num_of_1s

565

In [52]:
# get the threshold value to be used for labeling sums: the row at position
# num_of_1s - 1 of the descending-sorted frame, i.e. the num_of_1s-th largest sum
threshold = res_sorted.iloc[num_of_1s - 1]
# display it (a row of res_sorted, returned as a Series keyed by column name)
threshold

Unnamed: 4    3.581626
Name: 3828, dtype: float64

In [80]:
# Label every row of res: 1 if its sum is strictly larger than the threshold,
# else 0. Because the comparison is strict, the row that defines the
# threshold is itself labelled 0.
# The comparison is done for the whole column at once rather than row by row.
# The previous loop appended a one-row dummy frame (read from one.csv or
# zero.csv, documented as containing just 1 and just 0) for every row, which
# is quadratic and relied on DataFrame.append and Series.bool(), both of
# which have been removed from pandas.
exceeds_threshold = res.iloc[:, 0] > threshold.iloc[0]
# Single 'labels' column of integer 0/1 values, indexed 0..n-1.
labels = pandas.DataFrame({'labels': exceeds_threshold.astype(int)}).reset_index(drop=True)

In [81]:
# check the shape of the labels; one label row per row of res is expected
labels.shape

(11314, 1)

In [87]:
# Finally save the labels to be used later: comma-separated, with the column
# name written as a header row and without the row index.
# header and index are passed as the booleans that to_csv documents. The
# previous header='labels' / index=None only worked through truthiness and
# wrote the same file.
labels.to_csv(r'./sample_data/labels.csv', header=True, index=False, sep=',')