# Clean Individual File

In [78]:
import re
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd

In [52]:
filename = 'Copy of daily_AOD_conc_MOD04_L2_201701.txt'
# Build the path from `filename` so that the name later parsed for the date
# is guaranteed to be the file that was actually read.
with open('MODIS_data/' + filename) as f:
    lines = f.readlines()
lines

['Day hour min     AOD QC2     AOD QC3  conc (um/cm2) avgbinQC2 avgbinQC3\n',
 ' 01   11  15     0.32717     0.33800       13.89036     3           3\n',
 ' 02   10  20     0.16067     0.15500        6.06734     1           1\n',
 ' 03   11  01     0.21500     0.21500        8.09774     0           0\n',
 ' 05   10  50     0.29400     0.29400       10.82038     0           0\n',
 ' 07   10  36     0.27700     0.27700       10.25618     0           0\n',
 ' 08   11  20     0.19200     0.19200        7.25462     0           0\n',
 ' 09   10  25     0.17500     0.16120        6.61571     0           1\n',
 ' 10   11  06     0.21800     0.22225       10.89078     1           1\n',
 ' 11   10  11     0.29900     0.27600       11.41530     1           1\n',
 ' 12   10  55     0.45000     0.45000       15.53205     0           0\n',
 ' 14   10  41     0.30000     0.30000       11.00745     0           0\n',
 ' 15   11  25     0.28600     0.29750       18.50253     1           3\n',
 ' 16   10

In [53]:
# Drop the header line, then tokenise every remaining row on whitespace.
new_lines = [row.split() for row in lines[1:]]

# Re-join the tokens with commas so the rows form a well-formed CSV document.
csv_rows = [','.join(fields) for fields in new_lines]
clean_string = '\n'.join(csv_rows)

# The header labels in the file contain spaces, so they cannot be split
# reliably; the column names are listed by hand instead.
cols = [
    'day', 'hour', 'min',
    'AOD_QC2', 'AOD_QC3', 'conc',
    'avgbinQC2', 'avgbinQC3',
]

csv_string = StringIO(clean_string)
df = pd.read_csv(csv_string, names=cols)
df.head()

Unnamed: 0,day,hour,min,AOD_QC2,AOD_QC3,conc,avgbinQC2,avgbinQC3
0,1,11,15,0.32717,0.338,13.89036,3,3
1,2,10,20,0.16067,0.155,6.06734,1,1
2,3,11,1,0.215,0.215,8.09774,0,0
3,5,10,50,0.294,0.294,10.82038,0,0
4,7,10,36,0.277,0.277,10.25618,0,0


In [75]:
# The six characters immediately before a four-character extension
# (".txt" / ".csv") are assumed to be the date stamp, formatted YYYYMM.
date_str = filename[-10:-4]
# Require exactly six ASCII digits: int() on its own also accepts strings
# such as '2_01', which would silently yield a wrong year or month.
if re.fullmatch(r'[0-9]{6}', date_str) is None:
    raise ValueError('Incorrectly Positioned date in file name')
year, month = int(date_str[:4]), int(date_str[4:])
# Six digits can still encode an impossible month (00, 13, ...).
if not 1 <= month <= 12:
    raise ValueError('Invalid month in file name: {}'.format(date_str))

In [99]:
# One constant integer column per date component, as long as `df`.
# np.full keeps the values integral throughout instead of multiplying a
# float array of ones and casting the result back to int.
year_col = pd.Series(np.full(df.shape[0], year))
month_col = pd.Series(np.full(df.shape[0], month))
# Build the frame column-wise with its final labels rather than stacking
# the series as rows, transposing, and renaming afterwards.
df_dates = pd.DataFrame({'year': year_col, 'month': month_col})
df_dates

Unnamed: 0,year,month
0,2017,1
1,2017,1
2,2017,1
3,2017,1
4,2017,1
5,2017,1
6,2017,1
7,2017,1
8,2017,1
9,2017,1


In [100]:
# Place the year/month columns to the left of the measurement columns;
# both frames are aligned on their row index.
result_df = pd.concat([df_dates, df], axis='columns')

Unnamed: 0,year,month,day,hour,min,AOD_QC2,AOD_QC3,conc,avgbinQC2,avgbinQC3
0,2017,1,1,11,15,0.32717,0.338,13.89036,3,3
1,2017,1,2,10,20,0.16067,0.155,6.06734,1,1
2,2017,1,3,11,1,0.215,0.215,8.09774,0,0
3,2017,1,5,10,50,0.294,0.294,10.82038,0,0
4,2017,1,7,10,36,0.277,0.277,10.25618,0,0
5,2017,1,8,11,20,0.192,0.192,7.25462,0,0
6,2017,1,9,10,25,0.175,0.1612,6.61571,0,1
7,2017,1,10,11,6,0.218,0.22225,10.89078,1,1
8,2017,1,11,10,11,0.299,0.276,11.4153,1,1
9,2017,1,12,10,55,0.45,0.45,15.53205,0,0


In [102]:
def get_df(path):
    """Load one monthly AOD text file into a DataFrame with year/month columns.

    The file is expected to contain one header line followed by
    whitespace-delimited rows of eight fields each.  The header labels
    contain spaces, so the header line is skipped and the column names are
    supplied explicitly.  Year and month are not stored inside the file;
    they are taken from the file name, whose last six characters before the
    extension must be a ``YYYYMM`` stamp (e.g. ``..._201701.txt``).

    Parameters
    ----------
    path : str or os.PathLike
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``year, month, day, hour, min, AOD_QC2, AOD_QC3, conc,
        avgbinQC2, avgbinQC3``, one row per data line.  A file with no data
        lines gives an empty frame with the same columns.

    Raises
    ------
    ValueError
        If the file name does not end in six digits before the extension,
        or if the encoded month is outside 1-12.
    """
    path = Path(path)

    # Unorganized column names in the file -> need to write manually
    cols = ['day', 'hour', 'min', 'AOD_QC2', 'AOD_QC3', 'conc', 'avgbinQC2', 'avgbinQC3']

    # Let pandas split on runs of whitespace directly instead of rebuilding
    # the rows as a comma-separated string first.  The file is still opened
    # with the built-in open() so the text encoding is chosen as before.
    with open(path) as f:
        try:
            df = pd.read_csv(f, sep=r'\s+', skiprows=1, names=cols)
        except pd.errors.EmptyDataError:
            # Header-only (or completely empty) file: no observations.
            df = pd.DataFrame(columns=cols)

    # Taking the stamp from the stem makes this independent of the
    # extension's length; for ".txt"/".csv" it selects the same characters
    # as slicing the name with [-10:-4].
    date_str = path.stem[-6:]
    # Require exactly six ASCII digits: int() alone also accepts strings
    # such as '2_01', which would silently give a wrong year or month.
    if re.fullmatch(r'[0-9]{6}', date_str) is None:
        raise ValueError('Incorrectly positioned date in file name')
    year, month = int(date_str[:4]), int(date_str[4:])
    if not 1 <= month <= 12:
        raise ValueError(
            'Invalid month {:02d} in file name {!r}'.format(month, path.name)
        )

    # Insert month first, then year, both at position 0, so the final
    # column order is year, month, day, ...
    df.insert(0, 'month', month)
    df.insert(0, 'year', year)
    return df

In [103]:
# Run the helper on the January 2017 file that the cells above processed step by step.
get_df('MODIS_data/Copy of daily_AOD_conc_MOD04_L2_201701.txt')

Unnamed: 0,year,month,day,hour,min,AOD_QC2,AOD_QC3,conc,avgbinQC2,avgbinQC3
0,2017,1,1,11,15,0.32717,0.338,13.89036,3,3
1,2017,1,2,10,20,0.16067,0.155,6.06734,1,1
2,2017,1,3,11,1,0.215,0.215,8.09774,0,0
3,2017,1,5,10,50,0.294,0.294,10.82038,0,0
4,2017,1,7,10,36,0.277,0.277,10.25618,0,0
5,2017,1,8,11,20,0.192,0.192,7.25462,0,0
6,2017,1,9,10,25,0.175,0.1612,6.61571,0,1
7,2017,1,10,11,6,0.218,0.22225,10.89078,1,1
8,2017,1,11,10,11,0.299,0.276,11.4153,1,1
9,2017,1,12,10,55,0.45,0.45,15.53205,0,0
