In [101]:
import os
import pandas as pd
import numpy as np
import datetime as datetime
import matplotlib.pyplot as plt
%matplotlib inline

CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya',
'61017Bilma']   

fname = CSal[2]

column_names=["year","month","day","hour","ws"]
dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float}

datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname

# specify the columns you want to group together. Can't include hour at 
# this point as it is not in the right format. 
date_spec = {'date_time': [0,1,2]}

# when you use keep_dat_col it keeps them as objects, not as the dtype you 
# read them in as.
wind = pd.read_csv(datafile, sep=" ", names=column_names, 
parse_dates=date_spec,   keep_date_col=True, index_col=False ) 

# Dealing with hour - going from 600, 1200 etc to 6,12, 18
wind["hour"]=(wind["hour"]/100).astype(int)

# combining year, month, day that were parsed together into date_time with 
# hour, which is now in the correct format.
wind['date_time'] = pd.to_datetime(wind.date_time) + \
wind.hour.astype('timedelta64[h]')

# make datetime the index before making subsections.
wind.index = wind['date_time']  


In [102]:
# I think the index has two identical values. Find them and see how this might have happened. 
print (wind[wind.duplicated()])

                              date_time  year month day  hour       ws
date_time                                                             
1999-03-16 11:00:00 1999-03-16 11:00:00  1999     3  16    11  1.72353
2007-09-26 11:00:00 2007-09-26 11:00:00  2007     9  26    11  2.90644
2007-09-26 11:00:00 2007-09-26 11:00:00  2007     9  26    11  2.90644


In [103]:
# By not specifying a named column it will check the whole row for duplicates
wind.drop_duplicates(['date_time'],inplace=True)

print (wind[wind.duplicated()])

Empty DataFrame
Columns: [date_time, year, month, day, hour, ws]
Index: []


In [104]:
wind.index.value_counts()

1993-11-07 18:00:00    1
2009-12-19 21:00:00    1
2011-10-29 15:00:00    1
1993-09-17 15:00:00    1
1990-04-15 15:00:00    1
1987-04-04 12:00:00    1
1990-10-18 03:00:00    1
1999-09-29 00:00:00    1
2008-07-16 06:00:00    1
2000-02-20 18:00:00    1
2003-08-18 06:00:00    1
2009-03-13 03:00:00    1
2005-10-10 00:00:00    1
2002-05-08 00:00:00    1
1998-12-04 00:00:00    1
1995-07-02 00:00:00    1
1988-10-27 06:00:00    1
1988-08-25 00:00:00    1
1986-05-14 00:00:00    1
1984-01-31 00:00:00    1
2004-03-10 00:00:00    1
2004-11-29 00:00:00    1
1998-11-30 18:00:00    1
2012-08-28 06:00:00    1
1987-10-11 00:00:00    1
1999-03-24 12:00:00    1
1998-03-16 18:00:00    1
2011-04-14 12:00:00    1
1993-05-10 00:00:00    1
2004-04-23 00:00:00    1
                      ..
2008-08-01 18:00:00    1
2005-02-27 18:00:00    1
2001-09-22 00:00:00    1
1999-06-14 18:00:00    1
1996-01-10 18:00:00    1
2010-04-06 21:00:00    1
2008-08-31 18:00:00    1
1993-03-27 21:00:00    1
1996-08-03 12:00:00    1


In [105]:
print(wind[0:20])
print(list(wind))
# drop the date_time columns?

wind.drop('date_time', axis=1, inplace=True)

                              date_time  year month day  hour       ws
date_time                                                             
1984-01-01 00:00:00 1984-01-01 00:00:00  1984     1   1     0  4.74925
1984-01-01 06:00:00 1984-01-01 06:00:00  1984     1   1     6  5.32628
1984-01-01 12:00:00 1984-01-01 12:00:00  1984     1   1    12  6.45924
1984-01-01 18:00:00 1984-01-01 18:00:00  1984     1   1    18  4.52152
1984-01-02 00:00:00 1984-01-02 00:00:00  1984     1   2     0  5.69514
1984-01-02 06:00:00 1984-01-02 06:00:00  1984     1   2     6  5.33187
1984-01-02 12:00:00 1984-01-02 12:00:00  1984     1   2    12  6.61845
1984-01-02 18:00:00 1984-01-02 18:00:00  1984     1   2    18  4.67153
1984-01-03 00:00:00 1984-01-03 00:00:00  1984     1   3     0  5.48648
1984-01-03 06:00:00 1984-01-03 06:00:00  1984     1   3     6  5.20784
1984-01-03 12:00:00 1984-01-03 12:00:00  1984     1   3    12  6.45309
1984-01-03 18:00:00 1984-01-03 18:00:00  1984     1   3    18  4.46606
1984-0

In [107]:
def meanf(x):
    if x.count() > 10:
        return x.mean()
def sdf(x):
    if x.count() > 10:
        return x.std()

# Adds extra rows where value is kept if it meets isin() criteria. Nan if 
# it doesn't.
wind['ws_0']= wind['ws'][wind['hour'].isin([0])]
wind['ws_06']= wind['ws'][wind['hour'].isin([6])]
wind['ws_12']= wind['ws'][wind['hour'].isin([12])]
wind['ws_18']= wind['ws'][wind['hour'].isin([18])]

group = wind.groupby(['year', 'month'])

wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf])
