In [None]:
import pandas as pd
import matplotlib
%matplotlib inline 
import numpy as np

In [None]:
### Data transformation from previous notebooks
nyc = pd.read_csv('../data/central-park-raw.csv', parse_dates=[0])
nyc.columns = [x.strip() for x in nyc.columns]
nyc.columns = [x.replace(' ', '_') for x in nyc.columns]
nyc.PrecipitationIn.replace("T", '0.001')
nyc.PrecipitationIn = pd.to_numeric(nyc.PrecipitationIn.replace("T", '0.001'))
nyc['Events'] = nyc.Events.fillna('')

# Machine Learning

Pandas allows gives us easy integration with the sklearn library. Let's see if we can 
predict humidity (``y``) from the other columns (``X``).

We will train a Random Forest with a sample of our data, then test it with another sample to see how it performs.

In [146]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [147]:
nyc.columns

Index(['EST', 'Max_TemperatureF', 'Mean_TemperatureF', 'Min_TemperatureF',
       'Max_Dew_PointF', 'MeanDew_PointF', 'Min_DewpointF', 'Max_Humidity',
       'Mean_Humidity', 'Min_Humidity', 'Max_Sea_Level_PressureIn',
       'Mean_Sea_Level_PressureIn', 'Min_Sea_Level_PressureIn',
       'Max_VisibilityMiles', 'Mean_VisibilityMiles', 'Min_VisibilityMiles',
       'Max_Wind_SpeedMPH', 'Mean_Wind_SpeedMPH', 'Max_Gust_SpeedMPH',
       'PrecipitationIn', 'CloudCover', 'Events', 'WindDirDegrees'],
      dtype='object')

In [148]:
# Shift Humidity down to predict next day
pd.concat([nyc.Mean_Humidity, nyc.Mean_Humidity.shift(1)], axis=1)

Unnamed: 0,Mean_Humidity,Mean_Humidity.1
0,74.0,
1,71.0,74.0
2,84.0,71.0
3,72.0,84.0
4,71.0,72.0
5,60.0,71.0
6,51.0,60.0
7,56.0,51.0
8,60.0,56.0
9,52.0,60.0


In [149]:
# Regression - Try to predict Mean_Humidity (y) from non humidity columns (X)
# Get training set (X_train)
# Shift Humidity down to predict next day
X = nyc[[x for x in nyc.columns if 'Humid' not in x]]
y = nyc.Mean_Humidity.shift(1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [150]:
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)



ValueError: could not convert string to float: 

In [151]:
# Need to make "Dummy" variables from Events column
nyc_dummy = pd.get_dummies(nyc, columns=['Events'])
nyc_dummy.head()

Unnamed: 0,EST,Max_TemperatureF,Mean_TemperatureF,Min_TemperatureF,Max_Dew_PointF,MeanDew_PointF,Min_DewpointF,Max_Humidity,Mean_Humidity,Min_Humidity,...,WindDirDegrees,Events_,Events_Fog,Events_Fog-Rain,Events_Fog-Rain-Snow,Events_Fog-Snow,Events_Rain,Events_Rain-Snow,Events_Snow,Events_Thunderstorm
0,2006-01-01,42.0,37.0,32.0,32.0,30.0,28.0,85.0,74.0,62.0,...,276.0,1,0,0,0,0,0,0,0,0
1,2006-01-02,48.0,44.0,39.0,38.0,34.0,29.0,92.0,71.0,49.0,...,76.0,0,0,0,0,0,1,0,0,0
2,2006-01-03,40.0,37.0,33.0,38.0,33.0,26.0,92.0,84.0,75.0,...,39.0,0,0,0,0,0,1,0,0,0
3,2006-01-04,38.0,34.0,29.0,36.0,26.0,19.0,85.0,72.0,59.0,...,70.0,1,0,0,0,0,0,0,0,0
4,2006-01-05,50.0,44.0,37.0,38.0,35.0,32.0,92.0,71.0,50.0,...,251.0,0,0,0,0,0,1,0,0,0


In [152]:
# Regression - Try to predict Mean_Humidity (y) from non humidity columns (X)
# Get training set (X_train)
X = nyc_dummy[[x for x in nyc_dummy.columns if 'Humid' not in x]]
y = nyc_dummy.Mean_Humidity.shift(1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [153]:
X.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3277,3278,3279,3280,3281,3282,3283,3284,3285,3286
EST,2006-01-01 00:00:00,2006-01-02 00:00:00,2006-01-03 00:00:00,2006-01-04 00:00:00,2006-01-05 00:00:00,2006-01-06 00:00:00,2006-01-07 00:00:00,2006-01-08 00:00:00,2006-01-09 00:00:00,2006-01-10 00:00:00,...,2014-12-22 00:00:00,2014-12-23 00:00:00,2014-12-24 00:00:00,2014-12-25 00:00:00,2014-12-26 00:00:00,2014-12-27 00:00:00,2014-12-28 00:00:00,2014-12-29 00:00:00,2014-12-30 00:00:00,2014-12-31 00:00:00
Max_TemperatureF,42,48,40,38,50,43,35,46,60,49,...,44,46,58,62,50,55,54,44,34,32
Mean_TemperatureF,37,44,37,34,44,37,30,40,52,45,...,40,45,51,53,45,50,49,39,31,30
Min_TemperatureF,32,39,33,29,37,30,25,34,43,41,...,35,43,44,44,40,44,43,34,28,27
Max_Dew_PointF,32,38,38,36,38,33,19,35,39,31,...,42,44,57,60,29,35,43,25,17,12
MeanDew_PointF,30,34,33,26,35,24,14,25,36,28,...,35,42,47,40,28,31,37,19,13,8
Min_DewpointF,28,29,26,19,32,14,11,19,30,26,...,29,41,43,27,27,29,26,15,8,5
Max_Sea_Level_PressureIn,30.2,30.24,30.05,30.09,29.81,29.82,29.99,30.1,30.25,30.5,...,30.35,30.18,30.08,30.12,30.25,30.27,30.07,30.26,30.4,30.35
Mean_Sea_Level_PressureIn,30.03,30.15,29.93,29.96,29.71,29.72,29.93,30.04,30.01,30.42,...,30.27,30.12,29.79,29.74,30.2,30.19,29.95,30.18,30.36,30.28
Min_Sea_Level_PressureIn,29.83,29.93,29.83,29.79,29.63,29.67,29.84,29.95,29.92,30.28,...,30.16,30.07,29.53,29.48,30.13,30.07,29.88,30.09,30.27,30.18


In [154]:
# Create a model (whoops data needs to be floats)
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)



TypeError: float() argument must be a string or a number, not 'Timestamp'

In [155]:
# Need to remove timestamp
# Regression - Try to predict Mean_Humidity (y) from non humidity columns (X)
# Get training set (X_train)
def valid(col):
    return 'Humid' not in col and 'EST' not in col
X = nyc_dummy[[x for x in nyc_dummy.columns if valid(x)]]
y = nyc_dummy.Mean_Humidity.shift(1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [156]:
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)



ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [157]:
# Need to remove NA
# Regression - Try to predict Mean_Humidity (y) from non humidity columns (X)
# Get training set (X_train)
def valid(col):
    return 'Humid' not in col and 'EST' not in col
nyc_dummy = nyc_dummy.dropna()
X = nyc_dummy[[x for x in nyc_dummy.columns if valid(x)]].iloc[1:]
y = nyc_dummy.Mean_Humidity.shift(1).dropna()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [158]:
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [159]:
# Get R2 measure (indicator of accuracy 1 is perfect 0 is horrible)
rf_model.score(X_test, y_test)

0.3562061388889517

In [160]:
type(y_test)

pandas.core.series.Series

In [161]:
pd.concat([pd.Series(rf_model.predict(X_test)), y_test.reset_index(
drop=True)], axis=1)

Unnamed: 0,0,Mean_Humidity
0,61.4,47.0
1,53.4,59.0
2,63.3,71.0
3,49.5,47.0
4,53.1,77.0
5,54.9,65.0
6,61.2,67.0
7,60.2,64.0
8,71.1,73.0
9,65.6,53.0


In [162]:
sorted(zip(X.columns, rf_model.feature_importances_),
        key=lambda x: x[1], reverse=True)

[('Max_Dew_PointF', 0.14953228301662783),
 ('Min_DewpointF', 0.11764905160694469),
 ('WindDirDegrees', 0.093517104395916345),
 ('Min_Sea_Level_PressureIn', 0.082659446552322574),
 ('Max_TemperatureF', 0.078141984905771206),
 ('Max_Sea_Level_PressureIn', 0.063172755535206418),
 ('MeanDew_PointF', 0.056291331387023115),
 ('Min_TemperatureF', 0.042050388819334553),
 ('Mean_Sea_Level_PressureIn', 0.040340626769935911),
 ('Max_Gust_SpeedMPH', 0.036919915677102892),
 ('Mean_TemperatureF', 0.036231238819004773),
 ('CloudCover', 0.034648492945207436),
 ('Mean_VisibilityMiles', 0.034024414050374173),
 ('Max_Wind_SpeedMPH', 0.033104252565273662),
 ('Mean_Wind_SpeedMPH', 0.031757581344492102),
 ('PrecipitationIn', 0.030008682405904603),
 ('Min_VisibilityMiles', 0.027867402876082425),
 ('Events_Rain', 0.0036681179586609005),
 ('Max_VisibilityMiles', 0.0027966229674440389),
 ('Events_', 0.0020286391676834579),
 ('Events_Fog-Rain', 0.0014038573267167115),
 ('Events_Fog-Snow', 0.001210694257606179),


## Machine Learning Assignment
* Using the nino dataset, see if you can predict what the temperature (``air_temp_F``) will be for the next day 

In [None]:
# Data transformation from previous notebook
# col names in tao-all2.col from website
names = '''obs
year
month
day
date
latitude
longitude
zon.winds
mer.winds
humidity
air temp.
s.s.temp.'''.split('\n')

nino = pd.read_csv('../data/tao-all2.dat.gz', sep=' ', names=names, na_values='.', 
                   parse_dates=[[1,2,3]])
nino.columns = [x.replace('.', '_').replace(' ', '_') for x in nino.columns]
nino['air_temp_F'] = nino.air_temp_ * 9/5 + 32
wind_cols = [x for x in nino.columns if x.endswith('winds')]
for c in wind_cols:
    nino['{}_mph'.format(c)] = nino[c] * 2.237
pd.to_datetime(nino.date, format='%y%m%d')
nino = nino.drop('obs', axis=1)

In [163]:
nino.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178070,178071,178072,178073,178074,178075,178076,178077,178078,178079
year_month_day,1980-03-07 00:00:00,1980-03-08 00:00:00,1980-03-09 00:00:00,1980-03-10 00:00:00,1980-03-11 00:00:00,1980-03-12 00:00:00,1980-03-13 00:00:00,1980-03-14 00:00:00,1980-03-15 00:00:00,1980-03-16 00:00:00,...,1998-06-06 00:00:00,1998-06-07 00:00:00,1998-06-08 00:00:00,1998-06-09 00:00:00,1998-06-10 00:00:00,1998-06-11 00:00:00,1998-06-12 00:00:00,1998-06-13 00:00:00,1998-06-14 00:00:00,1969-12-31 23:59:59.999999996
date,800307,800308,800309,800310,800311,800312,800313,800314,800315,800316,...,980606,980607,980608,980609,980610,980611,980612,980613,980614,-4
latitude,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,8.96,8.95,8.96,8.98,8.95,8.96,8.96,8.95,8.96,-4
longitude,-109.46,-109.46,-109.46,-109.46,-109.46,-109.46,-109.46,-109.46,-109.46,-109.46,...,-140.33,-140.33,-140.33,-140.33,-140.33,-140.33,-140.32,-140.34,-140.33,-4
zon_winds,-6.8,-4.9,-4.5,-3.8,-4.2,-4.4,-3.2,-3.1,-3,-1.2,...,-6.6,-8.4,-8.4,-6.5,-6.8,-5.1,-4.3,-6.1,-4.9,-4
mer_winds,0.7,1.1,2.2,1.9,1.5,0.3,0.1,0.6,1,1,...,-4.3,-4.2,-5,-5.9,-5.3,-0.4,-3.3,-4.8,-2.3,-4
humidity,,,,,,,,,,,...,81.3,83.5,79.2,75.4,81.3,94.1,93.2,81.3,76.2,-4
air_temp_,26.14,25.66,25.69,25.57,25.3,24.72,24.66,25.17,25.59,26.71,...,27.71,27.91,27.87,27.56,27.52,26.04,25.8,27.17,27.36,-4
s_s_temp_,26.24,25.97,25.28,24.31,23.19,23.64,24.34,24.14,24.24,25.94,...,28.28,28.26,28.22,28.22,28.17,28.14,27.87,27.93,28.03,-4
air_temp_F,79.052,78.188,78.242,78.026,77.54,76.496,76.388,77.306,78.062,80.078,...,81.878,82.238,82.166,81.608,81.536,78.872,78.44,80.906,81.248,-4


In [164]:
y = nino.air_temp_F.shift()
y


0            NaN
1         79.052
2         78.188
3         78.242
4         78.026
5         77.540
6         76.496
7         76.388
8         77.306
9         78.062
10        80.078
11        81.104
12        80.348
13        79.484
14        79.142
15        78.944
16        79.232
17        78.890
18        78.206
19        77.702
20        77.306
21        77.450
22        77.630
23        79.034
24        70.664
25        70.286
26        69.998
27        69.710
28        71.168
29        71.798
           ...  
178050    80.546
178051    80.618
178052    80.762
178053    81.104
178054    81.104
178055    81.032
178056    81.320
178057    81.176
178058    80.060
178059    81.248
178060    81.176
178061    80.762
178062    80.276
178063    79.916
178064    80.402
178065    81.392
178066    79.916
178067    81.680
178068    82.166
178069    81.950
178070    81.950
178071    81.878
178072    82.238
178073    82.166
178074    81.608
178075    81.536
178076    78.872
178077    78.

In [165]:
X = nino #nyc_dummy[[x for x in nyc_dummy.columns if 'Humid' not in x]]
y = nino.air_temp_F.shift() #nyc_dummy.Mean_Humidity.shift(1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [166]:
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
X = nino[[x for x in nino.columns if x != 'year_month_day']] 
y = nino.air_temp_F.shift(1) 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [167]:
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [168]:
X = nino[[x for x in nino.columns if x != 'year_month_day']].fillna(0)
y = nino.air_temp_F.shift(1) 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [169]:
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [170]:
X[np.isinf(X)].any()


date             False
latitude         False
longitude        False
zon_winds        False
mer_winds        False
humidity         False
air_temp_        False
s_s_temp_        False
air_temp_F       False
zon_winds_mph    False
mer_winds_mph    False
year             False
dtype: bool

In [171]:
X[np.isnan(X)].any()


date             False
latitude         False
longitude        False
zon_winds        False
mer_winds        False
humidity         False
air_temp_        False
s_s_temp_        False
air_temp_F       False
zon_winds_mph    False
mer_winds_mph    False
year             False
dtype: bool

In [172]:
np.isnan(y)


0          True
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
178050    False
178051    False
178052    False
178053    False
178054    False
178055    False
178056    False
178057    False
178058    False
178059    False
178060    False
178061    False
178062    False
178063    False
178064    False
178065    False
178066    False
178067    False
178068    False
178069    False
178070    False
178071    False
178072    False
178073    False
178074    False
178075    False
178076    False
178077    False
178078    False
178079    False
Name: air_temp_F, Lengt

In [173]:
X = nino[[x for x in nino.columns if x != 'year_month_day']].fillna(0).iloc[1:]
y = nino.air_temp_F.shift(1).iloc[1:].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Create a model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [174]:
rf_model.score(X_test, y_test)

0.85868449344197473

In [175]:
sorted(zip(X.columns, rf_model.feature_importances_),
        key=lambda x: x[1], reverse=True)

[('air_temp_F', 0.4569108431045491),
 ('air_temp_', 0.45535301699714853),
 ('date', 0.017847800586938387),
 ('s_s_temp_', 0.015685121872361429),
 ('humidity', 0.0098803070005559906),
 ('longitude', 0.0091521191291376834),
 ('latitude', 0.0085337778065496007),
 ('zon_winds_mph', 0.0062679731202558483),
 ('zon_winds', 0.0062275015205750846),
 ('mer_winds', 0.0062220660054400651),
 ('mer_winds_mph', 0.0059965045283915837),
 ('year', 0.0019229683280967825)]

In [176]:
pd.concat([pd.Series(rf_model.predict(X_test)), y_test.reset_index(
drop=True)], axis=1)

Unnamed: 0,0,air_temp_F
0,81.771800,82.004
1,75.104600,75.218
2,81.572000,82.094
3,0.692122,-4.000
4,58.270000,72.752
5,77.829800,77.684
6,77.702000,77.594
7,82.369400,80.816
8,76.665200,77.054
9,78.486800,81.716
