<p style="font-family: Arial; font-size:2.75em;color:purple; font-weight:bold"><br>

Classification with scikit-learn

<br><br></p>


In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Creating a Pandas DataFrame from a CSV file<br><br></p>


In [2]:
# Read the daily weather CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./weather/daily_weather.csv')

In [3]:
# Show the column labels of the loaded DataFrame.
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Clean the data<br><br></p>


In [4]:
# Remove the 'number' column. Unlike `del data['number']`, this does not raise
# KeyError when the cell is executed a second time (errors='ignore'), so the
# cell stays re-runnable.
data = data.drop('number', axis=1, errors='ignore')

In [5]:
# Row count before rows with missing values are removed.
before_rows = len(data)

In [6]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [7]:
# Row count after rows with missing values were removed.
after_rows = len(data)

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

How many rows were dropped due to cleaning?<br><br></p>


In [8]:
# Difference between the row counts recorded before and after dropna().
before_rows - after_rows

31

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Convert to a classification task <br><br>
Binarize relative_humidity_3pm: 1 if above 24.99, otherwise 0<br><br></p>


In [9]:
# Binary target: 1 when the 3pm relative humidity exceeds 24.99, otherwise 0.
is_humid = data['relative_humidity_3pm'] > 24.99
data['label'] = is_humid.astype(int)

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

The target is stored in 'y'
<br><br></p>


In [10]:
# Keep the target as a one-column DataFrame, copied so it is independent of `data`.
y = data.loc[:, ['label']].copy()

In [11]:
# Preview the first five raw 3pm humidity readings.
data['relative_humidity_3pm'].head(5)

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [12]:
# Preview the first five binarized labels.
y.head(5)

Unnamed: 0,label
0,1
1,0
2,0
3,0
4,1


<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Use 9am sensor signals as features to predict humidity at 3pm
<br><br></p>


In [25]:
# Names of the 9am columns used as model inputs.
features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [26]:
# Feature matrix: the selected columns, copied so it is independent of `data`.
X = data.loc[:, features].copy()

In [27]:
# Show the column labels of the feature frame X.
X.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')

In [28]:
# Show the column labels of the target frame y.
y.columns

Index(['label'], dtype='object')

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Perform train/test split

<br><br></p>


In [64]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Fit on train set
<br><br></p>


In [65]:
# Decision tree capped at 10 leaf nodes, with a fixed random_state.
estimator = DecisionTreeClassifier(
    random_state=0,
    max_leaf_nodes=10,
)
# Fit on the training split; the fitted estimator is the cell's displayed value.
estimator.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Predict on test set 

<br><br></p>


In [66]:
# Predict a label for every row of the held-out test features.
predictions = estimator.predict(X=X_test)

In [67]:
# First ten predicted labels.
predictions[0:10]

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [68]:
# First ten true labels from the test split, for comparison with the predictions.
y_test['label'].head(10)

456     0
845     0
693     1
259     1
723     1
224     1
300     1
442     0
585     1
1057    1
Name: label, dtype: int64

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold"><br>

Measure accuracy of the classifier
<br><br></p>


In [70]:
# Accuracy of the test-set predictions against the true test labels.
accuracy_score(y_true=y_test, y_pred=predictions)

0.81534090909090906