In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_files
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import csv

## Loading Dataset 

In [2]:
dataset_beijing  = pd.read_csv('FiveCitiePMData\BeijingPM20100101_20151231.csv')

In [3]:
dataset_beijing.head()

Unnamed: 0,No,year,month,day,hour,season,PM_Dongsi,PM_Dongsihuan,PM_Nongzhanguan,PM_US Post,DEWP,HUMI,PRES,TEMP,cbwd,Iws,precipitation,Iprec
0,1,2010,1,1,0,4,,,,,-21.0,43.0,1021.0,-11.0,NW,1.79,0.0,0.0
1,2,2010,1,1,1,4,,,,,-21.0,47.0,1020.0,-12.0,NW,4.92,0.0,0.0
2,3,2010,1,1,2,4,,,,,-21.0,43.0,1019.0,-11.0,NW,6.71,0.0,0.0
3,4,2010,1,1,3,4,,,,,-21.0,55.0,1019.0,-14.0,NW,9.84,0.0,0.0
4,5,2010,1,1,4,4,,,,,-20.0,51.0,1018.0,-12.0,NW,12.97,0.0,0.0


## Data Cleaning 

In [4]:
dataset_beijing.drop(['PM_Dongsi', 'PM_Dongsihuan', 'PM_Nongzhanguan'],
                    axis=1,
                    inplace=True)

Note for parameter:

axis : {0 or ‘index’, 1 or ‘columns’}, default 0
Whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’).

inplace : bool, default False
If True, do operation inplace and return None.

In [5]:
dataset_beijing.dropna(axis=0, how='any', inplace=True)

Note for parameter:

how : {‘any’, ‘all’}, default ‘any’
Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.

‘any’ : If any NA values are present, drop that row or column.
‘all’ : If all values are NA, drop that row or column.

In [226]:
g = []

In [227]:
for i in range(49579):
    g.append(dataset_beijing.index.values[i])

In [228]:
labelEncoder = LabelEncoder()
dataset_beijing['cbwd'] = labelEncoder.fit_transform(dataset_beijing['cbwd'])

LabelEncoder:
Encode labels with value between 0 and n_classes-1.

Label Encoder vs. One Hot Encoder in Machine Learning:
https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621

## Labels for pollution

In [229]:
dataset_beijing["Result"] = 0

In [230]:
dataset_beijing.head()

Unnamed: 0,No,year,month,day,hour,season,PM_US Post,DEWP,HUMI,PRES,TEMP,cbwd,Iws,precipitation,Iprec,Result
23,24,2010,1,1,23,4,129.0,-17.0,41.0,1020.0,-5.0,3,0.89,0.0,0.0,0
24,25,2010,1,2,0,4,148.0,-16.0,38.0,1020.0,-4.0,2,1.79,0.0,0.0,0
25,26,2010,1,2,1,4,159.0,-15.0,42.0,1020.0,-4.0,2,2.68,0.0,0.0,0
26,27,2010,1,2,2,4,181.0,-11.0,63.5,1021.0,-5.0,2,3.57,0.0,0.0,0
27,28,2010,1,2,3,4,138.0,-7.0,85.0,1022.0,-5.0,2,5.36,0.0,0.0,0


In [231]:
for i in range(49579):
    if dataset_beijing["PM_US Post"][g[i]] > 75:
        dataset_beijing["Result"][g[i]] = 1        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [242]:
dataset_beijing2 = dataset_beijing

In [None]:
dataset_beijing2.to_excel('bejing_pollutionlabels.xlsx')

In [8]:
dataset_beijing2 = pd.read_excel('bejing_pollutionlabels.xlsx')

## Scaling and Splitting Data

In [9]:
y = dataset_beijing2['Result']
X = dataset_beijing2.drop('Result', axis=1)

In [10]:
standardScaler=StandardScaler()
X_scaled=standardScaler.fit_transform(X)

Note:

Q)
  Why do data scientists use Sklearn’s StandardScaler and what does it do?

A)  
 It turns out that standardizing your data is much more than having the number of row items in your CSV equal the number of labels (headers) the CSV has. Standardization is a bit more than that, and as the documentation states, “it is a common requirement for many machine learning estimators: they might behave badly if the individual feature do not more or less look like standard normally distributed data…”.

This means that before you start training or predicting on your dataset, you first need to eliminate the “oddballs”. You need to remove values that aren’t centered around 0, because they might throw off the learning your algorithm is doing.

https://medium.com/@oprearocks/why-do-data-scientists-use-sklearns-standardscaler-and-what-does-it-do-9d93e248eb4

In [17]:
print("X_scaled.shape:",X_scaled.shape)
print("y.shape:",y.shape)

X_scaled.shape: (49579, 15)
y.shape: (49579,)


In [18]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.20, random_state = 0)

In [19]:
print("X_train.shape:",X_train.shape)
print("y_train.shape:",y_train.shape)
print("X_test.shape:",X_test.shape)
print("y_test.shape",y_test.shape)

X_train.shape: (39663, 15)
y_train.shape: (39663,)
X_test.shape: (9916, 15)
y_test.shape (9916,)


## Training 

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(verbose=10, max_iter=100)

In [21]:
model.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=10, warm_start=False)

In [22]:
model.score(X_test, y_test)

0.9987898346107301