In [2]:
# import the modules we will need
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Load the data and examine what it looks like
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,0,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
1,1,2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1
2,2,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
3,3,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
4,4,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [6]:
"""
We want to predict whether someone is in the room - that is, we want to predict whether the value for "Occupancy" 
is a 0 or a 1 (the 'Y' value) given the other values (the 'X' values)

First, we must check to see if the classes are balanced - if they are, we can use a standard classification algorithm
"""
df.Occupancy.value_counts()

0    15810
1     4750
Name: Occupancy, dtype: int64

In [7]:
"""
The classes are balanced! That means we can proceed to train our algorithm.

This block demonstrates how a train-test split is performed.
Note that I do not draw random samples; a normal train-test split would split the data randomly. 
However, it is easier simply to use the 'train_test_split()' function from scikit learn, so in your own project, I 
would recommend just using that function.
"""



#set the proportion htat you want to be the testing set
split = 0.2
train_length = int(len(df)*split)

X_train = np.array(df.iloc[:train_length+1,3:7])
y_train = np.array(df.loc[:train_length,"Occupancy"])

X_test = np.array(df.iloc[train_length:,3:7])
y_test = np.array(df.loc[train_length:,"Occupancy"])


In [8]:
"""
Here, I try one of my personal favorites - the Decision Tree classifier. Start by creating and fitting the algorithm
using the training set
"""

tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [5]:
# Now use "score" with the testing set to examine the algorithm's accuracy

tree.score(X_test,y_test)

0.9117488110678772

In [9]:
"""
Not bad! But we can do better. Let's try what sci-kit learn recommends in their infographic - 
the Linear Support Vector Machine. Ignore the convergence warnings; the algorithm will still work just fine.
"""

model = LinearSVC()
model.fit(X_train, y_train)
### NOTE: Do not worry about convergence warnings; the model can still predict with reasonable accuracy ###



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [10]:
# Looks like we can get a higher accuracy using this algorithm

model.score(X_test, y_test)

0.9613934824902723