In [26]:
%load_ext autoreload
%autoreload 2
import data_loader

from feature_extraction import extract_features, data_in_tw
#from datetime import datetime, timedelta
import pandas as pd
import pytz
import pickle

from sklearn import svm
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Length of one feature-extraction time window, expressed in seconds.
TIME_WINDOW_SEC = 30

# Accumulators filled by the feature extraction loop: one entry per window.
features, labels = [], []

In [7]:
dl = data_loader.DataLoader()

# Load ground truth (the labelled events)
df_labels = dl.load_groundtruth()

# Load the low frequency streams; both are read once here and reused for
# every labelled event.
df_location, df_activity = (
    dl.load_low_freq_data(stream_name)
    for stream_name in ('LOCATION', 'ACTIVITY_TYPE')
)

In [13]:
# Build the training samples: for every labelled event, slice each sensor
# stream to the event's time span, cut it into TIME_WINDOW_SEC windows and
# collect one feature record plus one label per window.

# Start from empty accumulators so that re-running this cell does not append
# a second copy of every sample to lists left over from a previous run.
features = []
labels = []

# Date whose high frequency data is currently held in df_battery/df_compass.
# Tracking the *current* date (instead of a set of already-seen dates) means
# the data is reloaded whenever the date changes, so an event never gets
# paired with another day's data if the ground truth is not grouped by date.
loaded_date = None

for _, row in df_labels.iterrows():
    start = row['Start Timestamp'] # + timedelta(seconds=120)
    end = row['End Timestamp'] # - timedelta(seconds=120)

    # Skip empty or inverted intervals before doing any loading work.
    if start >= end:
        continue

    # NOTE(review): only the start date's high frequency data is loaded; an
    # event running past midnight would presumably miss the following day's
    # samples -- confirm against DataLoader.load_high_freq_data.
    event_date = start.date()
    if event_date != loaded_date:
        # load high frequency data
        df_battery = dl.load_high_freq_data('BATTERY', event_date)
        df_compass = dl.load_high_freq_data('COMPASS', event_date)
        loaded_date = event_date

    # Obtain data needed for one label (one event)
    streams = {
        'Battery': data_in_tw(df_battery, start, end),
        'Compass': data_in_tw(df_compass, start, end),
        'Activity': data_in_tw(df_activity, start, end),
        'Location': data_in_tw(df_location, start, end),
    }

    # Segment data streams into time windows and extract features per window
    window_features = extract_features(streams, start, end, TIME_WINDOW_SEC)

    # extend() grows the lists in place; `features = features + ...` copied
    # the whole list on every event, which is quadratic in the sample count.
    features.extend(window_features)

    # Every window of an event carries that event's label
    labels.extend([row['Labels']] * len(window_features))

In [19]:
# Show how many feature records and labels were collected, one count per line.
print(len(features), len(labels), sep='\n')

26054
26054


In [30]:
# Data preprocessing
#
# Turns the collected feature records and labels into the numeric arrays
# `train_features` / `train_labels`, and leaves the fitted `vec`, `scaler`
# and `le` objects available for later use.

# Convert features and labels to pandas dataframe format
df_features = pd.DataFrame(features)
df_features['Labels'] = labels

# df_features.to_csv('zhengxux_features_labels.csv')

# Fill missing values by assigning the result back to the column. Calling an
# inplace method on a column selection (df[col].fillna(..., inplace=True)) is
# chained assignment: recent pandas warns about it and, with Copy-on-Write
# enabled, it leaves the DataFrame unchanged.
df_features['Battery Charging'] = df_features['Battery Charging'].interpolate(method='nearest')
# NOTE(review): 500 is presumably a worst-case accuracy placeholder for
# windows without a GPS fix -- confirm the intended value and unit.
df_features['Max GPS Accuracy'] = df_features['Max GPS Accuracy'].fillna(500)

# Drop every row that still has a missing value in any column
df_features = df_features.dropna(axis=0)

# Split the label column off again. A dedicated name is used so the ground
# truth frame `df_labels` loaded earlier is not overwritten (overwriting it
# would break a re-run of the feature extraction loop).
window_labels = df_features['Labels']
df_features = df_features.drop('Labels', axis=1)

# train_features is numpy array
vec = DictVectorizer()
train_features = vec.fit_transform(df_features.to_dict('records')).toarray()

# Normalize columns of features with min-max scaler
# NOTE(review): the scaler is fitted on all rows before the train/test split
# done later in the notebook, so held-out rows influence the scaling; consider
# fitting it on the training split only.
scaler = MinMaxScaler(copy=False)
train_features = scaler.fit_transform(train_features)

# Encode the string labels as integers
le = LabelEncoder()
le.fit(['Incar', 'Indoor', 'Outdoor'])
train_labels = le.transform(window_labels)

In [25]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train_labels,
    test_size=0.20,
    random_state=42,
)

# Fit a support vector classifier with its default settings and predict the
# held-out samples.
clf = svm.SVC()
clf.fit(X_train, y_train)
predicted_y = clf.predict(X_test)

# Confusion matrix of true vs. predicted classes
confusionmatrix = confusion_matrix(y_test, predicted_y)
print(confusionmatrix)

# Accuracy, followed by the F1 score under each averaging scheme
print(accuracy_score(y_test, predicted_y, normalize=True))
for averaging in ('micro', 'macro', 'weighted'):
    print(f1_score(y_test, predicted_y, average=averaging))

[[  48   58    4]
 [   2 4016   73]
 [   0  109  113]]
0.94438164142
0.94438164142
0.706431813933
0.940339615287


In [29]:
# Save the trained classifier to disk.
# NOTE(review): only `clf` is written here; the fitted DictVectorizer,
# MinMaxScaler and LabelEncoder from the preprocessing step are not saved by
# this cell, so code that loads the model presumably has to recreate them --
# confirm.
filename = 'zhengxuxia_model.clf'
# The with-block closes the file handle even if pickling fails, so the file
# is flushed and no descriptor is leaked (the bare open() was never closed).
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)