# Zelp Interview Assignment(ML Interview Task)
## Sohyun Park

In [18]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split

## 1. Preparing Data

1) Load Data

In [19]:
def dataPrep(filename):
    data = pd.read_csv(filename)
    data['datetime'] = pd.to_datetime(data.datetime)
    
    seconds = []

    for i in range(len(data)):
        t = data['datetime'][i].time()
        s = int(datetime.timedelta(hours=t.hour,minutes=t.minute,seconds=t.second).total_seconds())
        seconds.append(s)

    data['seconds'] = seconds
    
    return data

2) Check Data

In [21]:
data = dataPrep('train_imu.csv')
data.dtypes

Unnamed: 0     int64
Ax             int64
Ay             int64
Az             int64
Gx             int64
Gy             int64
Gz             int64
datetime      object
labels         int64
seconds        int64
dtype: object

3) Preprocess Data for Training and Testing

In [22]:
X_all = data.drop(['labels','datetime'], axis = 1)
y_all = data['labels']

num_test = 0.10

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=29)

## 2. Applying Algorithms

In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, KFold

In [25]:
def run_kfold(model, X_all, y_all):
    
    kf = KFold(n_splits = 10, shuffle = True)
    outcomes = []
    
    fold = 0
    
    for train_index, test_index in kf.split(X_all):
        
        fold += 1
        
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = f1_score(y_test, predictions, average='micro')
        outcomes.append(accuracy)
        
    mean_outcome = np.mean(outcomes)
    
    print("Cross-Validation Score: %s" % "{0:.3%}".format(mean_outcome))
    
def classification_model(model, X_train, y_train, X_test, y_test):
    
    model.fit(X_train, y_train)   
    predictions = model.predict(X_test)
    accuracy = f1_score(y_test, predictions, average='micro')
    
    print("Accuracy: %s" % "{0:.3%}".format(accuracy))
    
    run_kfold(model, X_all, y_all)

**1) LogisticRegression**

In [26]:
model = LogisticRegression(solver='lbfgs',multi_class='ovr')
classification_model(model, X_train, y_train, X_test, y_test)

KeyboardInterrupt: 

**2) Naive Bayes**

In [7]:
model = GaussianNB()
classification_model(model, X_train, y_train, X_test, y_test)

Accuracy: 57.970%
Cross-Validation Score: 58.027%


**3) Decision Tree**

In [8]:
model = DecisionTreeClassifier()
classification_model(model, X_train, y_train, X_test, y_test)

Accuracy: 95.868%
Cross-Validation Score: 96.148%


**4) Random Forest**

In [6]:
model = RandomForestClassifier()
classification_model(model, X_train, y_train, X_test, y_test)



Accuracy: 92.870%
Cross-Validation Score: 92.745%


**5) MLP**

In [9]:
model = MLPClassifier()
classification_model(model, X_train, y_train, X_test, y_test)

Accuracy: 73.839%
Cross-Validation Score: 74.240%


In [11]:
model = MLPClassifier(hidden_layer_sizes=10)
classification_model(model, X_train, y_train, X_test, y_test)

Accuracy: 73.839%
Cross-Validation Score: 68.948%


## 3. Choosing One Algorithm for the Dataset

In [34]:
def finalClassificationModel(model, X_train, y_train, X_test, y_test):
    
    #parameters = {
    #    'max_features': ['log2', 'sqrt','auto'], 
    #    'criterion': ['entropy', 'gini'],
    #    'max_depth': [2, 3, 5, 10, None]
    #         }
    #accuracy = make_scorer(accuracy_score)
    #grid_obj = GridSearchCV(model, parameters, scoring=accuracy)
    #grid_obj = grid_obj.fit(X_train, y_train)
    
    #model = grid_obj.best_estimator_
    
    model.fit(X_train, y_train)   
    predictions = model.predict(X_test)
    accuracy = f1_score(y_test, predictions, average='micro')
    
    print("Accuracy: %s" % "{0:.3%}".format(accuracy))
    
    run_kfold(model, X_all, y_all)
    
    return model

In [35]:
model = DecisionTreeClassifier()
trained_model = finalClassificationModel(model, X_train, y_train, X_test, y_test)

Accuracy: 95.865%
Cross-Validation Score: 96.185%


## 4. Test Data Output(For testing)

In [None]:
test_data = dataPreprop(testDataFile)

timestamp = test_data['datetime']
predictions = trained_model.predict(test_data.drop(['datetime'], axis = 1))

output = pd.DataFrame({ 'datetime' : timestamp, 'labels': predictions })
output.to_csv('final_result.csv', index = False)

#str(datetime.timedelta(seconds=seconds))

## 5. Conclusion

1. Date and time variables

I converted the data type of date and time to int for using date and time(especially time) as a feature of model training. I checekd out that with date and time feature, compared to its absence, the accuracy increased by about 10 percent.

2. Algorithms for the dataset

I compared about 5 ML algorithms for the task. And found out Decision Tree is the best classifier among those algorithms.

3. Parameters for a Decision Tree classifier

I applied various hyperparameters to a Decision Tree model, but it turned out just basic a Decision Tree model provided by Scikit Learn is the best status for the given dataset.