Import the required libraries and load the dataset

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns

# Load the raw daily-steps table; the next cell pivots its Id, ActivityDay and StepTotal columns.
df = pd.read_csv("dailySteps_merged.csv")

Separate the step counts from the other data

In [None]:
# Reshape the long table into one row per Id and one column per ActivityDay.
# ActivityDay is parsed into real dates first so the pivoted columns come out in
# chronological order; as plain strings, non-zero-padded dates such as
# "5/10/2016" would sort before "5/2/2016" and scramble the day sequence that
# later cells relabel positionally as Day1, Day2, ...
# NOTE(review): assumes ActivityDay holds date strings pandas can parse - confirm.
df = (
    df.assign(ActivityDay=pd.to_datetime(df['ActivityDay']))
      .pivot(index='Id', columns='ActivityDay', values=['StepTotal'])
)
# `values` is a list, so the result has a two-level column header; the CSV
# written here carries extra header lines, which a later cell removes as the
# 'ActivityDay' and 'Id' rows after reading the file back.
df.to_csv('first_table.csv')

Rename the column headers to Day1, Day2, ...

In [None]:
# Re-read the pivoted table (its first column becomes the index) and relabel
# every column by position, left to right, as Day1, Day2, ...
df = pd.read_csv('first_table.csv', index_col=0)
day_labels = {
    old_name: 'Day' + str(position)
    for position, old_name in enumerate(df.columns, start=1)
}
df = df.rename(columns=day_labels)
df.head()

Delete the ActivityDay and Id rows, and drop rows with 10 or more null values

In [None]:
# Remove the two rows labelled 'ActivityDay' and 'Id' from the index.
df = df.drop(['ActivityDay', 'Id'], axis=0)
# Keep only the rows that have fewer than 10 missing entries.
df = df[df.isna().sum(axis=1).lt(10)]

In [None]:
# Number of missing entries remaining in each row
df.isna().sum(axis='columns')

Interpolate the remaining NaN values linearly

In [None]:
# Coerce every column to a numeric dtype; anything that cannot be parsed as a
# number becomes NaN so it is filled by the interpolation below.
df = df.apply(pd.to_numeric, errors='coerce')

# Fill the gaps along each row (axis=1, i.e. across consecutive days).
# limit_direction='both' also fills NaNs that sit before a row's first valid
# value; with 'forward' those leading gaps would survive and the integer cast
# in the next cell would fail on them.
df = df.interpolate(method='linear', limit_direction='both', axis=1)

In [None]:
# Cast every step-count column to 64-bit integers in a single operation
df = df.astype(np.int64)

In [None]:
# Display the table after the integer conversion
df

Since the dataset we are using is very small, we shall use permutation to create a synthetic dataset

In [None]:
from itertools import permutations

# Grow the table with synthetic rows. For each column, take the values held by
# the first two rows, walk through both orderings of that pair, and for every
# value encountered append a copy of the first row in which only the current
# column is replaced by that value (four new rows per column).
for position, target in enumerate(df.columns):
    leading_pair = [df[target].iloc[0], df[target].iloc[1]]
    for ordering in permutations(leading_pair):
        for substitute in ordering:
            synthetic_row = [
                substitute if idx == position else df[name].iloc[0]
                for idx, name in enumerate(df.columns)
            ]
            # The current row count is used as the label of the appended row.
            df.loc[len(df)] = synthetic_row

In [None]:
# Persist the table (index included) so the next stage can reload it from disk
df.to_csv('initial_table.csv')

In [None]:
# Reload the saved table, using its first column as the index, and display it
df = pd.read_csv('initial_table.csv',index_col=0)
df

Create a table with columns: 'Initial Steps','Average Steps', 'Last Week Steps', 'Day Number','DiPS'

In [None]:
# Build one training example per (row, day) pair, skipping the first and the
# last day column of `df`:
#   Initial Steps   - the row's value in the first day column
#   Last Week Steps - the row's value in the current day column
#   Average Steps   - absolute mean of the two values above
#   Day Number      - numeric suffix of the current column name (e.g. 'Day7' -> 7)
#   DiPS            - 100 if the following day's value exceeds Initial Steps, else 0
# Assumes the columns are named Day1..DayN and appear in that order.
final_columns = ['Initial Steps', 'Average Steps', 'Last Week Steps', 'Day Number', 'DiPS']
day_columns = list(df.columns)

# Collect plain records and build the frame once: growing a DataFrame row by row
# is quadratic, and DataFrame.append no longer exists in current pandas.
records = []
for index in df.index:
    t_int = df.at[index, day_columns[0]]
    # Pair each interior day with the day after it. The last column is only ever
    # used as a "next day", whatever the number of day columns is.
    for col, next_col in zip(day_columns[1:-1], day_columns[2:]):
        t_lws = df.at[index, col]
        records.append({
            'Initial Steps': t_int,
            'Average Steps': abs((t_int + t_lws) / 2),
            'Last Week Steps': t_lws,
            'Day Number': int(col[3:]),
            'DiPS': 100 if df.at[index, next_col] > t_int else 0,
        })

# drop_duplicates returns a new frame, so its result has to be kept for the
# de-duplication to take effect.
final = (
    pd.DataFrame(records, columns=final_columns)
      .drop_duplicates()
      .reset_index(drop=True)
)
final

In [None]:
# Write the feature table without its index; the classification cells below read this file
final.to_csv('dataset.csv', index=False)

# Classification
Now that our dataset is ready, we shall use ensemble classifier algorithms to train the model.

Since there are various different models, we shall try all of them one by one.

## Bagging Algorithms

### 1. Bagged Decision Trees

In [None]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#Load the dataset and assign X and Y
dataset = pd.read_csv('dataset.csv')
Y = dataset.DiPS
features = ['Initial Steps','Average Steps','Last Week Steps','Day Number']
X = dataset[features]

#Initialize model and train it.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
# The tree is passed positionally: the keyword for this argument was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works on both sides of that rename.
b_model = BaggingClassifier(cart, n_estimators=100, random_state=0)

# Find the mean prediction rate over the 10 folds
b_results = model_selection.cross_val_score(b_model, X, Y, cv=kfold)
print(b_results.mean())


### 2. Random Forest

In [None]:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

#Load the dataset and assign X and Y
dataset = pd.read_csv('dataset.csv')
Y = dataset.DiPS
features = ['Initial Steps','Average Steps','Last Week Steps','Day Number']
X = dataset[features]

#Initialize model and train it.
kfold = model_selection.KFold(n_splits=10)
# random_state is fixed so the reported score is reproducible between runs,
# matching the seeded Bagging / AdaBoost / Gradient Boosting cells.
r_model = RandomForestClassifier(n_estimators=100, max_features=4, random_state=0)

# Find the mean prediction rate over the 10 folds
r_results = model_selection.cross_val_score(r_model, X, Y, cv=kfold)
print(r_results.mean())


### 3. Extra Trees

In [None]:
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier

#Load the dataset and assign X and Y
dataset = pd.read_csv('dataset.csv')
Y = dataset.DiPS
features = ['Initial Steps','Average Steps','Last Week Steps','Day Number']
X = dataset[features]

#Initialize model and train it.
kfold = model_selection.KFold(n_splits=10)
# random_state is fixed so the reported score is reproducible between runs,
# matching the seeded Bagging / AdaBoost / Gradient Boosting cells.
e_model = ExtraTreesClassifier(n_estimators=100, max_features=4, random_state=0)

# Find the mean prediction rate over the 10 folds
e_results = model_selection.cross_val_score(e_model, X, Y, cv=kfold)
print(e_results.mean())


## Boosting Algorithms

### 1. AdaBoost

In [None]:
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier

# Read the prepared dataset back in and split it into predictors (X) and target (Y)
dataset = pd.read_csv('dataset.csv')
features = ['Initial Steps','Average Steps','Last Week Steps','Day Number']
X = dataset[features]
Y = dataset['DiPS']

# AdaBoost ensemble of 100 estimators, evaluated with 10-fold cross-validation
ab_model = AdaBoostClassifier(n_estimators=100, random_state=0)
kfold = model_selection.KFold(n_splits=10)
ab_results = model_selection.cross_val_score(ab_model, X, Y, cv=kfold)

# Report the mean cross-validation score across the folds
print(ab_results.mean())


### 2. Stochastic Gradient Boosting

In [None]:
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier

# Read the prepared dataset back in and split it into predictors (X) and target (Y)
dataset = pd.read_csv('dataset.csv')
features = ['Initial Steps','Average Steps','Last Week Steps','Day Number']
X = dataset[features]
Y = dataset['DiPS']

# Gradient boosting with 100 stages, evaluated with 10-fold cross-validation.
# NOTE(review): no `subsample` argument is passed; confirm the library default
# gives the "stochastic" variant named in the section heading.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=0)
kfold = model_selection.KFold(n_splits=10)
gb_results = model_selection.cross_val_score(gb_model, X, Y, cv=kfold)

# Report the mean cross-validation score across the folds
print(gb_results.mean())

## Voting Ensemble

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Read the prepared dataset back in and split it into predictors (X) and target (Y)
dataset = pd.read_csv('dataset.csv')
features = ['Initial Steps','Average Steps','Last Week Steps','Day Number']
X = dataset[features]
Y = dataset['DiPS']

kfold = model_selection.KFold(n_splits=10)

# The three base learners that take part in the vote
model1, model2, model3 = LogisticRegression(), DecisionTreeClassifier(), SVC()
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# Combine the base learners into a single voting classifier
ensemble = VotingClassifier(estimators)

# Report the mean cross-validation score across the folds
ve_results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(ve_results.mean())

Now that we have tried several different ensemble classifiers, let us summarize the prediction accuracy of each model.

## Summary

In [None]:
# Each entry holds the arguments for one print() call, in output order:
# a lone heading, or a label followed by that model's mean cross-validation score.
report_lines = [
    ("Mean Prediction Accuracy of various models:",),
    ("\nBagging Algorithms",),
    ("1. Bagged Decision Trees: ", b_results.mean()),
    ("2. Random Forest: ", r_results.mean()),
    ("3. Extra Trees: ", e_results.mean()),
    ('\nBoosting Algorithms',),
    ("1. AdaBoost: ", ab_results.mean()),
    ("2. Stochastic Gradient Boosting ", gb_results.mean()),
    ('\nVoting Ensemble:', ve_results.mean()),
]
for parts in report_lines:
    print(*parts)