# Ingest Data

In [1]:
import pandas as pd

In [2]:
# Load the CSV export produced by the Django backend into a dataframe
data = pd.read_csv(
    "new_format_data_cowdy.csv",
    sep=',',
    header=0,
)

# Preprocess data

In [3]:
from Functions import pre_process_data

In [4]:
# Turn the raw export into one row per reading with the columns
# device_id, time, x, y, z (see the preview in the next cell).
df = pre_process_data(data)

In [5]:
# Preview the pre-processed readings (all devices)
df.head()

Unnamed: 0,device_id,time,x,y,z
0,9729,2019-11-14 23:59:30,0.019531,0.0,-0.988281
1,272B,2019-11-14 23:59:30,0.039062,0.0,-1.019531
2,262B,2019-11-14 23:59:30,0.027344,0.0,0.96875
3,2B2B,2019-11-14 23:59:30,0.019531,0.0,-1.0
4,272B,2019-11-14 23:59:31,0.027344,0.0,-1.007812


In [6]:
# Number of pre-processed rows across all devices
len(df)

50000

# Remove duplicates

In [7]:
from Functions import remove_duplicates

In [8]:
# Restrict the frame to the readings of a single animal (device 262B)
df = df.loc[df['device_id'].eq('262B')]

In [9]:
# Rows belonging to the selected device, before de-duplication
len(df)

12441

In [10]:
# Drop duplicate readings; the row counts recorded before and after this cell
# differ by 9 (12441 -> 12432).
df = remove_duplicates(df)

In [11]:
# Row count after de-duplication
len(df)

12432

In [12]:
# Only one device is left after the subset above, so remove the device_id column
df = df.drop('device_id', axis='columns')

In [13]:
# Preview: device_id is gone; time, x, y and z remain
df.head()

Unnamed: 0,time,x,y,z
0,2019-11-14 23:59:30,0.027344,0.0,0.96875
1,2019-11-14 23:59:31,0.019531,0.0,0.957031
2,2019-11-14 23:59:33,0.027344,-0.007812,0.957031
3,2019-11-14 23:59:34,0.027344,-0.007812,0.957031
4,2019-11-14 23:59:35,0.027344,-0.007812,0.96875


# Smoothing Noise

In [14]:
from Functions import smooth_noise

In [15]:
# Smooth the readings with a rolling window of size 5.
# NOTE(review): this was described as a 5-second window, but the recorded row
# count drops by exactly 4 (12432 -> 12428) and the timestamps have gaps, which
# looks like a 5-sample window — confirm in Functions.smooth_noise.
df = smooth_noise(df, window_size=5)

In [16]:
# Row count after smoothing
len(df)

12428

# Calculate differential values

In [17]:
from Functions import calculate_differential_values

In [18]:
# Derive the differential features; the result (previewed in the next cell) is
# indexed by time and holds x_diff, y_diff, z_diff and sum_diff.
# NOTE(review): in that preview sum_diff matches |x_diff| + |y_diff| + |z_diff| — confirm.
dataframe = calculate_differential_values(df)

In [19]:
# Preview the differential features
dataframe.head()

Unnamed: 0_level_0,x_diff,y_diff,z_diff,sum_diff
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-11-14 23:59:39,-0.001562,-0.001563,-0.002344,0.005469
2019-11-14 23:59:40,0.0,0.0,0.002344,0.002344
2019-11-14 23:59:42,0.0,0.001563,0.0,0.001563
2019-11-14 23:59:44,0.0,0.0,0.0,0.0
2019-11-14 23:59:45,0.0,0.0,-0.002344,0.002344


# Clustering by K-means

In [20]:
from Functions import clustering_function

In [21]:
# Assign every row to one of 3 K-means clusters (column Clus_km in the preview
# that follows).
# NOTE(review): init="random" is passed and no seed is visible here, so the
# cluster numbering may differ between runs unless clustering_function fixes a
# random state internally — confirm.
km_df = clustering_function(dataframe, number_of_clusters=3, init = "random", n_init=20)

In [22]:
# Preview the features together with their cluster label
km_df.head()

Unnamed: 0,x_diff,y_diff,z_diff,sum_diff,Clus_km
0,-0.001562,-0.001563,-0.002344,0.005469,2
1,0.0,0.0,0.002344,0.002344,0
2,0.0,0.001563,0.0,0.001563,1
3,0.0,0.0,0.0,0.0,1
4,0.0,0.0,-0.002344,0.002344,2


# Train SVC

In [23]:
from Functions import train_svc

In [26]:
# Train the SVC on the cluster-labelled frame and export the fitted model.
# The function is imported in this cell because the import cell above only
# brings in `train_svc`, which leaves `train_and_export_svc` undefined on a
# fresh kernel (NameError under Restart & Run All).
# NOTE(review): output_filename starts with "/" while the prediction step later
# loads "svm_for_cow.pkl" without it — confirm how train_and_export_svc
# resolves this path.
from Functions import train_and_export_svc

train_and_export_svc(km_df, output_filename="/svm_for_cow.pkl")

# Preprocess, Subset, Remove Duplicates, Predict using the SVC model, Calculate Activity Level, and Calculate Activity Index

#### Preprocess, Subset and Remove Duplicates

In [28]:
from Functions import *

In [29]:
# Raw backend export: the `data` column carries the device payload as a
# byte-string literal, next to id, created_at and last_modified.
data.head()

Unnamed: 0,id,data,created_at,last_modified
0,2367349,"b'9729,00050000FF03,64,2019-11-14T17:59:30Z,2a*'",2019-11-15 00:00:00.142968,2019-11-15 00:00:00.142992
1,2367350,"b'272B,000A0000FEFB,64,2019-11-14T17:59:30Z,2a*'",2019-11-15 00:00:00.294080,2019-11-15 00:00:00.294104
2,2367351,"b'262B,0007000000F8,64,2019-11-14T17:59:30Z,2a*'",2019-11-15 00:00:01.070546,2019-11-15 00:00:01.070574
3,2367352,"b'2B2B,00050000FF00,64,2019-11-14T17:59:30Z,2a*'",2019-11-15 00:00:01.221148,2019-11-15 00:00:01.221172
4,2367353,"b'272B,00070000FEFE,64,2019-11-14T17:59:31Z,2a*'",2019-11-15 00:00:01.373052,2019-11-15 00:00:01.373076


In [30]:
# Start the prediction pipeline from the raw export again
df = pre_process_data(data)

In [31]:
# Keep only the rows reported by device 262B, i.e. a single animal
df = df.loc[df['device_id'].eq('262B')]

In [32]:
# Only one device remains, so the device_id column can be removed
df = df.drop('device_id', axis='columns')

In [33]:
# Drop duplicate readings.
# NOTE(review): here duplicates are removed after device_id has been dropped,
# while the training section removes them before dropping it — confirm that
# remove_duplicates does not depend on the device_id column.
df = remove_duplicates(df)

In [34]:
# Preview the first three de-duplicated readings
df.head(3)

Unnamed: 0,time,x,y,z
0,2019-11-14 23:59:30,0.027344,0.0,0.96875
1,2019-11-14 23:59:31,0.019531,0.0,0.957031
2,2019-11-14 23:59:33,0.027344,-0.007812,0.957031


#### Predict using the SVC model

In [35]:
from Functions import predict_using_svc

In [43]:
# Label the readings with the exported classifier (column `labels` in the
# preview that follows).
# NOTE(review): the previewed features are differences of the raw readings
# (first x_diff is 0.019531 - 0.027344), whereas the training features were
# computed after smooth_noise — confirm predict_using_svc is meant to skip
# smoothing.
# NOTE(review): a .pkl file is presumably unpickled here; only load model
# files from a trusted source.
cow = predict_using_svc(df, classifier_filepath="svm_for_cow.pkl")

In [44]:
# Preview the first three labelled rows (indexed by time)
cow.head(3)

Unnamed: 0_level_0,x_diff,y_diff,z_diff,sum_diff,labels
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-11-14 23:59:31,-0.007812,0.0,-0.011719,0.019531,2
2019-11-14 23:59:33,0.007812,-0.007812,0.0,0.015625,1
2019-11-14 23:59:34,0.0,0.0,0.0,0.0,1


#### Calculate Activity Level

In [45]:
# As a precaution, if the timestamp is the index, turn it back into a regular
# `time` column. The check is explicit so the cell can be re-run safely: the
# previous unconditional reset inside a bare `except: pass` inserted a stray
# `index` column when the frame already had a default index, and silently hid
# any other error.
if cow.index.name == 'time' and 'time' not in cow.columns:
    cow = cow.reset_index()

In [46]:
# Preview: `time` should now be a regular column
cow.head()

Unnamed: 0,time,x_diff,y_diff,z_diff,sum_diff,labels
0,2019-11-14 23:59:31,-0.007812,0.0,-0.011719,0.019531,2
1,2019-11-14 23:59:33,0.007812,-0.007812,0.0,0.015625,1
2,2019-11-14 23:59:34,0.0,0.0,0.0,0.0,1
3,2019-11-14 23:59:35,0.0,0.0,0.011719,0.011719,0
4,2019-11-14 23:59:39,-0.007812,0.0,-0.011719,0.019531,2


In [48]:
# Aggregate the labelled readings into activity levels. The preview in the next
# cell shows one row per hour; activity_level_1 repeats the previous row's
# activity_level, and the _24/_48/_72 columns are empty there.
# NOTE(review): calculate_activity_level is only in scope through the earlier
# `from Functions import *`; an explicit import would make that visible.
cow_modified = calculate_activity_level(cow)

In [49]:
# Preview the hourly activity levels and their lagged columns
cow_modified.head()

Unnamed: 0,time,activity_level,activity_level_1,activity_level_24,activity_level_48,activity_level_72
0,2019-11-14 23:59:31,1757.7,,,,
1,2019-11-15 00:59:31,0.0,1757.7,,,
2,2019-11-15 01:59:31,28.2,0.0,,,
3,2019-11-15 02:59:31,0.0,28.2,,,
4,2019-11-15 03:59:31,1735.2,0.0,,,


#### Calculate Activity Index

In [50]:
from Functions import calculate_activity_index

In [51]:
# Compute the activity index from the hourly activity levels.
# NOTE(review): the recorded output of this cell looks like a warning pointing
# at the `trend` line, which divides by activity_level_1 — that column holds
# 0.0 and NaN in the preview above — and activity_index is NaN for every
# previewed row. Confirm the result is usable for this data set.
activity_index_df = calculate_activity_index(cow_modified)

  trend = (activity_df.loc[i, 'activity_level'] - activity_df.loc[i, 'activity_level_1'])/activity_df.loc[i, 'activity_level_1']


In [52]:
# Preview the activity index per hour
activity_index_df.head()

Unnamed: 0,time,activity_index
0,2019-11-14 23:59:31,
1,2019-11-15 00:59:31,
2,2019-11-15 01:59:31,
3,2019-11-15 02:59:31,
4,2019-11-15 03:59:31,


# Checking the time difference

### Ingest Data

In [1]:
import pandas as pd

In [2]:
# Load the CSV export produced by the Django backend into a dataframe
data = pd.read_csv(
    "new_format_data_cowdy.csv",
    sep=',',
    header=0,
)

### Preprocess data

In [3]:
from Functions import pre_process_data

In [4]:
# Turn the raw export into one row per reading with the columns
# device_id, time, x, y, z (see the preview in the next cell).
df = pre_process_data(data)

In [5]:
# Preview the pre-processed readings (all devices)
df.head()

Unnamed: 0,device_id,time,x,y,z
0,9729,2019-11-14 23:59:30,0.019531,0.0,-0.988281
1,272B,2019-11-14 23:59:30,0.039062,0.0,-1.019531
2,262B,2019-11-14 23:59:30,0.027344,0.0,0.96875
3,2B2B,2019-11-14 23:59:30,0.019531,0.0,-1.0
4,272B,2019-11-14 23:59:31,0.027344,0.0,-1.007812


In [6]:
# Number of pre-processed rows across all devices
len(df)

50000

### Remove duplicates

In [7]:
from Functions import remove_duplicates

In [8]:
# Restrict the frame to the readings of a single animal (device 262B)
df = df.loc[df['device_id'].eq('262B')]

In [9]:
# Rows belonging to the selected device, before de-duplication
len(df)

12441

In [10]:
# Drop duplicate readings; the row counts recorded before and after this cell
# differ by 9 (12441 -> 12432).
df = remove_duplicates(df)

In [11]:
# Row count after de-duplication
len(df)

12432

In [12]:
# Only one device is left after the subset above, so remove the device_id column
df = df.drop('device_id', axis='columns')

In [13]:
# Preview: device_id is gone; time, x, y and z remain
df.head()

Unnamed: 0,time,x,y,z
0,2019-11-14 23:59:30,0.027344,0.0,0.96875
1,2019-11-14 23:59:31,0.019531,0.0,0.957031
2,2019-11-14 23:59:33,0.027344,-0.007812,0.957031
3,2019-11-14 23:59:34,0.027344,-0.007812,0.957031
4,2019-11-14 23:59:35,0.027344,-0.007812,0.96875


In [28]:
# Gap between the first two readings of this device (rows labelled 0 and 1)
df.loc[1, 'time'] - df.loc[0, 'time']

Timedelta('0 days 00:00:01')

### Check for the time differences

In [30]:
from datetime import timedelta

In [36]:
# Two independent, empty accumulators: rows that pass the time-gap check (df2)
# and rows that fail it (mismatch)
df2, mismatch = [], []

In [37]:
# Split the readings by the gap to the previous reading: rows whose gap equals
# EXPECTED_STEP go to df2, every other row goes to mismatch. The first row has
# no predecessor and is always kept in df2.
#
# Series.diff() replaces the former row-by-row df.loc loop, whose bare
# `except: pass` silently dropped any row it could not look up. This relies on
# the rows of df being stored in the order that loop visited them (index
# 0..n-1, as in the preview above).
#
# NOTE(review): consecutive readings are about 1 second apart (see the
# Timedelta shown above), so a 1-minute step sends nearly every row to
# mismatch. Confirm whether timedelta(seconds=1) was intended; the value is
# left unchanged here.
EXPECTED_STEP = timedelta(minutes=1)

gap_to_previous = df['time'].diff()
on_schedule = gap_to_previous == EXPECTED_STEP
if len(on_schedule) > 0:
    # diff() yields NaT for the first row; keep that row, as the loop did
    on_schedule.iloc[0] = True

df2 = df[on_schedule]
mismatch = df[~on_schedule]
            

In [33]:
# Collect the rows that passed the time-gap check into a DataFrame
df2 = pd.DataFrame(df2)

In [34]:
# Preview the rows that passed the time-gap check
df2.head()

In [38]:
# Collect the rows that failed the time-gap check into a DataFrame and preview them
mismatch = pd.DataFrame(mismatch)
mismatch.head()

Unnamed: 0,time,x,y,z
1,2019-11-14 23:59:31,0.019531,0.0,0.957031
2,2019-11-14 23:59:33,0.027344,-0.007812,0.957031
3,2019-11-14 23:59:34,0.027344,-0.007812,0.957031
4,2019-11-14 23:59:35,0.027344,-0.007812,0.96875
5,2019-11-14 23:59:39,0.019531,-0.007812,0.957031
