# Load libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def process_df(df):
    df['year'] = pd.DatetimeIndex(df['time']).year
    df['month'] = pd.DatetimeIndex(df['time']).month
    df = df[['month','sbe_temp_avg','aggregate']]
    df.columns = ['month','temp','flag']
    #df['day'] = pd.DatetimeIndex(df['time']).day
    #df = df[['month','day','sbe_temp_avg','aggregate']]
    #df.columns = ['month','day','temp','flag']
    #df = df.loc[(df['flag']==1) | (df['flag']==4)]
    #df.loc[df['flag'] == 4,'flag'] = 0
    df = df.round(2)
    return df

# Load and Process dataset

In [2]:
df2019 = pd.read_csv('MaceHeadTemp2019.csv')
df2019 = process_df(df2019)
df2020 = pd.read_csv('MaceHeadTemp2020.csv')
df2020 = process_df(df2020)

In [3]:
df2019.head(10)

Unnamed: 0,month,temp,flag
0,1,9.68,3.0
1,1,9.67,3.0
2,1,9.66,3.0
3,1,9.66,3.0
4,1,9.66,3.0
5,1,9.66,3.0
6,1,9.67,3.0
7,1,9.67,3.0
8,1,9.67,3.0
9,1,9.67,3.0


In [4]:
df2020.head(10)

Unnamed: 0,month,temp,flag
0,1,8.76,3.0
1,1,8.76,3.0
2,1,8.76,3.0
3,1,8.76,3.0
4,1,8.76,3.0
5,1,8.76,3.0
6,1,8.76,3.0
7,1,8.75,3.0
8,1,8.76,3.0
9,1,8.78,3.0


# Train-Test split 

In [5]:
X_train = df2019[['month','temp']]
y_train = df2019[['flag']]
X_test = df2020[['month','temp']]
y_test = df2020[['flag']]

# K-Nearest Neighbours

In [6]:
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train.values.ravel())
y_pred_knn = neigh.predict(X_test)
score = accuracy_score(y_test, y_pred_knn)
print("Model Accuracy:",round((score*100),2))

Model Accuracy: 97.73


In [7]:
df2020['flag_pred']=y_pred_knn
df2020[df2020['flag']==3]

Unnamed: 0,month,temp,flag,flag_pred
0,1,8.76,3.0,3.0
1,1,8.76,3.0,3.0
2,1,8.76,3.0,3.0
3,1,8.76,3.0,3.0
4,1,8.76,3.0,3.0
...,...,...,...,...
12432,10,13.68,3.0,1.0
12483,10,13.51,3.0,1.0
12666,10,12.61,3.0,1.0
14299,11,11.00,3.0,1.0


# Random Forest

In [8]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train.values.ravel())
y_pred_rf = rf.predict(X_test)
score = accuracy_score(y_test, y_pred_rf)
print("Model Accuracy:",round((score*100),2))

Model Accuracy: 99.65


In [9]:
df2020['flag_pred']=y_pred_knn
df2020[df2020['flag']==3]

Unnamed: 0,month,temp,flag,flag_pred
0,1,8.76,3.0,3.0
1,1,8.76,3.0,3.0
2,1,8.76,3.0,3.0
3,1,8.76,3.0,3.0
4,1,8.76,3.0,3.0
...,...,...,...,...
12432,10,13.68,3.0,1.0
12483,10,13.51,3.0,1.0
12666,10,12.61,3.0,1.0
14299,11,11.00,3.0,1.0
