In [9]:
import gspread
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from google.colab import auth
from google.auth import default

In [10]:
# Authenticate this Colab session and hand the resulting credentials to gspread.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# URL of the Google Sheet to read (ensure the sheet is shared so it can be opened).
sheet_url = 'https://docs.google.com/spreadsheets/d/1SLwix8SFx3VUShDSNgh8-uu1h2f-UulPTu2t5CXWRnM/edit'

# Open the workbook, then pick the tab that holds the samples.
spreadsheet = gc.open_by_url(sheet_url)
worksheet = spreadsheet.worksheet("sample-data-fan-control")

# Fetch every cell value; the first row supplies the column names,
# all remaining rows become the records of the DataFrame.
values = worksheet.get_all_values()
header_row = values[0]
data_rows = values[1:]
df = pd.DataFrame(data_rows, columns=header_row)

In [12]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 902 entries, 0 to 901
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   time           902 non-null    object
 1   period         902 non-null    object
 2   day            902 non-null    object
 3   studio_motion  902 non-null    object
 4   dog_motion     902 non-null    object
 5   studio_temp    902 non-null    object
 6   dog_temp       902 non-null    object
 7   studio_fan     902 non-null    object
 8   dog_fan        902 non-null    object
dtypes: object(9)
memory usage: 63.5+ KB


In [18]:
!rm -r ~/.config/gspread


rm: cannot remove '/root/.config/gspread': No such file or directory


In [47]:
# clean up data
# NOTE: Series.map() silently turns every value missing from the mapping into
# NaN, and this cell overwrites `df` in place. A plain .map() therefore wipes
# the already-encoded columns to NaN when the cell is run a second time.
# encode_labels() leaves already-encoded columns alone and warns about labels
# the mapping does not know.
def encode_labels(column, mapping):
    """Encode the labels of ``column`` as the integer codes in ``mapping``.

    Args:
        column: pandas Series holding either raw labels (keys of ``mapping``)
            or codes produced by an earlier run of this cell.
        mapping: dict from raw label to integer code.

    Returns:
        ``column`` unchanged when all of its non-null values are already
        codes; otherwise ``column.map(mapping)`` (unknown labels become NaN,
        and a warning lists them).
    """
    import warnings  # local import keeps this cell self-contained

    codes = list(mapping.values())
    if column.dropna().isin(codes).all():
        return column  # already encoded by a previous run of this cell

    encoded = column.map(mapping)
    unknown = column[encoded.isna()].dropna().unique().tolist()
    if unknown:
        warnings.warn(f"Unmapped values in column '{column.name}' became NaN: {unknown}")
    return encoded


weekday_map = {'sunday': 0, 'monday': 1, 'tuesday': 2, 'wednesday': 3, 'thursday': 4, 'friday': 5, 'saturday': 6}
df['day'] = encode_labels(df['day'], weekday_map)

truth_map = {'FALSE': 0, 'TRUE': 1}
df['studio_motion'] = encode_labels(df['studio_motion'], truth_map)
df['dog_motion'] = encode_labels(df['dog_motion'], truth_map)

on_off_map = {'off': 0, 'on': 1}
df['studio_fan'] = encode_labels(df['studio_fan'], on_off_map)
df['dog_fan'] = encode_labels(df['dog_fan'], on_off_map)

# Convert an 'H:M:S' time stamp string into a number of seconds
def time_to_seconds(time_str):
    """Return the seconds represented by a colon-separated 'H:M:S' string.

    Args:
        time_str: text with exactly three integer fields separated by ':'.

    Returns:
        int: hours * 3600 + minutes * 60 + seconds.

    Raises:
        ValueError: if a field is not an integer or the field count is not three.
    """
    fields = [int(part) for part in time_str.split(':')]
    hours, minutes, seconds = fields
    return (hours * 60 + minutes) * 60 + seconds

# Convert the 'time' column from 'H:M:S' strings to seconds in the day.
# Values that are no longer strings are passed through untouched, so
# re-running this cell does not crash on already-converted integers.
df['time'] = df['time'].apply(
    lambda value: time_to_seconds(value) if isinstance(value, str) else value
)

# The sheet values arrive as text (df.info() reports object dtype), so the
# temperature columns are converted to numbers explicitly. pd.to_numeric
# raises on a value it cannot parse instead of leaving the coercion to the
# estimator, and it is a no-op on columns that are already numeric.
df['studio_temp'] = pd.to_numeric(df['studio_temp'])
df['dog_temp'] = pd.to_numeric(df['dog_temp'])

# -------------------------------------------
# separate studio fan and dog fan data
# select studio fan and dog fan state DataFrame
df_st = df[['time', 'day', 'studio_motion', 'studio_temp', 'studio_fan']]
df_do = df[['time', 'day', 'dog_motion', 'dog_temp', 'dog_fan']]

# -------------------------------------------
# separate features/inputs (X) and targets (classes: y)

# condition analysis (target data, class)
y_st = df_st['studio_fan']
y_do = df_do['dog_fan']

# independent variables (features/input)
X_st = df_st.drop(['studio_fan'], axis=1)
X_do = df_do.drop(['dog_fan'], axis=1)

# -------------------------------------------
# split training and test data (80% train / 20% test, fixed seed for repeatability)

X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(X_st, y_st, test_size=0.20, random_state=91)
X_train_do, X_test_do, y_train_do, y_test_do = train_test_split(X_do, y_do, test_size=0.20, random_state=91)


In [48]:
# init and train RandomForestClassifier for Studio Fan Model
# features: time, day, studio_motion, studio_temp

# class_weight is keyed by TARGET class label, not by feature position.
# studio_fan is encoded as 0 = off and 1 = on, so only keys 0 and 1 are
# meaningful. Keys such as 2 or 3 match no class; depending on the
# scikit-learn release they are either ignored or rejected with an error.
class_weights = {0: 10, 1: 27}  # Data Tuning: samples of class 1 (on) weigh more than class 0 (off)

model_st = RandomForestClassifier(oob_score=True, max_depth=8, random_state=90, class_weight=class_weights)
model_st.fit(X_train_st, y_train_st)

# Calculate estimated OOB (Out Of Bag) score.
# The OOB score can serve as a useful estimate of how well the random forest
# is likely to perform on unseen data without a separate validation set.
# It is still good practice to use additional evaluation techniques such as
# cross-validation to assess the model's performance thoroughly.
print(f'Studio Fan OOB SCORE: {model_st.oob_score_}')

Studio Fan OOB SCORE: 0.970873786407767


Model Selection, Evaluation and Training

In [34]:
# Count NaN entries in the studio training features and in the target.
missing_in_features = X_train_st.isnull().sum().sum()
missing_in_target = y_train_st.isnull().sum()
print("Missing values in X_train_st:", missing_in_features)
print("Missing values in y_train_st:", missing_in_target)

# Optional: list the distinct labels present in the training target.
distinct_labels = y_train_st.unique()
print("Unique classes in y_train_st:", distinct_labels)


Missing values in X_train_st: 0
Missing values in y_train_st: 0
Unique classes in y_train_st: []


In [35]:
# Impute missing feature values with the per-column mean.
# numeric_only=True restricts the mean to numeric columns; without it
# DataFrame.mean() can raise on object-dtype (text) columns.
# The result is re-bound instead of using inplace=True so the frame returned
# by train_test_split is not mutated behind the reader's back.
X_train_st = X_train_st.fillna(X_train_st.mean(numeric_only=True))


In [36]:
# Number of NaN values in y_train_st
print(y_train_st.isna().sum())


0


In [37]:
# Keep only the training rows whose target value is present.
# The mask is computed once from the target and applied to both objects.
target_present = y_train_st.notnull()
X_train_st = X_train_st[target_present]
y_train_st = y_train_st[target_present]


In [30]:
# Impute missing values in the target variable


In [29]:
# Align features and target on their row index (axis=0) with an inner join,
# i.e. keep only the index labels that appear in both X_train_st and y_train_st.
X_train_st, y_train_st = X_train_st.align(y_train_st, join='inner', axis=0)


In [49]:
# init and train RandomForestClassifier for Studio Fan Model
# features: time, day, studio_motion, studio_temp

# class_weight is keyed by TARGET class label, not by feature position.
# studio_fan is encoded as 0 = off and 1 = on, so only keys 0 and 1 are
# meaningful. Keys such as 2 or 3 match no class; depending on the
# scikit-learn release they are either ignored or rejected with an error.
class_weights = {0: 10, 1: 27}  # Data Tuning: samples of class 1 (on) weigh more than class 0 (off)

model_st = RandomForestClassifier(oob_score=True, max_depth=8, random_state=90, class_weight=class_weights)
model_st.fit(X_train_st, y_train_st)

# Calculate estimated OOB (Out Of Bag) score.
# The OOB score can serve as a useful estimate of how well the random forest
# is likely to perform on unseen data without a separate validation set.
# It is still good practice to use additional evaluation techniques such as
# cross-validation to assess the model's performance thoroughly.
print(f'Studio Fan OOB SCORE: {model_st.oob_score_}')

Studio Fan OOB SCORE: 0.970873786407767


Tuning and Evaluation

In [50]:
# init and train RandomForestClassifier for Studio Fan Model
# features: time, day, studio_motion, studio_temp

# class_weight is keyed by TARGET class label, not by feature position.
# studio_fan is encoded as 0 = off and 1 = on, so only keys 0 and 1 are
# meaningful. Keys such as 2 or 3 match no class; depending on the
# scikit-learn release they are either ignored or rejected with an error.
class_weights = {0: 10, 1: 27}  # Data Tuning: samples of class 1 (on) weigh more than class 0 (off)

model_st = RandomForestClassifier(oob_score=True, max_depth=8, random_state=90, class_weight=class_weights)
model_st.fit(X_train_st, y_train_st)

# Calculate estimated OOB (Out Of Bag) score.
# The OOB score can serve as a useful estimate of how well the random forest
# is likely to perform on unseen data without a separate validation set.
# It is still good practice to use additional evaluation techniques such as
# cross-validation to assess the model's performance thoroughly.
print(f'Studio Fan OOB SCORE: {model_st.oob_score_}')


# init and train RandomForestClassifier for Dog Fan Model
# features: time, day, dog_motion, dog_temp

# class_weight is keyed by TARGET class label, not by feature position.
# dog_fan is encoded as 0 = off and 1 = on, so only keys 0 and 1 are
# meaningful. Keys such as 2 or 3 match no class; depending on the
# scikit-learn release they are either ignored or rejected with an error.
class_weights = {0: 30, 1: 20}  # Data Tuning: samples of class 0 (off) weigh more than class 1 (on)

model_do = RandomForestClassifier(oob_score=True, max_depth=8, random_state=91, class_weight=class_weights)
model_do.fit(X_train_do, y_train_do)

# Calculate estimated OOB (Out Of Bag) score.
# The OOB score can serve as a useful estimate of how well the random forest
# is likely to perform on unseen data without a separate validation set.
# It is still good practice to use additional evaluation techniques such as
# cross-validation to assess the model's performance thoroughly.
print(f'Dog Fan OOB SCORE: {model_do.oob_score_}')


Studio Fan OOB SCORE: 0.970873786407767
Dog Fan OOB SCORE: 0.9597780859916782


In [52]:
# Calculate Accuracy
from sklearn.metrics import accuracy_score, classification_report

# Predict the fan state on the held-out test split with each model.
y_pred_st = model_st.predict(X_test_st)
y_pred_do = model_do.predict(X_test_do)

# Fraction of test rows whose predicted state equals the recorded state.
accuracy_st = accuracy_score(y_test_st, y_pred_st)
accuracy_do = accuracy_score(y_test_do, y_pred_do)

# Report studio first, then dog.
print(f'Studio Fan Accuracy: {accuracy_st}')
print(f'Dog Fan Accuracy: {accuracy_do}')

# calculate and visualize the confusion matrix
""" A confusion matrix is a table used in machine learning to evaluate the performance of a classification model, showing true positives, true negatives, false positives, and false negatives."""

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# NOTE(review): no call to this helper is visible in this cell — confirm it is
# invoked wherever the confusion-matrix plots are expected.
def plot_confusion_matrix(y_test, y_pred, label_name):
    """Draw the confusion matrix of a classifier as a heatmap of percentages.

    Each cell shows its count as a percentage of ALL samples (the whole
    matrix sums to 100), not as a percentage of its row.

    Args:
        y_test: true labels.
        y_pred: predicted labels, in the same order as ``y_test``.
        label_name: text appended to the x-axis label to identify the model.
    """
    # Raw counts, then each cell's share of the total in percent.
    counts = confusion_matrix(y_test, y_pred)
    n_total = counts.sum()
    percentages = (counts / n_total) * 100

    # Small annotated heatmap on an explicit figure/axes pair.
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(percentages, annot=True, fmt='.2f', cmap='Blues', cbar=False, square=True, ax=ax)
    ax.set_xlabel('Predicted Labels : ' + label_name)
    ax.set_ylabel('Actual Labels')
    ax.set_title('Confusion Matrix (Percentages)')
    plt.show()




Studio Fan Accuracy: 0.9668508287292817
Dog Fan Accuracy: 0.9502762430939227


Model Download and Deployment

In [55]:
import joblib
from google.colab import files

# Serialize each trained model to its own file and trigger a browser download.
# Each file must be written from its own estimator: model_st for the studio
# fan and model_do for the dog fan. The save/download pair is done once per model.

# Download Studio Fan Model
joblib.dump(model_st, "studio_fan_model.joblib")
files.download("studio_fan_model.joblib")

# Download Dog Fan Model
joblib.dump(model_do, "dog_fan_model.joblib")
files.download("dog_fan_model.joblib")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>