In [None]:
%pip install seaborn
# standard library
import random as rnd

# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

Split the data into a training dataset and a testing dataset.
Because the data is already sorted by both user and time, we need to 'unsort' (shuffle) it to obtain a proper random sample.

Steps:

1. 'Unsort' and shuffle the data.
2. Create two datasets (training and testing) that are accurate random samples of the data.
3. Verify that both datasets are random samples of the data.

In [None]:
# Location of the geophone CSV, relative to the notebook's working directory.
# Adjust this path if the notebook is run from a different environment.
csv_path = "./geophone/geophone-sensor-data.csv"
dataset = pd.read_csv(csv_path)

# Order rows by `name`, then by `timestamp` (ascending is the sort_values default).
dataset_sorted = dataset.sort_values(by=["name", "timestamp"])

In [None]:
# Histogram of the `mean` column over the whole sorted dataset.
og = (
    sns.FacetGrid(dataset_sorted)
    .map(plt.hist, 'mean', bins=20)
    .set_axis_labels("Mean", "Frequency")
    .set_titles(col_template="{col_name}")
)

# Leave headroom above the axes for the figure-level title.
og.figure.subplots_adjust(top=0.9)
og.figure.suptitle("Distribution of Mean")

In [None]:
# 50/50 split. train_test_split shuffles by default; the fixed seed keeps the
# partition reproducible across runs.
train, test = train_test_split(dataset_sorted, test_size=0.5, random_state=42)

# Preview both halves, separated by a rule.
print(train.head(), '_' * 40, test.head(), sep='\n')

In [None]:
# Histogram of the `mean` column for the training split.
train_g = (
    sns.FacetGrid(train)
    .map(plt.hist, 'mean', bins=20)
    .set_axis_labels("Mean", "Frequency")
    .set_titles(col_template="{col_name}")
)

# Leave headroom above the axes for the figure-level title.
train_g.figure.subplots_adjust(top=0.9)
train_g.figure.suptitle("Distribution of Training Mean")


In [None]:
# Histogram of the `mean` column for the test split.
test_g = (
    sns.FacetGrid(test)
    .map(plt.hist, 'mean', bins=20)
    .set_axis_labels("Mean", "Frequency")
    .set_titles(col_template="{col_name}")
)

# Leave headroom above the axes for the figure-level title.
test_g.figure.subplots_adjust(top=0.9)
test_g.figure.suptitle("Distribution of Test Mean")


We cannot use the person (`name`) as a feature because of the inconsistencies shown below. We have to look at the data holistically.

Test if there are outliers or noise in the data 

Binning and standardizing

In [None]:
# Histogram of `energy` in every (activity, name) facet of the training split.
facet_layout = dict(col='name', row='activity', height=2.2, aspect=1.6)
grid = sns.FacetGrid(train, **facet_layout)
grid.map(plt.hist, 'energy', alpha=.5, bins=20).add_legend();


In [None]:
# Scatter of `dominant_freq` against `activity` in every (activity, name) facet.
# NOTE(review): rows are already faceted by `activity`, so the y value is
# constant within each facet.
grid = sns.FacetGrid(data=train, row='activity', col='name', height=2.2, aspect=1.6)
grid = grid.map(plt.scatter, 'dominant_freq', 'activity', alpha=.5)
grid.add_legend();


In [None]:
# Triangulation plot (plt.triplot) of the (`min`, `max`) points in every
# (activity, name) facet of the training split.
triplot_style = dict(alpha=.5)
grid = sns.FacetGrid(train, col='name', row='activity', height=2.2, aspect=1.6)
grid.map(plt.triplot, 'min', 'max', **triplot_style)
grid.add_legend();


In [None]:
# Overlay the empirical CDFs of the `max`, `min` and `mean` columns in every
# (activity, name) facet of the training split.
grid = sns.FacetGrid(train, col='name', row='activity', height=2.2, aspect=1.6)

# Each series gets an explicit label: the grid has no `hue`, so without labels
# add_legend() has nothing to show and the colours cannot be told apart.
grid.map(plt.ecdf, 'max', alpha=.5, label='max')
grid.map(plt.ecdf, 'min', alpha=.5, color='red', label='min')
grid.map(plt.ecdf, 'mean', alpha=.5, color='green', label='mean')

# map() labels the x axis with the last mapped column only ('mean'); replace
# it with labels that describe all three overlaid series.
grid.set_axis_labels('value', 'cumulative proportion')
grid.add_legend();


In [None]:
# Ensure the 'timestamp' column is a proper datetime object. Rebinding through
# assign() (instead of writing a column into the frame returned by the split)
# keeps the cell idempotent and sidesteps pandas chained-assignment warnings.
train = train.assign(timestamp=pd.to_datetime(train['timestamp']))

# `max` and `min` over time in every (activity, name) facet.
grid = sns.FacetGrid(train, col='name', row='activity', height=2.2, aspect=1.6)

# Label each series so the legend can tell the two colours apart.
grid.map(plt.scatter, 'timestamp', 'max', alpha=.5, color='blue', s=10, label='max')
grid.map(plt.scatter, 'timestamp', 'min', alpha=.5, color='red', s=10, label='min')

# map() labels the y axis with the last mapped column only ('min'); use a
# neutral label that covers both series.
grid.set_axis_labels('timestamp', 'value')
grid.add_legend();

In [None]:
# Ensure the 'timestamp' column is a proper datetime object. Rebinding through
# assign() (instead of writing a column into the frame returned by the split)
# keeps the cell idempotent and sidesteps pandas chained-assignment warnings.
train = train.assign(timestamp=pd.to_datetime(train['timestamp']))

# `max`, `min` and `mean` over time in every (activity, name) facet.
grid = sns.FacetGrid(train, col='name', row='activity', height=2.2, aspect=1.6)

# Label each series so the legend can tell the three colours apart.
grid.map(plt.scatter, 'timestamp', 'max', alpha=.5, color='blue', s=10, label='max')
grid.map(plt.scatter, 'timestamp', 'min', alpha=.5, color='red', s=10, label='min')
grid.map(plt.scatter, 'timestamp', 'mean', alpha=.5, color='green', s=10, label='mean')

# map() labels the y axis with the last mapped column only ('mean'); use a
# neutral label that covers all three series.
grid.set_axis_labels('timestamp', 'value')
grid.add_legend();

In [None]:
# Parse `timestamp` into datetimes before plotting against it.
train['timestamp'] = pd.to_datetime(train['timestamp'])

# `mean` over time in every (activity, name) facet.
mean_scatter_style = dict(alpha=.5, color='green', s=10)
grid = sns.FacetGrid(data=train, row='activity', col='name', height=2.2, aspect=1.6)
grid.map(plt.scatter, 'timestamp', 'mean', **mean_scatter_style)

In [None]:
# Parse `timestamp` into datetimes before plotting against it.
train['timestamp'] = pd.to_datetime(train['timestamp'])

# Keep only the rows whose `name` is 'Emir'.
is_emir = train['name'].eq('Emir')
train_emir = train.loc[is_emir]

# One row of plots per activity; no `col` facet is needed because only a
# single name remains after filtering.
grid = sns.FacetGrid(data=train_emir, row='activity', height=2.2, aspect=1.6)

# `mean` over time, with each row titled by its activity.
grid.map(plt.scatter, 'timestamp', 'mean', alpha=.5, color='green', s=10).set_titles(
    row_template='{row_name}'
)
plt.show()  # explicit show, for runs outside a notebook front end

## 3. Machine Learning Model Training and Evaluation

We now prepare the data and train three classification models to meet the project requirements. The features used are the numerical summary statistics from the geophone sensor: `mean`, `top_3_mean`, `min`, `max`, `std_dev`, `median`, `q1`, `q3`, `skewness`, `dominant_freq`, and `energy`.

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Numeric summary-statistic columns used as model inputs.
feature_cols = [
    'mean', 'top_3_mean', 'min', 'max', 'std_dev', 'median',
    'q1', 'q3', 'skewness', 'dominant_freq', 'energy',
]

# Feature matrices for both splits.
X_train, X_test = train[feature_cols], test[feature_cols]

# Integer-encode the `activity` target. The encoder is fit on the training
# labels only and then reused, unchanged, for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train['activity'])
y_test = le.transform(test['activity'])

print("X_train, X_test, y_train, y_test are prepared for modeling.")
print(f"Activities encoded: {list(zip(le.classes_, le.transform(le.classes_)))}")

### A. Decision Tree Classifier (Feature Relation and Split on Attributes)
This model helps identify the most **important features** used for classification by examining the Gini impurity/entropy reduction, which dictates how the tree **splits on attributes**.

In [None]:
# Fit a seeded decision tree on the full feature set and predict the test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Overall accuracy plus the per-class report, labelled with the original
# activity names.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt, target_names=le.classes_)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print("\nDecision Tree Classification Report:\n", dt_report)

# Rank the features by the tree's importance scores, most important first.
dt_feature_importance = pd.Series(
    dt_model.feature_importances_,
    index=feature_cols,
).sort_values(ascending=False)
print("\nDecision Tree Feature Importances (Higher score means more crucial for splitting):\n", dt_feature_importance)

### B. Logistic Regression (Regression Tests)
Logistic Regression is used here as the **regression test**: a simple linear classification baseline. If it performs poorly compared with the tree-based models, that suggests activity detection requires a more complex, non-linear model.

In [None]:
# Logistic Regression is used as the 'regression test' model (linear baseline).
#
# The features are standardised first: the lbfgs solver converges poorly on
# unscaled inputs, and an unscaled fit would handicap the linear baseline for
# reasons unrelated to linearity. The scaler lives inside the pipeline, so it
# is fit on the training split only and merely applied to the test split.
lr_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42),
)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluate Logistic Regression
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr, target_names=le.classes_))

### C. Random Forest Classifier (Emphasis on Improvement and Different Feature Set)
To achieve a better result and demonstrate **improvement**, we train a Random Forest classifier using a **restricted and high-performing subset of features** (`dominant_freq`, `energy`, `std_dev`, `skewness`, `min`, `max`) as guided by the initial feature importance scores.

In [None]:
# Restricted feature subset used by the Random Forest model.
rf_feature_cols_new = ['dominant_freq', 'energy', 'std_dev', 'skewness', 'min', 'max']

# Select the new feature subset from the already-prepared matrices.
X_train_rf_new = X_train[rf_feature_cols_new]
X_test_rf_new = X_test[rf_feature_cols_new]

# Train a seeded Random Forest and predict the test split.
rf_model_new = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_new.fit(X_train_rf_new, y_train)
y_pred_rf_new = rf_model_new.predict(X_test_rf_new)

# Evaluate the new Random Forest.
rf_new_accuracy = accuracy_score(y_test, y_pred_rf_new)
print(f"Random Forest (New Feature Set) Accuracy: {rf_new_accuracy:.4f}")
print("\nRandom Forest (New Feature Set) Classification Report:\n", classification_report(y_test, y_pred_rf_new, target_names=le.classes_))

# Final comparison against the Decision Tree baseline. The message is chosen
# from the measured accuracies, so the notebook never announces an improvement
# that the numbers do not show.
accuracy_delta = rf_new_accuracy - dt_accuracy
if accuracy_delta > 0:
    print(f"\nIMPROVEMENT: The new Random Forest model achieved {rf_new_accuracy:.4f} accuracy, a change of {accuracy_delta:+.4f} over the Decision Tree's {dt_accuracy:.4f} baseline.")
else:
    print(f"\nNO IMPROVEMENT: The new Random Forest model achieved {rf_new_accuracy:.4f} accuracy, a change of {accuracy_delta:+.4f} versus the Decision Tree's {dt_accuracy:.4f} baseline.")