In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [33]:
# Source CSV as recorded, and the destination for the filtered copy
file_path ='C:/Users/Gehan/sensor_data.csv'
cleaned_file_path = 'C:/Users/Gehan/cleaned_sensor_data.csv'

# A well-formed row has 12 comma-separated fields, i.e. exactly 11 commas
expected_commas = 11

# Copy across only the lines whose comma count matches; every other line is dropped
with open(file_path, 'r', encoding="utf-8") as file:
    with open(cleaned_file_path, 'w', encoding="utf-8") as outfile:
        outfile.writelines(
            row for row in file if row.count(',') == expected_commas
        )

# Read the filtered file into a DataFrame and preview its first rows
data = pd.read_csv(cleaned_file_path)
preview = data.head()
print(preview)

   gyro_x  gyro_y  gyro_z  flex1  flex2  flex3  flex4  flex5  contact1  \
0  -29.09    3.73    7.27   0.58   1.30   1.81   0.91   0.90       0.0   
1  -29.09    3.73    7.27   0.58   1.30   1.81   0.91   0.90       0.0   
2  -29.09    3.74    7.26   0.60   1.31   1.80   0.91   0.91       0.0   
3  -29.09    3.74    7.26   0.60   1.31   1.80   0.91   0.91       0.0   
4  -29.09    3.74    7.24   0.57   1.31   1.81   0.91   0.91       0.0   

   contact2  contact3 char  
0       0.0       1.0    ا  
1       0.0       1.0    ا  
2       0.0       1.0    ا  
3       0.0       1.0    ا  
4       0.0       1.0    ا  


In [34]:
# Per-column descriptive statistics (count, mean, std, min, quartiles, max)
summary_stats = data.describe()
print(summary_stats)

             gyro_x        gyro_y        gyro_z         flex1         flex2  \
count  97106.000000  97108.000000  97108.000000  97108.000000  97108.000000   
mean      -5.828310    -18.084606   -153.363960      0.840781      1.700128   
std       44.695019     72.074772    548.872241      2.666640      4.667013   
min    -2981.000000  -2733.000000 -26293.000000      0.000000      0.160000   
25%       -5.240000    -20.940000   -206.440000      0.540000      1.270000   
50%       -1.830000    -20.060000   -129.090000      0.620000      1.400000   
75%       -0.240000    -17.790000   -101.940000      0.970000      1.680000   
max     1139.000000   1501.000000    247.420000    139.000000    244.000000   

              flex3         flex4         flex5  contact1      contact2  \
count  97108.000000  97108.000000  97108.000000   96974.0  96966.000000   
mean       2.315228      1.439219      1.010773       0.0      0.000371   
std        6.879318      4.927896      3.330346       0.0      

In [35]:
# Count the NaN entries in each column and show the tally
missing_per_column = data.isna().sum()
print(missing_per_column)

# Discard every row that still has a missing value (modifies `data` itself)
data.dropna(inplace=True)

gyro_x        2
gyro_y        0
gyro_z        0
flex1         0
flex2         0
flex3         0
flex4         0
flex5         0
contact1    134
contact2    142
contact3    116
char          0
dtype: int64


In [42]:
# Integer-encode the target column 'char' unless it is already numeric.
# The check uses is_numeric_dtype rather than `dtype == 'object'` so that
# pandas string / categorical columns are encoded too, not silently skipped.
if not pd.api.types.is_numeric_dtype(data['char']):
    # factorize numbers the labels in order of first appearance, which is the
    # same numbering enumerate(data['char'].unique()) would give.
    codes, labels = pd.factorize(data['char'])
    char_to_int = {char: idx for idx, char in enumerate(labels)}
    # Reverse lookup (code -> original label)
    int_to_char = {idx: char for char, idx in char_to_int.items()}
    data['char'] = codes

# Separate the features from the target
X = data.drop('char', axis=1)
y = data['char']
# Every feature whose name does not start with 'contact' gets standardized;
# the 'contact*' columns are passed through unscaled.
columns_to_scale = [col for col in X.columns if not col.startswith('contact')]

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale on copies so the unscaled X_train / X_test stay available
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to the test rows
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Test the model
predictions = model.predict(X_test_scaled)
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy of the model: ", accuracy)

Accuracy of the model:  0.6787985317685985


In [43]:
# Fit a Gaussian Naive Bayes classifier on the scaled training features
clf = GaussianNB().fit(X_train_scaled, y_train)

# Predict labels for the held-out test features
y_pred = clf.predict(X_test_scaled)

# Report the accuracy on the test set
nb_accuracy = accuracy_score(y_test, y_pred)
print(nb_accuracy)

0.5168794912888384


In [52]:
# Baseline random forest: plain class predictions, no confidence threshold.
# It is trained and evaluated on the unscaled X_train / X_test frames.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

predictions = random_forest_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
# No threshold is applied in this cell, so the label must not claim one.
print(f"Accuracy of the model without threshold: {accuracy}")

Accuracy of the model with threshold: 0.9630874218063382


In [47]:
# Random forest with a confidence threshold: a test sample whose highest class
# probability is below `threshold` is labelled -1 ('nothing') instead of a class.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict probabilities (one column per entry of random_forest_model.classes_)
probabilities = random_forest_model.predict_proba(X_test)

# Define a threshold
threshold = 0.7  # You can adjust this based on your requirements

# Apply the threshold to determine labels.
# The top probability per row decides whether the prediction is kept.
max_probabilities = np.max(probabilities, axis=1)
# Derive the predicted class from the probabilities already computed rather than
# running a second predict() pass; the forest's predict() is defined as the class
# with the highest mean probability, so the labels are the same.
most_likely = random_forest_model.classes_[np.argmax(probabilities, axis=1)]
predictions = np.where(max_probabilities < threshold, -1, most_likely)  # -1 represents 'nothing'

# Evaluate the model
# NOTE(review): y_test presumably never contains -1, so every rejected sample is
# counted as an error in this accuracy - confirm that is the intended metric.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the model with threshold: {accuracy}")

Accuracy of the model with threshold: 0.941219045649589
