In [226]:
# Imports
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
import joblib
from sklearn.metrics import accuracy_score


In [227]:
# Variables
RANDOM_STATE = 42
TARGET_FEATURE = 'Label'
TARGET_FEATURE_MULTICLASS = 'Label_multiclass'

In [228]:
# Syntazise new data
# Load original datasets
original_df = pd.read_csv('../explore/data/exported_features.csv')

# Define parameters for synthetic data generations
n_samples = 500
n_features = original_df.shape[1] -1
n_classes = len(original_df['Label'].unique())
# Calculate centers based on the original data
centers = original_df.groupby('Label').mean().values

print("Original dataset:")
print(original_df)

# Create new dataset
X, y = make_blobs(
    n_samples=n_samples, 
    n_features=n_features,
    cluster_std=1.0,
    centers=centers,
    random_state=RANDOM_STATE)

# Create a DataFrame for binary labels
df = pd.DataFrame(X, columns=original_df.columns[:-1])

# Map the generated labels back to original multiclass labels
# Create a mapping from integer labels to original multiclass labels
unique_labels = original_df['Label'].unique()
label_mapping = {i: label for i, label in enumerate(unique_labels)}

# Apply the mapping to get multiclass labels for the generated dataset
df['Label'] = [label_mapping[int(label)] for label in y]

print("\nGenerated DataFrame with Multiclass Labels:")
print(df)

Original dataset:
       area    perimeter  aspect_ratio  ...  centroid_x  centroid_y  Label
0    3572.5  2442.584033      1.204244  ...       361.0       518.0      r
1    3466.5  2417.162680      1.468354  ...       346.0       468.0      r
2    3617.5  2492.736272      1.412651  ...       334.0       428.0      r
3    4157.0  2674.179046      0.973469  ...       338.0       534.0      r
4    4088.0  2623.263184      0.686071  ...       311.0       539.0      r
..      ...          ...           ...  ...         ...         ...    ...
128  5793.0  3708.187396      0.900172  ...       294.0       471.0      s
129  6574.0  4065.369082      0.793548  ...       286.0       462.0      s
130  5604.0  3562.566788      0.491409  ...       289.0       480.0      s
131  4872.5  3473.294713      1.794702  ...       317.0       427.0      s
132  4798.0  3430.223643      1.774086  ...       313.0       434.0      s

[133 rows x 18 columns]

Generated DataFrame with Multiclass Labels:
            

In [229]:
features = ['perimeter', 'solidity', 'circularity', 'eccentricity', 'major_axis_length', 'minor_axis_length', 'Label']
continuous_features = ['perimeter', 'solidity', 'circularity', 'eccentricity', 'major_axis_length', 'minor_axis_length']
target_feature = 'Label'

# Split target feature from the rest
X = df.drop('Label', axis=1)
y = df['Label']

In [230]:
# Define pipeline for preprocessing, scale data, select features and convert labels to binary
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), continuous_features)
    ],
    remainder='drop'
)

# Define preprocessing pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
    ])

# Fit the pipeline
normalized_data = pipeline.fit_transform(df)

# Put results in a DataFrame
normalized_df = pd.DataFrame(normalized_data, columns=continuous_features)

# Encode the target feature
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Add the encoded target feature back
normalized_df['Label'] = y

# Add the original target feature back
normalized_df['Label_multiclass'] = df['Label']

normalized_df

Unnamed: 0,perimeter,solidity,circularity,eccentricity,major_axis_length,minor_axis_length,Label,Label_multiclass
0,0.327736,0.329684,0.168100,-1.700196,0.300138,-0.863581,2,s
1,-0.674807,0.146769,1.401043,0.663221,-0.696332,0.152698,0,p
2,-0.000071,1.140172,0.583931,-0.358256,-0.003597,0.004280,1,r
3,0.329937,0.629332,0.235701,0.938422,0.311982,-0.870057,2,s
4,0.000778,-0.369615,0.964055,0.116196,0.012100,0.003520,1,r
...,...,...,...,...,...,...,...,...
495,-0.673611,0.005778,0.868948,-1.042706,-0.693002,0.184775,0,p
496,-0.672861,0.161190,-0.632218,-0.788419,-0.704673,0.182754,0,p
497,0.000959,-0.163932,-0.417887,-0.736742,0.004292,-0.009752,1,r
498,0.000262,0.360918,1.300630,-0.160125,-0.007095,0.008257,1,r


In [231]:
# Make predictions with the model
# Load the model
model = joblib.load('deployed/hand_gesture_decision_tree.pkl')

# Make predictions
predictions = model.predict(normalized_df.drop([TARGET_FEATURE, TARGET_FEATURE_MULTICLASS], axis=1))

# Add predictions to the DataFrame
normalized_df['Prediction'] = predictions

print("\nPredictions:")
print(normalized_df)



Predictions:
     perimeter  solidity  circularity  ...  Label  Label_multiclass  Prediction
0     0.327736  0.329684     0.168100  ...      2                 s           0
1    -0.674807  0.146769     1.401043  ...      0                 p           0
2    -0.000071  1.140172     0.583931  ...      1                 r           0
3     0.329937  0.629332     0.235701  ...      2                 s           2
4     0.000778 -0.369615     0.964055  ...      1                 r           1
..         ...       ...          ...  ...    ...               ...         ...
495  -0.673611  0.005778     0.868948  ...      0                 p           0
496  -0.672861  0.161190    -0.632218  ...      0                 p           0
497   0.000959 -0.163932    -0.417887  ...      1                 r           0
498   0.000262  0.360918     1.300630  ...      1                 r           0
499  -0.000575 -1.562286     0.673530  ...      1                 r           1

[500 rows x 9 columns]


In [232]:
# Check the accuracy of the model
true_labels = normalized_df[TARGET_FEATURE]
accuracy = accuracy_score(true_labels, normalized_df['Prediction'])

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.33
