In [1]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Number of values every sample must contain to be kept.
EXPECTED_LENGTH = 42

# Load the data. The context manager closes the file handle as soon as the
# pickle has been read instead of leaving it open for the kernel's lifetime.
# NOTE(review): unpickling can execute arbitrary code - only load
# './data.pickle' when it comes from a trusted source.
with open('./data.pickle', 'rb') as data_file:
    data_dict = pickle.load(data_file)

# Inspect the structure of data_dict['data']
print("Type of data_dict['data']:", type(data_dict['data']))
print("Types of the first 5 elements:", [type(x) for x in data_dict['data'][:5]])
print("Lengths of the first 5 elements:", [len(x) if hasattr(x, '__len__') else None for x in data_dict['data'][:5]])
print("First 3 entries of data:", data_dict['data'][:3])

# Keep only samples that are lists with exactly EXPECTED_LENGTH elements;
# anything else cannot be stacked into a rectangular array.
filtered_data = [
    x for x in data_dict['data']
    if isinstance(x, list) and len(x) == EXPECTED_LENGTH
]

# Report how many samples were dropped by the filter above.
removed_count = len(data_dict['data']) - len(filtered_data)
if removed_count:
    print(f"Warning: {removed_count} entries were removed due to incorrect length.")

# Convert the filtered lists into a NumPy array. A failure is reported and
# signalled to the following code through `data = None` rather than raised.
try:
    data = np.array(filtered_data)
    print("Data successfully converted to NumPy array with shape:", data.shape)
except Exception as e:
    print("Error converting data to NumPy array:", e)
    data = None  # Ensure data is None if conversion fails

# Fixed seed so the split, the forest and therefore the reported accuracy are
# reproducible across Restart & Run All.
SEED = 42

# Proceed if data conversion is successful
if data is not None:
    raw_samples = data_dict['data']

    # Convert labels to NumPy array
    labels = np.asarray(data_dict['labels'])

    # Samples and labels are paired by position, so they must have the same
    # count; otherwise the filtered labels would belong to the wrong samples.
    labels_aligned = len(labels) == len(raw_samples)

    # Evaluate the "valid sample" rule once (a list with exactly 42 elements,
    # the same rule used to build `data`) and reuse it as a mask on the labels.
    keep_mask = [isinstance(sample, list) and len(sample) == 42 for sample in raw_samples]
    filtered_labels = [label for label, keep in zip(labels, keep_mask) if keep]

    # Ensure data and labels are ready for train-test split
    if labels_aligned and len(data) == len(filtered_labels):
        x_train, x_test, y_train, y_test = train_test_split(
            data,
            filtered_labels,
            test_size=0.2,
            shuffle=True,
            stratify=filtered_labels,  # keep class proportions in both splits
            random_state=SEED,
        )

        # Train RandomForestClassifier
        model = RandomForestClassifier(random_state=SEED)
        model.fit(x_train, y_train)

        # Predict and evaluate. accuracy_score is documented as
        # (y_true, y_pred), so the ground truth goes first.
        y_predict = model.predict(x_test)
        score = accuracy_score(y_test, y_predict)
        print('{}% of samples were classified correctly!'.format(score * 100))

        # Save the model, wrapped in a dict under the 'model' key
        with open('model.pb', 'wb') as f:
            pickle.dump({'model': model}, f)
    else:
        print("Mismatch between the number of data samples and filtered labels.")
else:
    print("Data conversion failed. Exiting.")


Type of data_dict['data']: <class 'list'>
Types of the first 5 elements: [<class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>]
Lengths of the first 5 elements: [42, 42, 42, 42, 42]
First 3 entries of data: [[0.11293485760688782, 0.491838276386261, 0.20519322156906128, 0.3897280693054199, 0.2475191354751587, 0.23560413718223572, 0.2345067858695984, 0.10632643103599548, 0.22647720575332642, 0.0, 0.1775798201560974, 0.13742226362228394, 0.18164026737213135, 0.05477467179298401, 0.18387502431869507, 0.15036144852638245, 0.18537810444831848, 0.2392238974571228, 0.11826035380363464, 0.15482264757156372, 0.12728255987167358, 0.05828997492790222, 0.13879206776618958, 0.17819789052009583, 0.15079328417778015, 0.27689129114151, 0.060322850942611694, 0.18652307987213135, 0.06681153178215027, 0.09455269575119019, 0.09211060404777527, 0.21838432550430298, 0.11077490448951721, 0.31219834089279175, 0.0, 0.23511448502540588, 0.019329935312271118, 0.16408848762512207, 0.04997

In [2]:
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Number of values every sample must contain to be kept.
EXPECTED_LENGTH = 42
# Fixed seed so the train/test split (and the reported accuracy) is
# reproducible across Restart & Run All.
SEED = 42

# Load data. The context manager closes the file handle once the pickle is read.
# NOTE(review): unpickling can execute arbitrary code - only load
# './data.pickle' when it comes from a trusted source.
with open('./data.pickle', 'rb') as data_file:
    data_dict = pickle.load(data_file)

raw_samples = data_dict['data']
raw_labels = data_dict['labels']  # assumed to be a sized sequence (e.g. a list) - TODO confirm

# Samples and labels are paired by position, so they must have the same count;
# otherwise the filtered labels would belong to the wrong samples.
labels_aligned = len(raw_samples) == len(raw_labels)

# Evaluate the "valid sample" rule once and apply the same mask to both
# sequences so data and labels cannot drift apart.
keep_mask = [isinstance(sample, list) and len(sample) == EXPECTED_LENGTH for sample in raw_samples]
filtered_data = [sample for sample, keep in zip(raw_samples, keep_mask) if keep]
filtered_labels = [label for label, keep in zip(raw_labels, keep_mask) if keep]

# Convert filtered data and labels into NumPy arrays
data = np.array(filtered_data)
labels = np.array(filtered_labels)

# Ensure that the number of samples in data and labels matches
if labels_aligned and len(data) == len(labels):
    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(
        data,
        labels,
        test_size=0.2,
        shuffle=True,
        stratify=labels,  # keep class proportions in both splits
        random_state=SEED,
    )

    # Initialize the SVM model with RBF kernel
    model = SVC(kernel='rbf')

    # Train the model
    model.fit(x_train, y_train)

    # Make predictions
    y_predict = model.predict(x_test)

    # Calculate the accuracy score. accuracy_score is documented as
    # (y_true, y_pred), so the ground truth goes first.
    score = accuracy_score(y_test, y_predict)

    # Output the accuracy
    print('{}% of samples were classified correctly!'.format(score * 100))

    # Save the model, wrapped in a dict under the 'model' key
    with open('model.pb', 'wb') as f:
        pickle.dump({'model': model}, f)
else:
    print("Mismatch between the number of filtered data samples and labels.")


99.29742388758783% of samples were classified correctly!
