In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score


In [2]:
# Load the raw FizzBuzz dataset.
# The path is a raw string so the Windows backslashes stay literal: sequences such
# as `\!`, `\M` and `\d` are invalid escapes in a normal string literal and trigger
# a warning on recent Python versions. The resulting path is unchanged.
df = pd.read_csv(r"D:\! Proyectos\ML_number_class\dataset\data_v1.csv")
df.head()

Unnamed: 0,Number,Label
0,1,Non
1,2,Non
2,3,Fizz
3,4,Non
4,5,Buzz


The numbers will be converted to a binary representation. This can make the data more suitable for classification algorithms, since multiples of 3 and 5 can have recognizable patterns in their binary representation.

In [3]:
# Function to convert numbers to their binary representation
def binary_encode(i, num_digits):
    """Return the lowest `num_digits` bits of `i` as a NumPy array.

    The bits are ordered least-significant first: element 0 is bit 0 of `i`,
    element 1 is bit 1, and so on.
    """
    bits = []
    for position in range(num_digits):
        # Shift the wanted bit down to position 0, then mask everything else off
        bits.append((i >> position) & 1)
    return np.array(bits)

# Number of digits in the binary representation (10 bits cover 0..1023, enough for 1000)
num_digits = 10

# Applying the binary encoding function to the 'Number' column;
# each element of the resulting Series is an array of `num_digits` bits
binary_representation = df['Number'].apply(lambda x: binary_encode(x, num_digits))

# Creating a DataFrame with one column per bit (Bit_0 is the least significant bit).
# The index is taken from `df` so the column-wise concat below lines rows up by
# label even if `df` does not carry the default 0..n-1 index.
binary_df_v1 = pd.DataFrame(
    binary_representation.tolist(),
    index=df.index,
    columns=[f'Bit_{i}' for i in range(num_digits)],
)

# Concatenating the binary DataFrame with the original
data_v1_transformed = pd.concat([df, binary_df_v1], axis=1)

# Saving new data. Raw string: the Windows path contains backslash sequences
# (`\!`, `\M`, `\d`) that are invalid escapes in a normal string literal.
data_v1_transformed.to_csv(r"D:\! Proyectos\ML_number_class\dataset\data_v2.csv", index = False)

# Displaying the first rows of the transformed DataFrame
data_v1_transformed.head()

Unnamed: 0,Number,Label,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9
0,1,Non,1,0,0,0,0,0,0,0,0,0
1,2,Non,0,1,0,0,0,0,0,0,0,0
2,3,Fizz,1,1,0,0,0,0,0,0,0,0
3,4,Non,0,0,1,0,0,0,0,0,0,0
4,5,Buzz,1,0,1,0,0,0,0,0,0,0


The baseline will be established using the raw numbers directly as the only feature, without cross-validation.

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Extracting training data (numbers from 101 to 1000).
# The mask is built from the same frame that is being filtered, so the split does
# not rely on `df` and `data_v1_transformed` sharing an identical index.
training_data = data_v1_transformed[data_v1_transformed['Number'] > 100]

# Extracting validation data (numbers from 1 to 100)
validation_data = data_v1_transformed[data_v1_transformed['Number'] <= 100]

# Splitting the features and labels for training data
X_train = training_data[['Number']]
y_train = training_data['Label']

# Splitting the features and labels for validation data
X_test = validation_data[['Number']]
y_test = validation_data['Label']

# Label codification: the encoder is fitted on the training labels only and then
# reused to transform the validation labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

trained_models = {}
results = {}

# Candidate classifiers. The tree-based scikit-learn estimators are seeded so a
# re-run produces the same models (same seed value as the MLP used later on).
classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Support Vector Machine', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('K-Neighbors', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier())
]

for name, clf in classifiers:
    clf.fit(X_train, y_train_encoded)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test_encoded, y_pred)
    results[name] = acc

    # NOTE: this stores a reference to the estimator object held in `classifiers`,
    # not a copy; refitting that object later changes what this entry predicts.
    trained_models[name] = clf

    print(f'{name}: {acc:.2f}')

Logistic Regression: 0.53
Support Vector Machine: 0.53
Decision Tree: 0.53
Random Forest: 0.53
K-Neighbors: 0.53
XGBoost: 0.53


Apparently the algorithms are not learning to classify the patterns of the data set. Specific predictions will be reviewed to investigate the problem.

In [5]:
# Predict the validation set with the Logistic Regression model
y_pred_encoded = trained_models['Logistic Regression'].predict(X_test)

# Turn the integer class ids back into the original string labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Side-by-side table of true vs. predicted labels
predictions_vs_true = pd.DataFrame(
    {
        'True Label': y_test,
        'Predicted Label': y_pred,
    }
)
predictions_vs_true.head(20)

Unnamed: 0,True Label,Predicted Label
0,Non,Non
1,Non,Non
2,Fizz,Non
3,Non,Non
4,Buzz,Non
5,Fizz,Non
6,Non,Non
7,Non,Non
8,Fizz,Non
9,Buzz,Non


The predictions reveal that the model predicts the class "Non" for every instance in the validation set. This explains why the accuracy is about 0.53: the "Non" class represents about 53% of the labels in the validation set.

Next, the algorithms will be trained with the binary representation.

In [6]:
# Selecting the new features: the ten bit columns Bit_0 .. Bit_9
bit_columns = [f'Bit_{i}' for i in range(10)]
X_train = training_data[bit_columns]
X_test = validation_data[bit_columns]

In [7]:
# Second round: the same estimators from `classifiers`, now fitted on the bit
# features currently held in X_train / X_test.
# NOTE: `clf.fit` refits each estimator object in place, so any other container
# that holds these same objects sees the refit models as well.
trained_models2, results2 = {}, {}

for name, clf in classifiers:
    # Fit and keep a handle to the fitted estimator under its display name
    clf.fit(X_train, y_train_encoded)
    trained_models2[name] = clf

    # Score on the validation split and record the accuracy
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test_encoded, y_pred)
    results2[name] = acc

    print(f'{name}: {acc:.2f}')

Logistic Regression: 0.53
Support Vector Machine: 0.53
Decision Tree: 0.23
Random Forest: 0.32
K-Neighbors: 0.25
XGBoost: 0.39


In [8]:
# Inspect the Decision Tree fitted on the binary features. The model is read from
# `trained_models2`, the dict filled by the binary-feature run, so it matches the
# bit columns currently held in X_test.
y_pred_encoded = trained_models2['Decision Tree'].predict(X_test)

# Map the integer class ids back to the original string labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Side-by-side table of true vs. predicted labels
predictions_vs_true = pd.DataFrame({'True Label': y_test, 'Predicted Label': y_pred})
predictions_vs_true.head(20)

Unnamed: 0,True Label,Predicted Label
0,Non,Non
1,Non,Fizz
2,Fizz,Non
3,Non,Buzz
4,Buzz,Fizz
5,Fizz,Non
6,Non,Non
7,Non,Non
8,Fizz,FizzBuzz
9,Buzz,Non


It seems that some models began to learn relationships in the dataset, but with very poor results. Let's perform a correlation study between the independent variables (the features) and the target variable.

In [9]:
from scipy.stats import pointbiserialr

# Integer code assigned to each of the four labels
label_encoding = {"Non": 0, "Fizz": 1, "Buzz": 2, "FizzBuzz": 3}
data_v1_transformed['Label_encoded'] = data_v1_transformed['Label'].map(label_encoding)

# Columns to correlate against the encoded label: the raw number plus the ten bits
feature_columns = ['Number'] + [f'Bit_{i}' for i in range(10)]

# Point-biserial correlation of every feature column with the encoded label
correlations_with_label_corrected = {}
for col in feature_columns:
    result = pointbiserialr(data_v1_transformed[col], data_v1_transformed['Label_encoded'])
    correlations_with_label_corrected[col] = result.correlation

correlations_with_label_corrected


{'Number': 0.0036109082768049777,
 'Bit_0': 0.0010785089618482685,
 'Bit_1': 0.0010785089618482737,
 'Bit_2': 0.0010785089618482772,
 'Bit_3': 0.007978952922198747,
 'Bit_4': -0.0012252253884994211,
 'Bit_5': 0.0012146950828284813,
 'Bit_6': 0.005529775306020345,
 'Bit_7': 0.0055297753060203295,
 'Bit_8': 0.001214695082828503,
 'Bit_9': 0.0012146950828284936}

These low correlation values may explain why previously trained models did not perform well using these features.

Since traditional classifiers are not learning the patterns in the data set, a neural network will be trained, specifically an MLP. Neural networks tend to learn numerical relationships well.

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier


# Reload the dataset with the bit columns. `df` is rebound here and now refers to
# the data_v2 frame.
# Raw string: the Windows path contains backslash sequences (`\!`, `\M`, `\d`)
# that are invalid escapes in a normal string literal; the path is unchanged.
df = pd.read_csv(r"D:\! Proyectos\ML_number_class\dataset\data_v2.csv")



In [12]:

# Feature columns: the ten bits Bit_0 .. Bit_9
bit_columns = [f'Bit_{i}' for i in range(10)]

# Numbers above 100 form the training set, numbers up to 100 the validation set
training_data = df[df['Number'] > 100]
validation_data = df[df['Number'] <= 100]

# Features and labels for training
X_train = training_data[bit_columns]
y_train = training_data['Label']

# Features and labels for validation
X_test = validation_data[bit_columns]
y_test = validation_data['Label']

# Encode the string labels as integers; the encoder is fitted on the training
# labels and reused unchanged on the validation labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)



In [19]:
# MLP with two hidden layers (200 and 50 units) and a fixed random seed
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(200, 50),
    max_iter=10000,
    random_state=42,
)

# Train on the bit features
mlp_clf.fit(X_train, y_train_encoded)

# Predict the validation labels and measure accuracy against the encoded truth
y_pred_mlp = mlp_clf.predict(X_test)
mlp_accuracy = accuracy_score(y_test_encoded, y_pred_mlp)

print(f'MLP Accuracy: {mlp_accuracy:.2f}')


MLP Accuracy: 0.96


In [20]:
# Predict the validation set with the trained MLP
y_pred_encoded = mlp_clf.predict(X_test)

# Turn the integer class ids back into the original string labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Side-by-side table of true vs. predicted labels
predictions_vs_true = pd.DataFrame(
    {
        'True Label': y_test,
        'Predicted Label': y_pred,
    }
)
predictions_vs_true.head(20)

Unnamed: 0,True Label,Predicted Label
0,Non,Non
1,Non,Non
2,Fizz,Fizz
3,Non,Non
4,Buzz,Buzz
5,Fizz,Fizz
6,Non,Non
7,Non,Non
8,Fizz,Fizz
9,Buzz,Buzz


We already have an acceptable model, so we will proceed to make the service app.