In [25]:
import numpy as np
import pandas as pd
import statistics

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Importing the datasets

In [26]:
train = pd.read_csv('exemple_code.csv', sep=';')
X_test = pd.read_csv('mot_code.csv', sep=';')

### Data preprocessing

Remove the last column of X_test since it is empty

In [27]:
X_test = X_test.iloc[:, :-1]

Separate the data into X and y

In [28]:
X_train = train.drop(columns=['y'], axis=1)
y_train = train['y']

Encoding categorical data

In [29]:
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)

Fit the model

In [30]:
dt = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt.fit(X_train, y_train)

Predict the test set results

In [31]:
y_pred = dt.predict(X_test)

Inverse transform the predictions

In [32]:
y_pred = labelencoder.inverse_transform(y_pred)

Reshape the result to (10, 16) since each word is encoded into 16 numbers, 10 times.

In [33]:
y_pred = y_pred.reshape(10, 16)

Calculate the mode for every column

In [34]:
modes = [statistics.mode(column) for column in y_pred.transpose()]

Find the secret word

In [35]:
code_map = {"04": "a", "08": "b", "12": "c", "16": "d", "20": "e",
            "24": "f", "28": "g", "32": "h", "36": "i", "40": "j",
            "44": "k", "48": "l", "52": "m", "56": "n", "60": "o",
            "64": "p", "68": "q", "72": "r", "76": "s", "80": "t",
            "84": "u", "88": "v", "92": "w", "96": "x"}

secret_word = ""
for i in range(0, len(modes), 2):
    secret_word += code_map[str(modes[i]) + str(modes[i + 1])]
print(secret_word)

toulouse


Knowing the answer is "toulouse", we will build y_test in order to evaluate the model

In [36]:
y_test = np.array(list("toulouse"))
y_test = [list(code_map.keys())[list(code_map.values()).index(c)]
          for c in y_test]
y_test = np.array([[int(c[0]), int(c[1])] for c in y_test])
y_test = y_test.flatten()
y_test = np.array([y_test for _ in range(10)])

Evaluate the model

In [37]:
print("Confusion matrix:")
print(confusion_matrix(y_test.flatten(), y_pred.flatten()))
print("Accuracy score:")
print(accuracy_score(y_test.flatten(), y_pred.flatten()))
print("Classification report:")
print(classification_report(y_test.flatten(), y_pred.flatten(), zero_division=0))

Confusion matrix:
[[27  1  2  3  0  1  2  0  3  1]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  9  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0  0 25  0  0  1  1  1]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  1  0  1 25  1  0  1]
 [ 1  3  1  1  0  0  0  4  0  0]
 [ 7  0  3  2  0  0  5  0 23  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
Accuracy score:
0.70625
Classification report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72        40
           1       0.00      0.00      0.00         0
           2       0.56      0.90      0.69        10
           3       0.00      0.00      0.00         0
           4       1.00      0.83      0.91        30
           5       0.00      0.00      0.00         0
           6       0.78      0.83      0.81        30
           7       0.67      0.40      0.50        10
           8       0.85      0.57      0.69        40
           9       0.00      0.00      0.00         0

    accu

The accuracy is 70.625%

We will now try to improve the model by using GridSearchCV to find the best parameters.

In [43]:
# Define the parameters
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16],
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(6),
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=dt,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters:")
print(grid_search.best_params_)

Best parameters:
{'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 2}


800 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
800 fits failed with the following error:
Traceback (most recent call last):
  File "/home/omar/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/omar/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/omar/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/omar/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constr

In [45]:
# Predict the test set results
y_pred = grid_search.predict(X_test)

# Inverse transform the predictions
y_pred = labelencoder.inverse_transform(y_pred)

# Reshape the result to (10, 16)
y_pred = y_pred.reshape(10, 16)

# Calculate the mode for every column
modes = [statistics.mode(column) for column in y_pred.transpose()]

# Find the secret word
secret_word = ""
for i in range(0, len(modes), 2):
    secret_word += code_map[str(modes[i]) + str(modes[i + 1])]
print(secret_word)

toulouse


We got the correct answer

In [44]:
# Use previously generated y_test to evaluate the model
print("Confusion matrix:")
print(confusion_matrix(y_test.flatten(), y_pred.flatten()))
print("Accuracy score:")
print(accuracy_score(y_test.flatten(), y_pred.flatten()))
print("Classification report:")
print(classification_report(y_test.flatten(), y_pred.flatten(), zero_division=0))

Confusion matrix:
[[29  0  0  2  0  0  3  0  4  2]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  9  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  3  1  0 23  0  0  1  1  1]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  3  0  0  2 25  0  0  0]
 [ 1  3  0  0  0  0  0  6  0  0]
 [ 1  0  6  3  0  0  3  0 27  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
Accuracy score:
0.74375
Classification report:
              precision    recall  f1-score   support

           0       0.94      0.72      0.82        40
           1       0.00      0.00      0.00         0
           2       0.47      0.90      0.62        10
           3       0.00      0.00      0.00         0
           4       1.00      0.77      0.87        30
           5       0.00      0.00      0.00         0
           6       0.81      0.83      0.82        30
           7       0.86      0.60      0.71        10
           8       0.84      0.68      0.75        40
           9       0.00      0.00      0.00         0

    accu

The accuracy is 74.375%, not a big improvement, but the precision of each class is better