In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Location of the raw dataset, relative to the notebook's working directory.
FILE = "data.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FILE)

# Show the first five rows (head()'s default) as the cell output.
df.head(n=5)

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,no,Aquari
1,cosmix xeno nebuz odbitaz,5,yes,Zorblax
2,solarix glixx novum galaxum quasar,5,yes,Zorblax
3,arbor insectus pesros ekos dootix nimbus,2,yes,Florian
4,mermax drakos lorix epikoz deftax,4,no,Faerix


In [7]:
# Load the embedding table; the first CSV column is used as the row index.
word_embeddings = pd.read_csv("word_embeddings.csv", index_col=0)

# Each row is keyed by a word (the index) and its remaining columns hold the
# embedding components. Transposing moves the words onto the columns, so
# to_dict(orient="list") produces {word: [component_0, component_1, ...]}.
word_embeddings_dict = word_embeddings.transpose().to_dict(orient="list")

{'nanobyt': [0.024847899,
  0.013658229,
  -0.037900627,
  -0.022817208,
  -0.035767447,
  0.045730244,
  0.034466516,
  0.005288638,
  0.018804137,
  0.047740523,
  -0.004359424,
  0.009731747,
  0.04037625,
  0.0074098706,
  -0.046923578,
  -0.019185413,
  0.042182874,
  0.0057216287,
  -0.02785033,
  -0.039220583,
  -0.00094970316,
  -0.025980389,
  -0.010218203,
  0.0036841035,
  0.00011490658,
  -0.037118852,
  0.03933264,
  -0.034793817,
  -0.04545753,
  -0.007981479,
  0.035712156,
  0.041328263],
 'kzakos': [-0.04573065,
  -0.00445316,
  -0.047460068,
  0.022347603,
  -0.021559525,
  -0.031214667,
  -0.040070117,
  0.041115012,
  0.041803565,
  -0.043690573,
  -0.040105045,
  -0.04875758,
  0.03352474,
  0.0064952,
  -0.017026614,
  0.030863795,
  -0.045859933,
  0.049562145,
  -0.018376492,
  0.03692067,
  -0.04003402,
  0.049054597,
  -0.005157195,
  -0.012140892,
  0.035066318,
  -0.01492212,
  0.033803847,
  0.032882977,
  -0.039657366,
  -0.034219705,
  -0.04200505,
  -0.0

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin, BaseEstimator

class WordsTransformer(BaseEstimator, TransformerMixin):
    """Represent each message by the mean of its words' embedding vectors.

    A message is split on whitespace; every token found in the embedding
    lookup contributes its vector, and the vectors are averaged element-wise.
    Messages with no known token (or that are not strings at all, e.g. NaN
    from an empty CSV field) are mapped to an all-zero vector.

    Parameters
    ----------
    embeddings : dict or None, default=None
        Mapping ``word -> sequence of floats``. When ``None``, the
        module-level ``word_embeddings_dict`` is looked up at transform time,
        which matches how this class behaved before the parameter existed.
    embedding_dim : int, default=32
        Length of each embedding vector; also the width of the zero row
        emitted for messages without any known word.
    """

    def __init__(self, embeddings=None, embedding_dim=32):
        # scikit-learn's get_params/clone machinery expects constructor
        # arguments to be stored unmodified under the same attribute names.
        self.embeddings = embeddings
        self.embedding_dim = embedding_dim

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return a ``(n_messages, embedding_dim)`` float array of mean embeddings.

        Parameters
        ----------
        X : iterable of str
            The messages, one per sample (e.g. a pandas Series).

        Returns
        -------
        numpy.ndarray
            One row per message. The result is always 2-D, including for
            empty input, so it can be stacked with other feature blocks.
        """
        # Resolve the lookup lazily so the notebook-level dict is still used
        # when no explicit table was supplied.
        lookup = word_embeddings_dict if self.embeddings is None else self.embeddings

        messages = list(X)
        # Pre-filled with zeros: rows that find no known word stay as zeros.
        features = np.zeros((len(messages), self.embedding_dim), dtype=float)

        for row, message in enumerate(messages):
            # Missing values arrive as float NaN and have no .split(); treat
            # them like a message without known words.
            if not isinstance(message, str):
                continue
            vectors = [lookup[word] for word in message.split() if word in lookup]
            if vectors:
                features[row] = np.mean(vectors, axis=0)

        return features

# Start from a fresh copy of the file so re-running this cell is idempotent.
df = pd.read_csv(FILE)

# Swap the species names for integer class ids; `le` retains the mapping.
le = LabelEncoder().fit(df['species'])
df['species'] = le.transform(df['species'])

FEATURE_COLUMNS = ['message', 'fingers', 'tail']
species_labels = df['species']

# Message branch: average the word embeddings of each message.
message_steps = Pipeline([
    ('word_to_sentence', WordsTransformer()),
])

# One transformer per input column. 'message' is given as a bare string so
# the message branch receives a 1-D column of texts; the other two are given
# as lists so their transformers receive 2-D input.
preprocessor = ColumnTransformer(
    transformers=[
        ('message_transform', message_steps, 'message'),
        # Standardize the numeric "fingers" feature.
        ('fingers_scaler', StandardScaler(), ['fingers']),
        # One-hot encode the categorical "tail" feature.
        ('tail_encoder', OneHotEncoder(), ['tail']),
    ]
)

# Preprocessing followed by the classifier, as a single estimator.
forest = RandomForestClassifier(n_estimators=20, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Hold out 20% of the rows, stratified by species, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURE_COLUMNS],
    species_labels,
    test_size=0.2,
    random_state=42,
    stratify=species_labels,
)

# Fit on the training split, then predict the held-out rows.
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall/F1 on the held-out split, followed by the
# confusion matrix for the same predictions.
baseline_report = classification_report(y_test, predictions)
baseline_confusion = confusion_matrix(y_test, predictions)
for summary in (baseline_report, baseline_confusion):
    print(summary)

              precision    recall  f1-score   support

           0       0.55      0.67      0.60         9
           1       0.60      0.82      0.69        11
           2       0.73      0.73      0.73        11
           3       0.75      0.33      0.46         9
           4       0.67      0.80      0.73        10
           5       0.56      0.90      0.69        10
           6       1.00      0.33      0.50         9
           7       0.53      0.67      0.59        12
           8       0.86      0.67      0.75         9
           9       0.67      0.40      0.50        10

    accuracy                           0.64       100
   macro avg       0.69      0.63      0.62       100
weighted avg       0.68      0.64      0.63       100

[[6 0 0 0 1 0 0 1 1 0]
 [0 9 0 0 1 1 0 0 0 0]
 [0 1 8 1 0 1 0 0 0 0]
 [1 2 1 3 0 2 0 0 0 0]
 [1 0 0 0 8 0 0 1 0 0]
 [0 0 0 0 1 9 0 0 0 0]
 [0 3 0 0 1 2 3 0 0 0]
 [2 0 0 0 0 0 0 8 0 2]
 [0 0 2 0 0 1 0 0 6 0]
 [1 0 0 0 0 0 0 5 0 4]]


In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters. The 'classifier__' prefix routes each setting
# to the 'classifier' step of the pipeline. 3 x 4 x 2 = 24 combinations.
param_grid = dict(
    classifier__n_estimators=[100, 200, 250],    # number of trees
    classifier__max_depth=[5, 10, 20, 50],       # maximum tree depth
    classifier__max_features=['sqrt', 'log2'],   # features considered per split
)

# Exhaustive search over the grid with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    verbose=1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# The best-scoring configuration, as exposed by the search object.
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out split.
best_predictions = best_model.predict(X_test)
print(classification_report(y_test, best_predictions))

# Show which combination won.
print(grid_search.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
              precision    recall  f1-score   support

           0       0.70      0.78      0.74         9
           1       0.75      0.82      0.78        11
           2       0.92      1.00      0.96        11
           3       1.00      0.44      0.62         9
           4       0.89      0.80      0.84        10
           5       0.64      0.90      0.75        10
           6       0.86      0.67      0.75         9
           7       0.59      0.83      0.69        12
           8       1.00      0.67      0.80         9
           9       0.56      0.50      0.53        10

    accuracy                           0.75       100
   macro avg       0.79      0.74      0.74       100
weighted avg       0.78      0.75      0.75       100

{'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 200}


In [12]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned pipeline over the entire dataset.
# NOTE(review): the hyper-parameters of `best_model` were selected using
# X_train, which is a subset of these rows, so these scores may be slightly
# optimistic -- confirm whether that is acceptable for the reported numbers.
X_all = df[['message', 'fingers', 'tail']]
y_all = df['species']
scores = cross_val_score(best_model, X_all, y_all, cv=5)

# Summarize the per-fold scores: mean, worst fold and best fold.
print('Mean accuracy:', scores.mean())
print('Min accuracy:', scores.min())
print('Max accuracy:', scores.max())

Mean accuracy: 0.744
Min accuracy: 0.68
Max accuracy: 0.81
