In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Path of the labelled training data, relative to the notebook's working directory.
FILE = 'data.csv'

# Load the full CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FILE)

# Show the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,no,Aquari
1,cosmix xeno nebuz odbitaz,5,yes,Zorblax
2,solarix glixx novum galaxum quasar,5,yes,Zorblax
3,arbor insectus pesros ekos dootix nimbus,2,yes,Florian
4,mermax drakos lorix epikoz deftax,4,no,Faerix


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin, BaseEstimator

# Custom transformer to take the first 3 letters of each word in the message
class TruncateWordsTransformer(BaseEstimator, TransformerMixin):
    """Shorten every whitespace-separated word of each message to a fixed-length prefix.

    Parameters
    ----------
    n_chars : int, default=3
        Number of leading characters kept from each word. The default
        reproduces the original hard-coded behaviour (first 3 letters).
        Because it is a constructor argument stored under the same name, it is
        exposed through ``get_params``/``set_params`` and can be tuned in a
        grid search.
    """

    def __init__(self, n_chars=3):
        # Only store the argument here; validation is deferred to fit() so
        # that set_params() can change the value after construction.
        self.n_chars = n_chars

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless); validate ``n_chars``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_chars`` is smaller than 1.
        """
        if self.n_chars < 1:
            raise ValueError(f"n_chars must be >= 1, got {self.n_chars!r}")
        return self

    def transform(self, X):
        """Return a list with one truncated string per sentence in ``X``.

        Each sentence is split on whitespace, every word is cut down to its
        first ``n_chars`` characters, and the pieces are re-joined with single
        spaces.
        """
        prefix_len = self.n_chars
        return [
            ' '.join(word[:prefix_len] for word in sentence.split())
            for sentence in X
        ]

# Encode species labels as integer ids; `le` is kept so predictions can be
# mapped back to species names with `le.inverse_transform`.
# NOTE(review): this overwrites df['species'] in place, so re-running this cell
# without reloading `df` would refit the encoder on the already-encoded
# integers and lose the name mapping. Reload the data before re-running.
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

# Per-column preprocessing:
#   * 'message' -> word truncation followed by TF-IDF
#   * 'fingers' -> standardized numeric feature
#   * 'tail'    -> one-hot encoded yes/no feature
preprocessor = ColumnTransformer(
    transformers=[
        ('message_transform', Pipeline([
            ('truncate_words', TruncateWordsTransformer()),  # Custom word truncation
            ('tfidf', TfidfVectorizer())  # Apply TF-IDF after truncation
        ]), 'message'),
        ('fingers_scaler', StandardScaler(), ['fingers']),  # Standardize the "fingers" numeric feature
        # handle_unknown='ignore': a category that was not seen while fitting
        # (e.g. in a CV fold or in an unseen test file) is encoded as all
        # zeros instead of raising an error at predict time.
        ('tail_encoder', OneHotEncoder(handle_unknown='ignore'), ['tail'])
    ]
)

# Full pipeline including model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
])

# Train-test split: 80/20, stratified on the label so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['message', 'fingers', 'tail']],
    df['species'],
    test_size=0.2,
    random_state=42,
    stratify=df['species'],
)

# Train the model
pipeline.fit(X_train, y_train)

# Predictions on the held-out split (encoded integer labels)
predictions = pipeline.predict(X_test)

In [3]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the baseline pipeline on the held-out split. Class labels appear as the
# integer ids produced by the LabelEncoder.
baseline_report = classification_report(y_test, predictions)
baseline_confusion = confusion_matrix(y_test, predictions)

print(baseline_report)
print(baseline_confusion)

              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       0.90      0.82      0.86        11
           2       1.00      1.00      1.00        11
           3       0.88      0.78      0.82         9
           4       0.89      0.80      0.84        10
           5       0.82      0.90      0.86        10
           6       0.80      0.89      0.84         9
           7       0.55      0.92      0.69        12
           8       1.00      0.89      0.94         9
           9       0.75      0.30      0.43        10

    accuracy                           0.81       100
   macro avg       0.84      0.81      0.81       100
weighted avg       0.83      0.81      0.80       100

[[ 7  0  0  0  1  0  0  1  0  0]
 [ 0  9  0  0  0  0  2  0  0  0]
 [ 0  0 11  0  0  0  0  0  0  0]
 [ 0  0  0  7  0  2  0  0  0  0]
 [ 2  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  1  0  9  0  0  0  0]
 [ 0  1  0  0  0  0  8  0  0  0]
 [ 0  0  0

In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the random forest. The `classifier__` prefix
# addresses the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200, 250],    # number of trees
    classifier__max_depth=[5, 10, 20, 50],       # maximum tree depth
    classifier__max_features=['sqrt', 'log2'],   # features considered per split
)

# Exhaustive search over the grid with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, verbose=1)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
best_model = grid_search.best_estimator_

# Evaluate the tuned pipeline on the held-out split.
best_predictions = best_model.predict(X_test)
print(classification_report(y_test, best_predictions))

# Winning hyper-parameter combination.
print(grid_search.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.82      0.82      0.82        11
           2       0.92      1.00      0.96        11
           3       0.88      0.78      0.82         9
           4       0.90      0.90      0.90        10
           5       0.82      0.90      0.86        10
           6       0.78      0.78      0.78         9
           7       0.65      0.92      0.76        12
           8       1.00      0.89      0.94         9
           9       0.80      0.40      0.53        10

    accuracy                           0.83       100
   macro avg       0.84      0.83      0.83       100
weighted avg       0.84      0.83      0.82       100

{'classifier__max_depth': 20, 'classifier__max_features': 'log2', 'classifier__n_estimators': 200}


In [5]:
# perform cross validation on the best model
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned pipeline over the complete labelled dataset.
scores = cross_val_score(
    best_model,
    df[['message', 'fingers', 'tail']],
    df['species'],
    cv=5,
)

# Summarize the per-fold accuracies.
print('Mean accuracy:', scores.mean())
print('Min accuracy:', scores.min())
print('Max accuracy:', scores.max())

Mean accuracy: 0.8219999999999998
Min accuracy: 0.77
Max accuracy: 0.9


In [8]:
# Read the unlabelled test set.
df_test = pd.read_csv('test.csv')

# Predict encoded labels with the tuned pipeline, then map them back to
# species names through the fitted LabelEncoder.
test_features = df_test[['message', 'fingers', 'tail']]
test_predictions = le.inverse_transform(best_model.predict(test_features))

# Write the predicted species, one per row, without the DataFrame index.
result_df = pd.DataFrame({'species': test_predictions})
result_df.to_csv('result.csv', index=False)