In [1]:
#importing the necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("data.csv")

In [3]:
def preprocess_message(text):
    #converting the message to lower case so that thereis uniformity
    text = text.lower()
    #removing non alphabetic characters
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [4]:
data['message'] = data['message'].apply(preprocess_message)

In [5]:
# TF-IDF Vectorization: Converts text data into numerical features based on word importance
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_messages = tfidf_vectorizer.fit_transform(data["message"]).toarray()

In [6]:
# Encoding the categorical data tail and species using label encoder
label_encoder_species = LabelEncoder()
label_encoder_tail = LabelEncoder()

data['tail'] = label_encoder_tail.fit_transform(data['tail'])  # Yes/No to 1/0
data['species'] = label_encoder_species.fit_transform(data['species'])  # Label encode species
test = pd.read_csv("test.csv")
test['tail'] = label_encoder_tail.fit_transform(test['tail'])
test1= pd.concat([pd.DataFrame(tfidf_messages), test[['fingers', 'tail']].reset_index(drop=True)], axis=1)
test1.columns = test1.columns.astype("str")

In [7]:
# Combine TF-IDF features with 'fingers' and 'tail' columns
X1= pd.concat([pd.DataFrame(tfidf_messages), data[['fingers', 'tail']].reset_index(drop=True)], axis=1)
y = data['species']

In [8]:
# Converting the column names to string datatype to avoid error
X1.columns = X1.columns.astype("str")

In [9]:
from imblearn.over_sampling import SMOTE
# Generates synthetic samples for the minority class to address class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X1, y)

In [12]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
# Define the hyperparameter space
param_distributions = {
    'C': stats.uniform(0.01, 100),
    'penalty': ['l1', 'l2'],
    'solver': ['saga', 'lbfgs', 'liblinear'],
    'max_iter': [ 1000, 2000,3000,4000,5000]
}
classifier= LogisticRegression()
# Randomized search
random_search = RandomizedSearchCV(classifier, param_distributions, n_iter=50, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
random_search.fit(X_resampled, y_resampled)

# Best parameters and accuracy
print(f"Best parameters: {random_search.best_params_}")
print(f"Best accuracy: {random_search.best_score_}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


45 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.

Best parameters: {'C': 2.233776765919638, 'max_iter': 5000, 'penalty': 'l2', 'solver': 'saga'}
Best accuracy: 0.8666666666666666


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# After trying various models like adaboost, randomforest, xgboost, decisiontree etc we concluded that logistic regression is performing best
# for this data ( which is mostly due to its small size)
# Using the results of above RandomSearchCv we hypertuned this logistic regression model
classifier = LogisticRegression(C= 2.233776765919638, max_iter= 5000, penalty='l2', solver= 'saga')
classifier.fit(X_resampled, y_resampled)
accuracy=cross_val_score(classifier,X_resampled,y_resampled,cv=5).mean()
print(accuracy)

0.8666666666666666


In [25]:
X_resampled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,628,629,630,631,632,633,634,635,fingers,tail
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0


In [26]:
y_resampled.head()

0    0
1    9
2    9
3    4
4    3
Name: species, dtype: int32

In [27]:
test1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,628,629,630,631,632,633,634,635,fingers,tail
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0


In [28]:
test1.isna().sum()

0            0
1            0
2            0
3            0
4            0
          ... 
633          0
634          0
635          0
fingers    201
tail       201
Length: 638, dtype: int64

In [29]:
# since the test data has nan values in fingers as well as tail we did simple imputation using meadian
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
test2 = imputer.fit_transform(test1)
test2 = pd.DataFrame(test2, columns=test1.columns)

In [30]:
y_pred=classifier.predict(test2)
y_pred

array([0, 9, 9, 4, 3, 6, 6, 3, 2, 2, 8, 2, 6, 0, 6, 3, 4, 4, 1, 0, 8, 7,
       6, 4, 5, 8, 7, 7, 6, 6, 7, 2, 6, 8, 7, 1, 8, 8, 7, 1, 3, 8, 6, 4,
       4, 6, 5, 6, 9, 4, 2, 5, 7, 0, 1, 3, 0, 2, 7, 8, 3, 1, 6, 7, 8, 9,
       9, 4, 8, 7, 3, 4, 8, 5, 3, 7, 1, 7, 7, 3, 7, 1, 7, 0, 3, 5, 7, 6,
       2, 0, 9, 9, 7, 3, 6, 7, 6, 8, 1, 6, 3, 5, 8, 7, 7, 4, 7, 6, 8, 3,
       2, 8, 6, 9, 3, 7, 9, 3, 8, 1, 7, 8, 7, 7, 7, 7, 2, 2, 3, 4, 8, 7,
       7, 6, 6, 0, 2, 4, 6, 6, 2, 0, 3, 6, 7, 5, 2, 7, 7, 8, 2, 2, 0, 4,
       8, 5, 1, 7, 1, 3, 3, 6, 7, 1, 9, 3, 7, 1, 8, 3, 8, 4, 9, 2, 2, 9,
       6, 6, 6, 2, 7, 3, 7, 8, 7, 6, 2, 3, 7, 0, 4, 9, 6, 9, 0, 1, 9, 1,
       2, 5, 7, 7, 9, 6, 8, 8, 3, 6, 0, 0, 9, 6, 5, 6, 2, 7, 6, 8, 4, 6,
       2, 8, 0, 0, 5, 9, 2, 9, 7, 4, 7, 8, 9, 3, 1, 3, 3, 0, 3, 5, 3, 8,
       7, 8, 7, 8, 9, 2, 3, 0, 9, 3, 6, 9, 0, 7, 2, 0, 3, 4, 6, 3, 9, 0,
       4, 6, 8, 4, 5, 8, 2, 7, 3, 7, 3, 5, 9, 2, 1, 3, 6, 5, 8, 0, 0, 8,
       2, 0, 7, 4, 7, 1, 2, 4, 4, 9, 8, 1, 1, 8, 6,

In [31]:
# Decoding the predicted output back to the species names
decoded_data = label_encoder_species.inverse_transform(y_pred)
print(decoded_data)

['Aquari' 'Zorblax' 'Zorblax' 'Florian' 'Faerix' 'Nexoon' 'Nexoon'
 'Faerix' 'Emotivor' 'Emotivor' 'Sentire' 'Emotivor' 'Nexoon' 'Aquari'
 'Nexoon' 'Faerix' 'Florian' 'Florian' 'Cybex' 'Aquari' 'Sentire'
 'Quixnar' 'Nexoon' 'Florian' 'Mythron' 'Sentire' 'Quixnar' 'Quixnar'
 'Nexoon' 'Nexoon' 'Quixnar' 'Emotivor' 'Nexoon' 'Sentire' 'Quixnar'
 'Cybex' 'Sentire' 'Sentire' 'Quixnar' 'Cybex' 'Faerix' 'Sentire' 'Nexoon'
 'Florian' 'Florian' 'Nexoon' 'Mythron' 'Nexoon' 'Zorblax' 'Florian'
 'Emotivor' 'Mythron' 'Quixnar' 'Aquari' 'Cybex' 'Faerix' 'Aquari'
 'Emotivor' 'Quixnar' 'Sentire' 'Faerix' 'Cybex' 'Nexoon' 'Quixnar'
 'Sentire' 'Zorblax' 'Zorblax' 'Florian' 'Sentire' 'Quixnar' 'Faerix'
 'Florian' 'Sentire' 'Mythron' 'Faerix' 'Quixnar' 'Cybex' 'Quixnar'
 'Quixnar' 'Faerix' 'Quixnar' 'Cybex' 'Quixnar' 'Aquari' 'Faerix'
 'Mythron' 'Quixnar' 'Nexoon' 'Emotivor' 'Aquari' 'Zorblax' 'Zorblax'
 'Quixnar' 'Faerix' 'Nexoon' 'Quixnar' 'Nexoon' 'Sentire' 'Cybex' 'Nexoon'
 'Faerix' 'Mythron' 'Sentire'

In [32]:
decoded_data

array(['Aquari', 'Zorblax', 'Zorblax', 'Florian', 'Faerix', 'Nexoon',
       'Nexoon', 'Faerix', 'Emotivor', 'Emotivor', 'Sentire', 'Emotivor',
       'Nexoon', 'Aquari', 'Nexoon', 'Faerix', 'Florian', 'Florian',
       'Cybex', 'Aquari', 'Sentire', 'Quixnar', 'Nexoon', 'Florian',
       'Mythron', 'Sentire', 'Quixnar', 'Quixnar', 'Nexoon', 'Nexoon',
       'Quixnar', 'Emotivor', 'Nexoon', 'Sentire', 'Quixnar', 'Cybex',
       'Sentire', 'Sentire', 'Quixnar', 'Cybex', 'Faerix', 'Sentire',
       'Nexoon', 'Florian', 'Florian', 'Nexoon', 'Mythron', 'Nexoon',
       'Zorblax', 'Florian', 'Emotivor', 'Mythron', 'Quixnar', 'Aquari',
       'Cybex', 'Faerix', 'Aquari', 'Emotivor', 'Quixnar', 'Sentire',
       'Faerix', 'Cybex', 'Nexoon', 'Quixnar', 'Sentire', 'Zorblax',
       'Zorblax', 'Florian', 'Sentire', 'Quixnar', 'Faerix', 'Florian',
       'Sentire', 'Mythron', 'Faerix', 'Quixnar', 'Cybex', 'Quixnar',
       'Quixnar', 'Faerix', 'Quixnar', 'Cybex', 'Quixnar', 'Aquari',
       'Faeri

In [33]:
test

Unnamed: 0,message,fingers,tail
0,iephyr terram nimbus terram faunar foliar,2,0
1,joyzor uleex luvium caloox shockus blissae,4,1
2,aquos arbor ventuc,4,1
3,nympha nympha epikoz nympha mythox mythox mythox,3,0
4,diitax sibenix fabulon,4,1
...,...,...,...
294,fabuion drakos lorix relikum cyclopix,4,1
295,codex cybrex algorix synapz mechan nanozom dotax,5,1
296,centarex mythox nympha krakos,3,1
297,orbitaz astron glixx novara novrm ufox qcasar ...,3,0


In [35]:
df = pd.DataFrame(decoded_data, columns=['Predictions'])
df.columns = ["species"]
# Save the DataFrame to a CSV file with index
df.to_csv("result_ssr.csv",index=True)

In [36]:
result = pd.read_csv("result_ssr.csv")
result

Unnamed: 0.1,Unnamed: 0,species
0,0,Aquari
1,1,Zorblax
2,2,Zorblax
3,3,Florian
4,4,Faerix
...,...,...
495,495,Sentire
496,496,Quixnar
497,497,Quixnar
498,498,Aquari


In [37]:
#renaming the columns to index, species
result.columns = ["index","species"]
result

Unnamed: 0,index,species
0,0,Aquari
1,1,Zorblax
2,2,Zorblax
3,3,Florian
4,4,Faerix
...,...,...
495,495,Sentire
496,496,Quixnar
497,497,Quixnar
498,498,Aquari


In [39]:
result.to_csv("result_ssr.csv",index=True)