In [143]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from datetime import datetime, timedelta
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [144]:
# Load dataset
data = pd.read_csv('/content/dataset-1.csv')
data.reset_index(drop=True, inplace=True)

In [145]:
data.head(5)

Unnamed: 0,ph,temperature,turbidity,fish
0,6.0,27.0,4.0,katla
1,7.6,28.0,5.9,sing
2,7.8,27.0,5.5,sing
3,6.5,31.0,5.5,katla
4,8.2,27.0,8.5,prawn


In [146]:
# Get distinct categorical values from the "fish" column
distinct_fish_species = data['fish'].unique()

# Print the distinct values
for fish_species in distinct_fish_species:
    print(fish_species)

katla
sing
prawn
rui
koi
pangas
tilapia
silverCup
karpio
magur
shrimp


In [147]:
#Checking for null values
print(data.isna().sum())

ph             0
temperature    0
turbidity      0
fish           0
dtype: int64


In [148]:
original_class_counts = data['fish'].value_counts()
original_class_counts

fish
tilapia      129
rui           99
pangas        78
katla         58
silverCup     55
shrimp        50
sing          49
karpio        33
koi           15
prawn         14
magur         11
Name: count, dtype: int64

## Dataset balancing

In [149]:
from imblearn.over_sampling import RandomOverSampler

# Separate features and target variable
X = data[['ph', 'temperature', 'turbidity']]
y = data['fish']

# Instantiate the RandomOverSampler
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)

# Fit and transform the dataset to oversample the minority classes
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Create a new balanced DataFrame
data = pd.concat([X_resampled, y_resampled], axis=1)

balanced_class_counts = data['fish'].value_counts()
balanced_class_counts

fish
katla        129
sing         129
prawn        129
rui          129
koi          129
pangas       129
tilapia      129
silverCup    129
karpio       129
magur        129
shrimp       129
Name: count, dtype: int64

In [150]:
# Define feature matrix X and target vector Y
X = data[['ph', 'temperature', 'turbidity']]
Y = data['fish']
# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [151]:
# Train decision tree classification model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)

In [152]:
# Train KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, Y_train)

In [153]:
# Train RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, Y_train)

In [154]:
# Using Support Vector
svc_model = SVC()
svc_model.fit(X_train, Y_train)

In [155]:
#Using GaussianNB
gau_model = GaussianNB()
gau_model.fit(X_train, Y_train)

In [156]:
# Create a table to store performance metrics
pf_table = {
    'Model': [],
    'Accuracy': []
}

In [157]:
# Evaluate DecisionTreeClassifier
dt_predictions = dt_model.predict(X_test)

dt_accuracy = accuracy_score(Y_test, dt_predictions)
pf_table["Model"].append("DecisionTreeClassifier")
pf_table["Accuracy"].append(dt_accuracy)

In [158]:
# Evaluate KNeighborsClassifier
knn_predictions = knn_model.predict(X_test)

knn_accuracy = accuracy_score(Y_test, knn_predictions)
pf_table["Model"].append("KNeighborsClassifier")
pf_table["Accuracy"].append(knn_accuracy)

In [159]:
# Evaluate RandomForestClassifier
rf_predictions = rf_model.predict(X_test)

rf_accuracy = accuracy_score(Y_test, rf_predictions)
pf_table["Model"].append("RandomForestClassifier")
pf_table["Accuracy"].append(rf_accuracy)

In [160]:
# Evaluate Support Vector
svc_predictions = svc_model.predict(X_test)

svc_accuracy = accuracy_score(Y_test, svc_predictions)
pf_table["Model"].append("Support Vector")
pf_table["Accuracy"].append(svc_accuracy)

In [161]:
# Evaluate GaussianNB
gau_predictions = gau_model.predict(X_test)

gau_accuracy = accuracy_score(Y_test, gau_predictions)
pf_table["Model"].append("GaussianNB")
pf_table["Accuracy"].append(gau_accuracy)

In [162]:
# Convert performance metrics to DataFrame
pf_metrics = pd.DataFrame(pf_table)
print(pf_metrics)

                    Model  Accuracy
0  DecisionTreeClassifier  0.978873
1    KNeighborsClassifier  0.908451
2  RandomForestClassifier  0.978873
3          Support Vector  0.496479
4              GaussianNB  0.517606


In [168]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef,classification_report, cohen_kappa_score

y_pred = dt_model.predict(X_test)

cm_test = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(cm_test)

# Calculate other metrics
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred, average='weighted')
recall = recall_score(Y_test, y_pred, average='weighted')
f1 = f1_score(Y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(Y_test, y_pred)
kappa = cohen_kappa_score(Y_test, y_pred)

print("Accuracy:", round(accuracy * 100, 2), "%")
print("Precision:", round(precision * 100, 2), "%")
print("Recall:", round(recall * 100, 2), "%")
print("F1-Score:", round(f1 * 100, 2), "%")
print("Matthews Correlation Coefficient:", round(mcc * 100, 2), "%")
print("Kappa Statistic:", round(kappa * 100, 2), "%")

Confusion Matrix:
[[28  0  0  0  0  0  0  0  0  0  0]
 [ 0 22  2  0  0  0  0  0  0  0  0]
 [ 0  0 17  0  0  0  0  0  0  0  0]
 [ 0  0  0 25  0  0  0  0  0  0  0]
 [ 0  1  0  0 29  1  0  0  0  0  0]
 [ 0  0  0  0  0 32  0  0  0  0  0]
 [ 0  0  0  0  0  0 23  0  0  0  0]
 [ 0  0  0  0  0  0  0 24  0  0  0]
 [ 0  0  0  0  0  0  0  0 26  0  0]
 [ 0  0  0  0  0  1  0  0  0 31  0]
 [ 0  0  1  0  0  0  0  0  0  0 21]]
Accuracy: 97.89 %
Precision: 98.07 %
Recall: 97.89 %
F1-Score: 97.91 %
Matthews Correlation Coefficient: 97.68 %
Kappa Statistic: 97.67 %


In [169]:
print("Classification report")
print(classification_report(Y_test,y_pred))

Classification report
              precision    recall  f1-score   support

      karpio       1.00      1.00      1.00        28
       katla       0.96      0.92      0.94        24
         koi       0.85      1.00      0.92        17
       magur       1.00      1.00      1.00        25
      pangas       1.00      0.94      0.97        31
       prawn       0.94      1.00      0.97        32
         rui       1.00      1.00      1.00        23
      shrimp       1.00      1.00      1.00        24
   silverCup       1.00      1.00      1.00        26
        sing       1.00      0.97      0.98        32
     tilapia       1.00      0.95      0.98        22

    accuracy                           0.98       284
   macro avg       0.98      0.98      0.98       284
weighted avg       0.98      0.98      0.98       284



## Predict fish species for a given aquatic environment

In [163]:
ph = 6.7
temp = 24.3
turb = 14.1

features = [[ph, temp, turb]]

# Predict the values
predicted_values = dt_model.predict(features)

# Print the predicted fish species
print("Predicted fish species:", predicted_values[0])

Predicted fish species: karpio




## Save the model into pickle file

In [171]:
import pickle
with open('dt_model_pkl', 'wb') as files:
    pickle.dump(rf_model, files)
# pickle.dump(rf_model, open('rf_model.pkl','wb'))

## Predict more possible fish species for given aquatic environment

In [164]:
# Predict fish species for the given pH value
ph_value = 7.9
temp_value = 33.5
turb_value = 13.2

data2 = pd.read_csv('/content/dataset-1.csv')

# Get all possible fish species for the given pH value
possible_species1 = set(data2[data2['ph'] == ph_value]['fish'])
possible_species2 = set(data2[data2['temperature'] == temp_value]['fish'])
possible_species3 = set(data2[data2['turbidity'] == turb_value]['fish'])

# Combine multiple sets and convert to a list
combined_species = list(possible_species1 | possible_species2 | possible_species3)

print(f'Possible Species: {combined_species}')

Possible Species: ['tilapia', 'shrimp', 'sing', 'rui', 'prawn']


## Predict aquatic environment for a given fish species

In [165]:
fish_name = "rui";

# Find the samples in the dataset with the same fish label
suitable_data = data[data['fish'] == fish_name]

# Calculate the pH range for the selected fish
min_ph = suitable_data['ph'].min()
max_ph = suitable_data['ph'].max()

# Calculate the temperature range for the selected fish
min_temp = suitable_data['temperature'].min()
max_temp = suitable_data['temperature'].max()

# Calculate the turbidity range for the selected fish
min_turb = suitable_data['turbidity'].min()
max_turb = suitable_data['turbidity'].max()

print("Inputed fish species : ",fish_name)
print("PH : ",min_ph,"-",max_ph)
print("Temperature : ",min_temp,"-",max_temp)
print("Turbidity : ",min_turb,"-",max_turb)

Inputed fish species :  rui
PH :  6.0 - 8.9
Temperature :  20.87 - 35.0
Turbidity :  3.31 - 7.5
