In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Classification

In [2]:
data = pd.read_csv('fish.csv')

X = data.drop(columns=['Species'])

# Target variable ('Species')
y = data['Species']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report_str)


Accuracy: 0.88
Classification Report:
               precision    recall  f1-score   support

       Bream       1.00      1.00      1.00        10
      Parkki       1.00      1.00      1.00         1
       Perch       0.69      1.00      0.82         9
        Pike       1.00      1.00      1.00         3
       Roach       0.00      0.00      0.00         1
       Smelt       1.00      1.00      1.00         5
   Whitefish       0.00      0.00      0.00         3

    accuracy                           0.88        32
   macro avg       0.67      0.71      0.69        32
weighted avg       0.79      0.88      0.82        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1. **Accuracy**: The overall accuracy of the classification model is **0.88** (or 88%). This means that approximately 88% of the predictions made by the model are correct.

2. **Precision**:
   - Precision measures the proportion of true positive predictions (correctly predicted instances) out of all positive predictions (both true positives and false positives).
   - For each class (species):
       - Bream: 100% precision (all predicted Bream instances are correct).
       - Parkki: 100% precision.
       - Perch: 69% precision (some false positives).
       - Pike: 100% precision.
       - Roach: 0% precision (all predicted Roach instances are incorrect).
       - Smelt: 100% precision.
       - Whitefish: 0% precision.
   - Weighted average precision: 79%

3. **Recall (Sensitivity)**:
   - Recall measures the proportion of true positive predictions out of all actual positive instances.
   - For each class:
       - Bream: 100% recall (all actual Bream instances are correctly predicted).
       - Parkki: 100% recall.
       - Perch: 100% recall (no false negatives).
       - Pike: 100% recall.
       - Roach: 0% recall (all actual Roach instances are missed).
       - Smelt: 100% recall.
       - Whitefish: 0% recall (all actual Whitefish instances are missed).
   - Weighted average recall: 88%

4. **F1-Score**:
   - The F1-score balances precision and recall, providing a single metric.
   - It considers both false positives and false negatives.
   - Weighted average F1-score: 82%

5. **Support**:
   - The number of instances (samples) for each class.
   - For example, there are 10 instances of Bream, 1 instance of Parkki, and so on.

6. **Macro Average**:
   - The average precision, recall, and F1-score across all classes (unweighted).
   - Macro average precision: 67%
   - Macro average recall: 71%
   - Macro average F1-score: 69%

7. **Weighted Average**:
   - The average precision, recall, and F1-score, weighted by the number of instances in each class.
   - Weighted average precision: 79%
   - Weighted average recall: 88%
   - Weighted average F1-score: 82%

In summary, the model performs well in some classes (e.g., Bream, Parkki, Pike, Smelt) but struggles with others (e.g., Roach, Whitefish). Improving precision and recall for the challenging classes would enhance overall model performance. 📊🐟

In [3]:
from imblearn.over_sampling import SMOTE  # For addressing class imbalance

# Features (excluding 'Species')
X = data.drop(columns=['Species'])

# Target variable ('Species')
y = data['Species']

# Address class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report_str)

Accuracy: 0.77
Classification Report:
               precision    recall  f1-score   support

       Bream       1.00      0.88      0.93        16
      Parkki       0.71      1.00      0.83         5
       Perch       1.00      0.29      0.45        17
        Pike       1.00      1.00      1.00        10
       Roach       0.44      0.80      0.57        10
       Smelt       0.78      1.00      0.88         7
   Whitefish       0.75      0.86      0.80        14

    accuracy                           0.77        79
   macro avg       0.81      0.83      0.78        79
weighted avg       0.85      0.77      0.76        79



1. **Accuracy (Overall Performance)**:
   - The overall accuracy of the model is **0.77** (or 77%).
   - This means that approximately 77% of the predictions made by the model are correct.

2. **Precision**:
   - Precision measures the proportion of true positive predictions (correctly predicted instances) out of all positive predictions (both true positives and false positives).
   - For each class (species):
       - Bream: 100% precision (all predicted Bream instances are correct).
       - Parkki: 71% precision.
       - Perch: 100% precision (some false positives).
       - Pike: 100% precision.
       - Roach: 44% precision (some false positives).
       - Smelt: 78% precision.
       - Whitefish: 75% precision.

3. **Recall (Sensitivity)**:
   - Recall measures the proportion of true positive predictions out of all actual positive instances.
   - For each class:
       - Bream: 88% recall (some actual Bream instances are missed).
       - Parkki: 100% recall.
       - Perch: 29% recall (many false negatives).
       - Pike: 100% recall.
       - Roach: 80% recall.
       - Smelt: 100% recall.
       - Whitefish: 86% recall.

4. **F1-Score**:
   - The F1-score balances precision and recall, providing a single metric.
   - It considers both false positives and false negatives.
   - Weighted average F1-score: 76%

5. **Macro Average**:
   - The average precision, recall, and F1-score across all classes (unweighted).
   - Macro average precision: 81%
   - Macro average recall: 83%
   - Macro average F1-score: 78%

6. **Weighted Average**:
   - The average precision, recall, and F1-score, weighted by the number of instances in each class.
   - Weighted average precision: 85%
   - Weighted average recall: 77%
   - Weighted average F1-score: 76%

In summary, the model's performance has slightly decreased compared to the previous accuracy of 0.88. However, this makes the model more generalized. It still performs well for some classes but struggles with others. Improving recall for classes like Perch and Roach could enhance overall model effectiveness. 📊🐟

In [4]:
import pickle

# Save the model to disk
filename = './models/finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [5]:
# Load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

# Use the loaded model to make predictions
y_pred = loaded_model.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report_str)

Accuracy: 0.77
Classification Report:
               precision    recall  f1-score   support

       Bream       1.00      0.88      0.93        16
      Parkki       0.71      1.00      0.83         5
       Perch       1.00      0.29      0.45        17
        Pike       1.00      1.00      1.00        10
       Roach       0.44      0.80      0.57        10
       Smelt       0.78      1.00      0.88         7
   Whitefish       0.75      0.86      0.80        14

    accuracy                           0.77        79
   macro avg       0.81      0.83      0.78        79
weighted avg       0.85      0.77      0.76        79



In [9]:
# X_test_scaled = scaler.transform(X_test)
X_test

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
78,78.000000,16.800000,18.700000,19.400000,5.199200,3.123400
274,384.220176,29.182110,31.370550,34.664770,9.449847,5.305398
246,553.915243,40.384355,43.302505,46.126706,7.779587,5.144507
55,270.000000,23.600000,26.000000,28.700000,8.380400,4.247600
387,785.753632,33.279212,35.891166,39.156375,11.187795,5.856610
...,...,...,...,...,...,...
361,473.947581,27.301271,29.776807,32.703415,10.165772,5.995808
82,110.000000,19.000000,21.000000,22.500000,5.692500,3.555000
114,700.000000,34.500000,37.000000,39.400000,10.835000,6.264600
3,363.000000,26.300000,29.000000,33.500000,12.730000,4.455500


In [10]:
# User input
print("Please enter the following features: Weight, Length1, Length2, Length3, Height, Width")
user_input = [float(x) for x in input().split()]

# Convert user input into a numpy array and reshape it
user_input = np.array(user_input).reshape(1, -1)

# Standardize user input
user_input_scaled = scaler.transform(user_input)

# Use the loaded model to make predictions
user_pred = loaded_model.predict(user_input_scaled)

print(f"Predicted class: {user_pred[0]}")

Please enter the following features: Weight, Length1, Length2, Length3, Height, Width
Predicted class: Roach




In [13]:
# Features (excluding 'Species')
X = data.drop(columns=['Species'])

# Target variable ('Species')
y = data['Species'].astype(str)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Make probability predictions
y_prob = model.predict_proba(X_test_scaled)

threshold = 0.3
y_pred = (y_prob[:, 2] > threshold) | (y_prob[:, 4] > threshold)  # Perch and Roach indices

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report_str)

Accuracy: 0.00
Classification Report:
               precision    recall  f1-score   support

       Bream       1.00      1.00      1.00        10
      Parkki       1.00      1.00      1.00         1
       Perch       0.69      1.00      0.82         9
        Pike       1.00      1.00      1.00         3
       Roach       0.00      0.00      0.00         1
       Smelt       1.00      1.00      1.00         5
   Whitefish       0.00      0.00      0.00         3

    accuracy                           0.88        32
   macro avg       0.67      0.71      0.69        32
weighted avg       0.79      0.88      0.82        32



# Regression

In [8]:
# Features (excluding 'Weight')
X_reg = data.drop(columns=['Weight'])

# Convert categorical variables into numerical form
X_reg = pd.get_dummies(X_reg, columns=['Species'])

# Target variable ('Weight')
y_reg = data['Weight']

# Split data into training and testing sets
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Train a linear regression model
reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)

# Make weight predictions
y_pred_reg = reg_model.predict(X_test_reg)

# Evaluate model performance (Root Mean Squared Error)
rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)

print(f"Root Mean Squared Error: {rmse:.2f} grams")


Root Mean Squared Error: 83.71 grams
