### Step 1: Data Preprocessing

In [33]:
import pandas as pd

# Read the iris CSV and attach a 1-based sequential 'Id' column in one
# non-mutating chain (an existing 'Id' column, if any, is overwritten).
iris_data = pd.read_csv("iris_dataset.csv").assign(
    Id=lambda frame: range(1, len(frame) + 1)
)

# Separate model inputs from the target: every column other than 'Id'
# and 'Species' is treated as a feature; 'Species' is the label.
X = iris_data.drop(columns=['Id', 'Species'])  # Features
y = iris_data['Species']  # Labels

### Step 2: Model Selection

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every downstream metric) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline classifier: a 100-tree random forest, seeded so that repeated
# runs build the same ensemble.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

### Step 3: Training

In [35]:
# Fit the forest on the training split only; the test split stays unseen
# until evaluation. Left as the trailing expression so the notebook
# displays the fitted estimator.
model.fit(X=X_train, y=y_train)

### Step 4: Evaluation

In [36]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

# Overall fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision/recall/F1, then the raw confusion counts.
# sep="\n" puts each heading on its own line above its table, matching
# what two consecutive print() calls would emit.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 1.0
Classification Report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30

Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


### Step 5: Fine-tuning (optional)

In [37]:
# Fine-tune hyperparameters or try different algorithms to improve performance
from sklearn.model_selection import GridSearchCV

# Define hyperparameters to search.
# 3 x 3 x 3 x 3 = 81 candidate settings; with cv=5 below that is 405
# cross-validation fits plus one final refit.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search Cross Validation on the training split only, so the
# test split plays no part in choosing hyperparameters.
# refit=True is the default; it is spelled out because the code below
# relies on it: once the search finishes, GridSearchCV retrains the
# winning configuration on all of X_train / y_train.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, refit=True)
grid_search.fit(X_train, y_train)

# Get the best parameters found by Grid Search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# best_estimator_ has already been fitted on the full training split by
# the refit step above, so calling .fit() on it again would only repeat
# that work.
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test split
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Best Model Accuracy:", accuracy_best)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Model Accuracy: 1.0


### Step 6: Prediction

In [38]:
# With the tuned model in hand, classify previously unseen measurements.
# The frame is built and indexed by 'Id' in a single chained expression,
# so 'Id' labels the rows and is not passed to the model as a feature.
new_data = pd.DataFrame(
    {
        'Id': [1, 2, 3],
        'SepalLengthCm': [5.1, 6.2, 4.9],
        'SepalWidthCm': [3.5, 2.9, 3.1],
        'PetalLengthCm': [1.4, 4.3, 1.5],
        'PetalWidthCm': [0.2, 1.3, 0.2],
    }
).set_index('Id')

new_predictions = best_model.predict(new_data)
print("Predictions for new data:", new_predictions)

Predictions for new data: ['Iris-setosa' 'Iris-versicolor' 'Iris-setosa']
