In [3]:
# The results of this cell are explained as comments in the next cell
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd

# Build the iris table: one column per feature, with the target appended as 'species'
data = load_iris()
iris_table = pd.DataFrame(data.data, columns=data.feature_names).assign(
    species=data.target
)
# Persist the table so the rest of the cell works from a CSV file
iris_table.to_csv("iris.csv", index=False)

# Preprocess the Data
# Re-read the dataset from the CSV that was just written
df = pd.read_csv("iris.csv")

# Report how many missing values each column contains
missing_per_column = df.isnull().sum()
print("Missing values in dataset:\n", missing_per_column)

# Separate the features from the target
# NOTE(review): 'species' appears to be an encoded class label, so the
# regressors below treat a categorical target as a continuous quantity;
# classifiers may be a better fit — confirm the intent.
X = df.drop('species', axis=1)  # Features
y = df['species']  # Target variable

# Split data into training and testing sets
# The split happens BEFORE scaling so the held-out rows never contribute to
# the scaler's mean/variance. Fitting the scaler on the full dataset would
# leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

# Standardize the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any later cell referring to X_scaled keeps working; it is the
# full feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Model Selection
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=5)
}

# Train and Evaluate Each Model on the Test Set
# Maps model name -> Mean Squared Error on the held-out test set
test_results = {}

for name, model in models.items():
    # Train the model on the training set
    model.fit(X_train, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test)
    # Evaluate Mean Squared Error
    test_mse = mean_squared_error(y_test, y_pred)
    test_results[name] = test_mse
    print(f"{name} - Test set Mean Squared Error: {test_mse:.4f}")


# Pick the entry with the smallest test-set MSE; on ties the first one
# inserted into test_results wins.
best_model_name, _lowest_mse = min(test_results.items(), key=lambda entry: entry[1])
best_model = models[best_model_name]
print(f"\nBest model based on test set: {best_model_name}")

Missing values in dataset:
 sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64
Linear Regression - Test set Mean Squared Error: 0.0723
Ridge Regression - Test set Mean Squared Error: 0.0696
Random Forest Regressor - Test set Mean Squared Error: 0.0542

Best model based on test set: Random Forest Regressor


In [None]:
# Missing values in dataset:
# sepal length (cm)    0
# sepal width (cm)     0
# petal length (cm)    0
# petal width (cm)     0
# species              0
# dtype: int64
# Every count is 0, which means no column has empty or incomplete values

# Performance of each model, measured as Mean Squared Error: the sum of (predicted value - actual value)^2 divided by the number of predictions:
# Linear Regression - Test set Mean Squared Error: 0.0723
# Ridge Regression - Test set Mean Squared Error: 0.0696
# Random Forest Regressor - Test set Mean Squared Error: 0.0542

# Best model based on test set: Random Forest Regressor

# The best model to use (based on this test) is the Random Forest Regressor, since it has the lowest test-set Mean Squared Error