In [1]:
# Importing necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Step 1: Load the Iris dataset and separate the feature matrix from the labels
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # class labels

In [3]:
# Step 2: Hold out 20% of the samples as a test set.
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Step 3: Define a pipeline with data preprocessing and the model.
# Keeping the scaler inside the Pipeline means it is fitted together with the
# classifier whenever the pipeline itself is fitted.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Data scaling
    # The liblinear solver shuffles the data internally, so the seed is pinned
    # (same value as the train/test split) to make repeated runs reproducible.
    # NOTE(review): recent scikit-learn releases reportedly deprecate liblinear
    # for multiclass targets -- confirm against the installed version.
    ('clf', LogisticRegression(solver='liblinear', random_state=42))  # Logistic regression
])

In [5]:
# Number of C values tried; with the 2 penalties this gives 20 * 2 = 40 grid candidates.
np.logspace(start=-4, stop=4, num=20).size

20

In [6]:
# Step 4: Hyperparameter grid for the search.
# Keys use the '<step name>__<parameter>' form so they reach the 'clf' step of the pipeline.
param_grid = {
    # Regularization strength: 20 log-spaced values from 1e-4 to 1e4
    'clf__C': np.logspace(start=-4, stop=4, num=20),
    # Norm used in the penalization
    'clf__penalty': ['l1', 'l2'],
}

In [7]:
# Step 5: Build the exhaustive search over param_grid.
# Every candidate is scored by accuracy using 5-fold cross-validation;
# verbose=1 prints progress while fitting.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

In [8]:
# Step 6: Run the search on the training split only; the test split stays unseen
grid_search.fit(X_train, y=y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [9]:
# Step 7: Report the winning hyperparameter combination and its mean CV accuracy
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Best parameters found:", best_params)
print("Best cross-validated score:", best_cv_score)

Best parameters found: {'clf__C': 11.288378916846883, 'clf__penalty': 'l1'}
Best cross-validated score: 0.9583333333333334


In [10]:
# Step 8: Keep a handle on the pipeline configured with the best parameters found
# (presumably refit on the whole training split -- GridSearchCV's default refit setting; confirm).
best_model = grid_search.best_estimator_

In [11]:
# Step 9: Predict class labels for the held-out test samples
y_pred = best_model.predict(X_test)

In [12]:
# Step 10: Cross-validate the tuned pipeline on the training split (5 folds)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Average CV Score: {cv_scores.mean()}")
print()

# Step 11: Score the test-set predictions made earlier
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy on test set:", test_accuracy)
print("Classification Report:\n", test_report)

Cross-validation scores: [0.95833333 1.         0.875      1.         0.95833333]
Average CV Score: 0.9583333333333334

Accuracy on test set: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


In [13]:
# Step 12: Predict the class of one unseen sample.
# The input is a single row of four feature values; the pipeline scales it
# before it reaches the classifier.
new_data = np.array([
    [5.1, 3.5, 1.4, 0.2],
])
new_prediction = best_model.predict(new_data)
print("Prediction for the new data:", new_prediction)

Prediction for the new data: [0]


In [14]:
# Step 13: Persist the tuned pipeline (scaler + classifier) to disk so it can be reloaded later
joblib.dump(best_model, 'best_logistic_model.pkl')

['best_logistic_model.pkl']

After you save your trained model to a file using joblib.dump, you would typically perform the following steps depending on your project needs:

1. Deployment: If the model's performance is satisfactory, you may deploy it to a production environment where it can start making predictions on new, unseen data. This can involve setting up a REST API, using a model serving platform, or integrating it directly into an application.

1. Monitoring: Once deployed, it's crucial to monitor your model to ensure it maintains performance over time and to check if it's still relevant for the data it's receiving. Monitoring can also help you detect when the model might need retraining.

1. Retraining: As new data becomes available, you might retrain your model periodically with the new data to keep it up to date. This is especially important if the underlying data distribution changes over time (data drift) or if the relationship between the features and the target changes (concept drift).

1. Versioning: You should version-control your model just as you would your code. This means saving a new version of the model each time you retrain, so you can roll back to a previous version if necessary.

1. Documentation: Documenting your model's performance metrics, the hyperparameters used, and any peculiarities noted during training/testing is vital for reproducibility and for future reference.

1. Model Analysis: Sometimes, after deploying a model, you'll want to further analyze what kind of predictions it's making. Techniques like a confusion matrix, ROC curve analysis, or feature importance analysis can provide insight into how your model is operating.

1. Feedback Loop: In many machine learning systems, you'll set up a feedback loop where the model's predictions are evaluated by users or domain experts, and their feedback is used to further improve the model.

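1. Load the Model: When you need to make predictions, you will load the model using `joblib.load` and then call its `predict` or `predict_proba` methods. Only load model files from a source you trust, because loading a pickled file can execute arbitrary code.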



In [15]:
# Restore the pipeline that was written to 'best_logistic_model.pkl'.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_model = joblib.load('best_logistic_model.pkl')

# Predict on a single sample of four feature values (replace this with new data)
new_data = np.array([5.9, 3.0, 5.1, 1.8]).reshape(1, -1)
prediction = loaded_model.predict(new_data)
print(f"The predicted class for the new data is: {prediction}")

The predicted class for the new data is: [2]
