<a href="https://www.kaggle.com/code/shreeyashah/iris-dataset-logisticregression?scriptVersionId=283041033" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Logistic Regression with Hyperparameter Tuning

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line in os.walk's traversal order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

## Importing the data

In [None]:
# Location of the Iris CSV inside the read-only Kaggle input directory.
iris_csv_path = '/kaggle/input/iris/Iris.csv'

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(iris_csv_path)
df.head()

In [None]:
# Remove the identifier column by name rather than by position. The previous
# positional slice (df.iloc[:, 1:]) discarded one more column every time this
# cell was re-run; a name-based drop with errors='ignore' is a no-op once the
# column is gone, so the cell can be re-executed safely.
# NOTE(review): assumes the leading column of Iris.csv is named 'Id' - confirm
# against the df.head() output of the loading cell.
df = df.drop(columns='Id', errors='ignore')
df.head()

In [None]:
# Features are every column except the last one; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

## Exploratory data analysis

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Draw one box per column to check for outliers, then title the axes that
# seaborn returns instead of going through the pyplot state machine.
ax = sns.boxplot(data=df, palette='PuRd')
ax.set_title('Outlier Detection')
plt.show()

In [None]:
# Bar chart of how many rows belong to each species (class-balance check).
ax = sns.countplot(x='Species', data=df, palette='PuRd')
ax.set_title('Distribution of Species')
plt.show()

In [None]:
# sns.pairplot returns a grid object that owns a whole matrix of axes.
# plt.title() only labels the current axes (a single panel of that matrix),
# so the title is set on the figure itself; y slightly above 1 places it just
# over the top row of panels instead of on top of them.
pair_grid = sns.pairplot(data=df, hue='Species', palette='PuRd')
pair_grid.figure.suptitle('Pair Plot', y=1.02)
plt.show()

## Encoding the output column

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers. The labels are read from the last
# column of df rather than from the current value of y, so re-running this
# cell does not re-fit the encoder on already-encoded integers (which would
# replace the original label names stored in le.classes_ with numbers and
# break the later uses of le.classes_ as display names).
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, -1])

## Splitting data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

## Training the model

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so the test data
# never influences the scaling parameters.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# Candidate values shared by several of the sub-grids below.
regularization_strengths = [0.01, 0.1, 1, 10]
general_solvers = ['lbfgs', 'newton-cg', 'sag', 'saga']

# One sub-grid per penalty type, so that each penalty is only paired with the
# solvers (and extra parameters) listed for it.
param_grid = [
    # L2 penalty: tried with every solver in the list.
    dict(penalty=['l2'], solver=general_solvers, C=regularization_strengths),
    # L1 penalty: paired with 'saga' only.
    dict(penalty=['l1'], solver=['saga'], C=regularization_strengths),
    # Elastic net: 'saga' only, with an additional L1/L2 mixing ratio.
    dict(
        penalty=['elasticnet'],
        solver=['saga'],
        C=regularization_strengths,
        l1_ratio=[0.2, 0.5, 0.8],
    ),
    # No penalty: a single C value is enough because C is not varied here.
    dict(penalty=[None], solver=general_solvers, C=[1.0]),
]

In [None]:
# 5-fold stratified cross-validation; shuffling uses a fixed seed so the folds
# are the same on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# multi_class is intentionally not passed: the parameter is deprecated since
# scikit-learn 1.5, and its default ('auto') already selects the multinomial
# formulation for every solver used in param_grid (none of them is
# 'liblinear').
# random_state seeds the data shuffling performed by the 'sag' and 'saga'
# solvers, so the cross-validation scores - and therefore the selected
# parameters - are reproducible between runs.
base_clf = LogisticRegression(max_iter=5000, random_state=42)

# Exhaustive search over param_grid scored by accuracy; refit=True retrains
# the best configuration on the whole training split once the search is done.
gs = GridSearchCV(
    estimator=base_clf,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    refit=True
)

gs.fit(X_train, y_train)

print("Best CV accuracy:", round(gs.best_score_, 4))
print("Best params:", gs.best_params_)

In [None]:
# Score the refitted best model on the held-out test split.
best_clf = gs.best_estimator_
y_pred = best_clf.predict(X_test)

# Compute both summaries first, then print them.
test_accuracy = round(accuracy_score(y_test, y_pred), 4)
report_text = classification_report(y_test, y_pred, target_names=le.classes_)

print("\nTest accuracy:", test_accuracy)
print("\nClassification report:\n")
print(report_text)

In [None]:
# Confusion matrix of test-set predictions, drawn as an annotated heatmap on
# an explicitly created axes.
cm = confusion_matrix(y_test, y_pred)
class_names = le.classes_

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='PuRd',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

## Conclusion

**The grid search selected logistic regression with `C=10`, `penalty='l2'` and `solver='saga'` as the best-performing configuration for this dataset.**