In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [17]:
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [18]:
##Checking all the Unique classes
df['species'].unique()
# df['species'].value_counts() we can also do this with this

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [19]:
#Checking for any null values
df.isnull().any(axis=0)

sepal_length    False
sepal_width     False
petal_length    False
petal_width     False
species         False
dtype: bool

In [20]:
## Since Logistic regression works for binary classification and we have here more than to classes as we can see above 
# So lets remove all the rows containing setosa calass and then rest the index
df = df[df['species']!='setosa']
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [21]:
##Encoding our species Classes 
##Versicolor = 0
##virginica = 1

##First Method -- df['species'] = np.where(df['species'].str.contains('versicolor'),1,0)

##Second Method
df['species'] = df['species'].map({'versicolor':0,'virginica':1})
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,0
51,6.4,3.2,4.5,1.5,0
52,6.9,3.1,4.9,1.5,0
53,5.5,2.3,4.0,1.3,0
54,6.5,2.8,4.6,1.5,0


In [22]:
## Splitting the dataset into Independent and Dependent Features
X = df[['sepal_length','sepal_width','petal_length','petal_width']]
Y = df['species']
## We can also do it like this
## X = df.iloc[:,:-1] means all the colunms except last colunm
## X = df.iloc[:,-1] means only the last colunms 

In [23]:
## Train-Test-Split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=42)


In [24]:
## Creating the Model
from sklearn.linear_model import LogisticRegression
Logistic_Regression = LogisticRegression(solver='saga',n_jobs=-1)

### Hyperparameter Tuning With GridSearchCV

##### GridSearchCV is a method in scikit-learn used for hyperparameter tuning. It performs an exhaustive search over a specified parameter grid to find the best combination of hyperparameters for a given machine learning model.

#### Key Features of GridSearchCV
##### 1.Exhaustive Search: It evaluates all possible combinations of the specified hyperparameters.

##### 2.Cross-Validation: Uses cross-validation to ensure that the selected hyperparameters generalize well to unseen data.

##### 3.Scoring Metric: Optimizes a specific scoring metric (e.g., accuracy, precision, or mean squared error).

##### 4.Parallel Processing: Can utilize multiple cores for faster computation (via the n_jobs parameter).

#### Steps Involved in GridSearchCV
##### 1.Define the machine learning model to optimize.
##### 2.Specify the hyperparameter grid to search.
##### 3.Use cross-validation to evaluate each hyperparameter combination.
##### 4.Return the best combination of hyperparameters and the corresponding model.

#### Important Parameters
##### 1.estimator: The machine learning model to tune (e.g., LogisticRegression, RandomForestClassifier).

##### 2.param_grid: Dictionary specifying the hyperparameters to tune and their possible values.

##### 3.cv:  Number of cross-validation folds (default is 5).

##### 4.scoring: Metric used for optimization (e.g., 'accuracy', 'f1', 'neg_mean_squared_error').

##### 5.n_jobs: Number of parallel jobs. -1 uses all available cores.

##### 6.refit: If True (default), refits the model using the best parameters on the entire dataset.



#### Output Attributes
##### 1.best_params_: Dictionary of the best hyperparameter values.

##### 2.best_score_: Best cross-validation score achieved.

##### 3.best_estimator_: The model instance with the best hyperparameters.

##### 4.cv_results_: Detailed results of the grid search, including scores for all parameter combinations.

In [25]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
parameter_grid = {"penalty":['l1','l2','elasticnet'],# Regularization Type
                  'C': [1,2,3,4,5,6,10,20,30,40,50],# Regularization strength
                  'max_iter':[100,200,300,500,1000], # No. of iterations to fro the optimization algorithm
                  'l1_ratio': [0.1, 0.5, 0.7, 1.0]  
                  }
# The l1_ratio parameter is used when the penalty is set to 'elasticnet' in logistic regression. It determines the mix of L1 and L2 regularization:

# l1_ratio = 0: Equivalent to L2 regularization (Ridge).
# l1_ratio = 1: Equivalent to L1 regularization (Lasso).
# 0 < l1_ratio < 1: A combination of L1 and L2 regularization.
# In essence, l1_ratio controls the balance between L1 and L2 penalties in the elastic net regularization.

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=Logistic_Regression,param_grid=parameter_grid,cv=5,scoring='accuracy',n_jobs=-1,refit=True)
## For Classification problem we use 'accuraacy' as a scoring parameter

# Perform grid search
grid_search.fit(X_train,Y_train)




In [26]:
# Get the best parameters and best score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')


Best Parameters: {'C': 1, 'l1_ratio': 0.1, 'max_iter': 200, 'penalty': 'l1'}
Best Score: 1.0


In [27]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_

Y_Pred = best_model.predict(X_test)
test_accuracy = best_model.score(X_test, Y_test)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.88


In [28]:
## Acuracy Score
from sklearn.metrics import accuracy_score,classification_report
score = accuracy_score(Y_test,Y_Pred)
print('Accuracy:',score)


Accuracy: 0.88


In [29]:
##Classification Report
report = classification_report(Y_test,Y_Pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.79      0.88        14
           1       0.79      1.00      0.88        11

    accuracy                           0.88        25
   macro avg       0.89      0.89      0.88        25
weighted avg       0.91      0.88      0.88        25



In [30]:

def calculate_mean(data):
    return sum(data)/len(data)


data = [1,2,3,4,5]

mean = calculate_mean(data)
print(mean)

3.0
