In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# 1) Loading the dataset

In [13]:
# Load the iris example dataset through seaborn; the result is a pandas DataFrame.
df = sns.load_dataset('iris')

In [14]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [15]:
# Check dtypes and non-null counts; `species` is still a string (object) column at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


# 2) Statistical summary of the dataset

In [16]:
# Summary statistics (count, mean, std, quartiles) for the four numeric measurement columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [19]:
from sklearn.preprocessing import LabelEncoder

# Replace the species names with integer class ids. The fitted encoder is
# kept under its own name so the learned name <-> id mapping stays available.
encoder = LabelEncoder().fit(df['species'])
df['species'] = encoder.transform(df['species'])

In [20]:
# Segregating the independent (X) and dependent (y) features.
# `species` is the last column, so selecting by name is equivalent to the
# positional split and does not depend on column order.
X = df.drop(columns='species')
y = df['species']

In [21]:
# Feature matrix: the four measurement columns, with the target excluded.
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [22]:
# Target vector: the integer-encoded species labels.
y.head(2)

0    0
1    0
Name: species, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3) Segregating the categorical and numerical features

In [24]:
# Re-inspect dtypes after label encoding: `species` is now int64, so every column is numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [25]:
# Derive the column groups from the feature dtypes instead of hard-coding
# them, so the lists stay correct if the feature set changes.
# For iris this yields no categorical columns and the four float measurements
# ('sepal_length', 'sepal_width', 'petal_length', 'petal_width').
categorical_col = X.select_dtypes(exclude='number').columns.tolist()
numerical_col = X.select_dtypes(include='number').columns.tolist()


# 4) Feature Engineering Automation

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer         # Handling missing values
from sklearn.preprocessing import StandardScaler # Standard scaling of numeric data
from sklearn.preprocessing import OneHotEncoder  # Categorical -> numerical indicator columns
from sklearn.compose import ColumnTransformer    # Routes categorical / numerical columns to their pipelines

# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. This step encodes categories; it does not scale anything.
# handle_unknown='ignore' keeps `transform` from raising when the test data
# contains a category that was not present when the encoder was fitted.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Column transformer: apply each pipeline to its own group of columns.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_col),
        ('cat_pipeline', cat_pipeline, categorical_col),
    ]
)

In [28]:
# Learn the imputation / scaling statistics from the training split only and
# reuse them on the test split, so no test-set information leaks into the fit.
# NOTE(review): both names are overwritten with the transformed output, so
# re-running this cell without re-running the train/test split feeds already
# transformed data back into the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# 5) Model Training Automation

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by the name shown in the evaluation report.
# The tree-based models are randomised, so they are seeded (same seed as the
# train/test split) to give identical scores on every Restart & Run All.
models = {
    'RandomForestClassifier' : RandomForestClassifier(random_state=42),
    'LogisticRegression' : LogisticRegression(),
    'SVC' : SVC(),
    'DecisionTreeClassifier' : DecisionTreeClassifier(random_state=42)
}

# 6) Model Evaluation

In [34]:
from sklearn.metrics import accuracy_score
def evaluate_model(X_train, X_test, y_train, y_test, models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature data, passed unchanged to each model's ``fit`` / ``predict``.
    y_train, y_test
        Target labels for the training and test splits.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        ``{name: accuracy}`` in the same order as ``models``; empty when
        ``models`` is empty.
    """
    report = {}
    # Iterate name/estimator pairs directly instead of rebuilding the key and
    # value lists on every pass.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_pred)
    return report

In [35]:
# Fit each candidate model on the preprocessed training split and report its test accuracy.
evaluate_model(X_train, X_test, y_train, y_test, models)

{'RandomForestClassifier': 1.0,
 'LogisticRegression': 1.0,
 'SVC': 1.0,
 'DecisionTreeClassifier': 1.0}

# Observation :-
#### Here we clearly see that all four models reach 100% accuracy on the test set. A perfect score on such a small test set (30 rows) should be treated with caution: it can hide overfitting, and it is too coarse to tell the models apart.

#### Let's investigate this by performing hyperparameter tuning with cross-validation.

# 7) Hyperparameter Tuning

In [36]:
# Baseline random forest with default hyperparameters; it also serves as the
# estimator template for the hyperparameter search.
# Seeded (same seed as the train/test split) so the fitted forest is
# reproducible across kernel restarts.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train,y_train)

In [37]:
# Search space for the random forest: tree depth limit, forest size and
# split-quality criterion.
params = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[10, 50, 100, 200, 300],
    criterion=['gini', 'entropy'],
)

In [38]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised search: sample 10 hyperparameter combinations from `params` and
# score each with 5-fold cross-validation on the training split.
# random_state pins which combinations are sampled, so `best_params_` is the
# same on every run instead of changing between executions.
cv = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=10,
    cv=5,
    verbose=3,
    random_state=42,
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=0.958 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=0.958 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=0.833 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=1.000 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=50;, score=0.958 total time=   0.1s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.958 total time=   0.6s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.958 total time=   0.5s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.833 total time=   0.6s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=1.000 total time=   0.3s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.958 total time=  

In [39]:
# Hyperparameter combination with the best mean cross-validated score found by the search.
cv.best_params_

{'n_estimators': 50, 'max_depth': None, 'criterion': 'gini'}

In [40]:
# Build the tuned model from the parameters the search actually selected,
# rather than hand-copied values that can drift from `cv.best_params_`.
# Seeded so the refit is reproducible.
classifier2 = RandomForestClassifier(**cv.best_params_, random_state=42)
classifier2.fit(X_train, y_train)

In [41]:
from sklearn.metrics import accuracy_score

# Score the *tuned* model (`classifier2`) on the held-out test split.
# Predicting with `classifier` here would re-report the untuned baseline.
y_pred = classifier2.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

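# Conclusion :
### After hyperparameter tuning, the model's accuracy on the test set is still 100%, so the model predicts this test split accurately. Note that the 5-fold cross-validation scores logged during the search range from about 0.83 to 1.00, so 100% on a 30-row test set should be read as a favourable estimate rather than a guarantee.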