In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the iris measurements (four numeric columns plus the species label) from GitHub.
iris_csv_url = 'https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv'
data = pd.read_csv(iris_csv_url)

In [32]:
# Print column names, non-null counts and dtypes to sanity-check the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [3]:
# Separate predictors from the target: every column except the last is a feature,
# the last column is the label. Both are pulled out as NumPy arrays.
predictor_columns = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
features, label = predictor_columns.values, target_column.values

# Goal

1. Determine the minimum accuracy score (threshold) to expect from this dataset
2. Understand the maximum score that can be achieved on this dataset
3. Learn how to extract the training and testing sample that gives the best score

In [4]:
#Estimate the best and the minimum accuracy score I can expect from the data using LogisticRegression

In [5]:
#1. Initialize the algo
# The estimator is created with its default hyper-parameters; the
# cross-validation cell that follows passes this same object to cross_val_score.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [20]:
#2. Perform Cross Validation
# NOTE(review): the blanket 'ignore' filter silences every Python warning for the
# rest of the kernel session, not just for this cell.
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score

# cv=10 requests ten folds; the result holds one score per fold.
scores = cross_val_score(estimator=model, X=features, y=label, cv=10)
scores

array([1.        , 1.        , 1.        , 0.93333333, 0.93333333,
       0.93333333, 0.8       , 0.93333333, 1.        , 1.        ])

In [21]:
# Summarise the per-fold scores: worst fold, best fold and the mean across folds.
fold_summary = (
    ("To get the minimum score Threshold: ", scores.min()),
    ("To understand what maximum score I can achieve from the dataset: ", scores.max()),
    ("Average Score Expected: ", scores.mean()),
)
for caption, value in fold_summary:
    print(caption, value)

To get the minimum score Threshold:  0.8
To understand what maximum score I can achieve from the dataset:  1.0
Average Score Expected:  0.9533333333333334


In [23]:
#How to extract the best training and testing sample that can give the best score
# KFold Cross Validation
#
# NOTE(review): picking a split because its test score is high makes that test
# score an optimistic estimate; treat the numbers printed here as exploratory.

#1. Initialize the algo
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

#2. Initialize KFold Method
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, #n_splits should be equal to no of cv value in cross_val_score
              random_state=1,
              shuffle=True)

#3. Fit and score every fold, reporting the ones that reach 0.95 test accuracy
# enumerate(..., start=1) numbers the folds from 1 so that the printed
# "Sample Split" value is the 1-based fold number.
for i, (train, test) in enumerate(kfold.split(features), start=1):
    X_train,X_test = features[train],features[test]
    y_train,y_test = label[train],label[test]

    model.fit(X_train,y_train)

    # Evaluate the held-out fold once and reuse the value for both the filter
    # and the report; the train score is only needed for folds that pass.
    test_score = model.score(X_test,y_test)
    if test_score >= 0.95:
        train_score = model.score(X_train,y_train)
        print("Test Score: {}, train score: {}, for Sample Split: {}".format(test_score,train_score,i))
        

Test Score: 1.0, train score: 0.9555555555555556, for Sample Split: 1
Test Score: 1.0, train score: 0.9481481481481482, for Sample Split: 3
Test Score: 1.0, train score: 0.9629629629629629, for Sample Split: 5
Test Score: 1.0, train score: 0.9629629629629629, for Sample Split: 7
Test Score: 1.0, train score: 0.9555555555555556, for Sample Split: 9


In [28]:
#Hack -- To extract the sample
# Rebuild a KFold with the same n_splits / random_state / shuffle as the scan
# cell so the fold numbering lines up, then keep the arrays of one chosen fold.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, #n_splits should be equal to no of cv value in cross_val_score
              random_state=1,
              shuffle=True)
target_split = 5  # 1-based fold number chosen from the scores printed by the scan
for i, (train, test) in enumerate(kfold.split(features), start=1):
    if i == target_split:
        X_train,X_test,y_train,y_test = features[train],features[test],label[train],label[test]
        break  # the remaining folds are not needed once the target is captured
else:
    # Without this guard a missing fold would silently leave whatever
    # X_train/X_test values an earlier cell had put in the kernel.
    raise ValueError("Sample split {} not found; splitter yields {} splits".format(target_split, kfold.get_n_splits()))

In [29]:
#Final Model To Deploy
from sklearn.linear_model import LogisticRegression
finalModel = LogisticRegression()
finalModel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# Score of the final model on the same training sample it was fitted on.
finalModel.score(X_train,y_train)

0.9629629629629629

In [31]:
# Score of the final model on the held-out test sample of the chosen split.
finalModel.score(X_test,y_test)

1.0

In [33]:
#StratifiedShuffleSplit
#How to extract the best training and testing sample that can give the best score with control over test_size parameter
# StratifiedShuffleSplit Cross Validation
#
# NOTE(review): picking a split because its test score is high makes that test
# score an optimistic estimate; treat the numbers printed here as exploratory.

#1. Initialize the algo
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

#2. Initialize StratifiedShuffleSplit Method
from sklearn.model_selection import StratifiedShuffleSplit
ss = StratifiedShuffleSplit(n_splits=10, #n_splits should be equal to no of cv value in cross_val_score
              random_state=1,
              test_size=0.3)

#3. Fit and score every split, reporting the ones that reach 0.95 test accuracy
# The splitter needs the labels as well as the features to stratify.
# enumerate(..., start=1) makes the printed "Sample Split" a 1-based number.
for i, (train, test) in enumerate(ss.split(features,label), start=1):
    X_train,X_test = features[train],features[test]
    y_train,y_test = label[train],label[test]

    model.fit(X_train,y_train)

    # Evaluate the held-out part once and reuse the value for both the filter
    # and the report; the train score is only needed for splits that pass.
    test_score = model.score(X_test,y_test)
    if test_score >= 0.95:
        train_score = model.score(X_train,y_train)
        print("Test Score: {}, train score: {}, for Sample Split: {}".format(test_score,train_score,i))
        

Test Score: 0.9777777777777777, train score: 0.9523809523809523, for Sample Split: 1
Test Score: 0.9555555555555556, train score: 0.9523809523809523, for Sample Split: 4
Test Score: 0.9777777777777777, train score: 0.9523809523809523, for Sample Split: 5
Test Score: 0.9777777777777777, train score: 0.9523809523809523, for Sample Split: 6
Test Score: 0.9777777777777777, train score: 0.9619047619047619, for Sample Split: 7
Test Score: 0.9777777777777777, train score: 0.9523809523809523, for Sample Split: 8


In [34]:
#Hack -- To extract the sample
# Rebuild a StratifiedShuffleSplit with the same n_splits / random_state /
# test_size as the scan cell so the split numbering lines up, then keep the
# arrays of one chosen split.
from sklearn.model_selection import StratifiedShuffleSplit
ss = StratifiedShuffleSplit(n_splits=10, #n_splits should be equal to no of cv value in cross_val_score
              random_state=1,
              test_size=0.3)
target_split = 7  # 1-based split number chosen from the scores printed by the scan
for i, (train, test) in enumerate(ss.split(features,label), start=1):
    if i == target_split:
        X_train,X_test,y_train,y_test = features[train],features[test],label[train],label[test]
        break  # the remaining splits are not needed once the target is captured
else:
    # Without this guard a missing split would silently leave whatever
    # X_train/X_test values an earlier cell had put in the kernel.
    raise ValueError("Sample split {} not found; splitter yields {} splits".format(target_split, ss.get_n_splits()))

In [None]:
#Create final Deployable model

In [None]:
#Use StratifiedShuffleSplit when you need to define the test_size for the dataset

In [35]:
from sklearn.model_selection import train_test_split
# Try random_state values 1..150 for a 70/30 split and report the seeds whose
# test part is classified perfectly.
# NOTE(review): choosing a seed because its test score is perfect makes that
# score an optimistic estimate; treat the list below as exploratory.
for i in range(1,151):
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                    label,
                                                    test_size=0.3,
                                                    random_state=i)
    model = LogisticRegression()
    model.fit(X_train,y_train)

    # Score the test part once; the train score is only computed for the
    # seeds that are actually reported.
    test_score = model.score(X_test,y_test)
    if test_score == 1:
        print("Test: {} Train: {} RS: {}".format(test_score,model.score(X_train,y_train),i))
    

Test: 1.0 Train: 0.9619047619047619 RS: 3
Test: 1.0 Train: 0.9619047619047619 RS: 9
Test: 1.0 Train: 0.9523809523809523 RS: 19
Test: 1.0 Train: 0.9523809523809523 RS: 23
Test: 1.0 Train: 0.9619047619047619 RS: 33
Test: 1.0 Train: 0.9523809523809523 RS: 39
Test: 1.0 Train: 0.9428571428571428 RS: 44
Test: 1.0 Train: 0.9619047619047619 RS: 65
Test: 1.0 Train: 0.9619047619047619 RS: 68
Test: 1.0 Train: 0.9428571428571428 RS: 100
Test: 1.0 Train: 0.9523809523809523 RS: 107
Test: 1.0 Train: 0.9619047619047619 RS: 120
Test: 1.0 Train: 0.9428571428571428 RS: 131
Test: 1.0 Train: 0.9523809523809523 RS: 140
Test: 1.0 Train: 0.9428571428571428 RS: 143
Test: 1.0 Train: 0.9428571428571428 RS: 144
