In [33]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
df=sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

### BINARY CLASSIFICATION

In [4]:
df=df[df['species']!='setosa']

In [5]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [6]:
df['species']=df['species'].map({'versicolor':0,'virginica':1})

In [7]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,0
51,6.4,3.2,4.5,1.5,0
52,6.9,3.1,4.9,1.5,0
53,5.5,2.3,4.0,1.3,0
54,6.5,2.8,4.6,1.5,0


Whether flower is virginica or not?

In [8]:
X = df.drop(['species'],axis=1)
y = df['species']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=42)

In [10]:
X_train.shape

(70, 4)

In [11]:
model=LogisticRegression()

In [12]:
model.fit(X_train,y_train)

In [13]:
model.predict(X_test)

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1], dtype=int64)

In [14]:
model.score(X_test,y_test)

0.9333333333333333

In [15]:
X_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
133,6.3,2.8,5.1,1.5
103,6.3,2.9,5.6,1.8
120,6.9,3.2,5.7,2.3
95,5.7,3.0,4.2,1.2
94,5.6,2.7,4.2,1.3


In [16]:
model.predict_proba(X_test)

array([[0.45661539, 0.54338461],
       [0.10864404, 0.89135596],
       [0.04365498, 0.95634502],
       [0.9452204 , 0.0547796 ],
       [0.92043386, 0.07956614],
       [0.9456342 , 0.0543658 ],
       [0.55136873, 0.44863127],
       [0.03137488, 0.96862512],
       [0.98823157, 0.01176843],
       [0.82269875, 0.17730125],
       [0.75104396, 0.24895604],
       [0.97728003, 0.02271997],
       [0.42389967, 0.57610033],
       [0.37321513, 0.62678487],
       [0.04182879, 0.95817121],
       [0.77614273, 0.22385727],
       [0.49836177, 0.50163823],
       [0.45104676, 0.54895324],
       [0.96924801, 0.03075199],
       [0.98582108, 0.01417892],
       [0.00641058, 0.99358942],
       [0.51220217, 0.48779783],
       [0.72864048, 0.27135952],
       [0.96085659, 0.03914341],
       [0.42235438, 0.57764562],
       [0.90163725, 0.09836275],
       [0.8837622 , 0.1162378 ],
       [0.28968429, 0.71031571],
       [0.95022136, 0.04977864],
       [0.00553598, 0.99446402]])

In [17]:
y_pred = model.predict(X_test)

In [18]:
confusion_matrix(y_test,y_pred)

array([[16,  1],
       [ 1, 12]], dtype=int64)

In [19]:
lgreg=LogisticRegression()

In [26]:
# GRidSearchCV - Improve model through hypoparameters to remove overfitting problem. Used to give best hyperpatrametr to 
# the model instead of manually giving L1 in one line than L2
# Hyper parameters(penalty and all) are used to improve model accuracy and effciency while parameters helps in 
# prediction, used to train a model(sepal_width and all)

parameter = {'penalty':['l1','l2'],'C':[1,2.0,3.0,4.0,5.0],'max_iter':[100,200,300,400,500,600]}
# by default uses l2 regularization
# l1 and l2 are max likelihood function/ regularization technique
# max_iter we give values to reach local minima
# C are penalty rates, change them to get best values

In [27]:
grid_class=GridSearchCV(lgreg,param_grid=parameter,scoring='accuracy',cv=5)
# Grid search CV - used to give combination of hyperparameters rather than 1. 
# Used to find optimal values by hyperparameter tuning
# its okay if you dont give CV

In [28]:
grid_class.fit(X_train,y_train)

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Lenovo\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Lenovo\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Lenovo\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.97142857 

In [29]:
print(grid_class.best_params_)

{'C': 1, 'max_iter': 100, 'penalty': 'l2'}


In [30]:
print(grid_class.best_score_)

0.9714285714285715


In [31]:
y_pred=grid_class.predict(X_test)

### MULTICLASS CLASSIFICATION

In [51]:
df1=sns.load_dataset('iris')
df1.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [52]:
df1['species']=df1['species'].map({'versicolor':0,'virginica':1,'setosa':2})

In [53]:
XX = df1.drop(['species'],axis=1)
yy = df1['species']

In [54]:
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size = 0.3,random_state=42)

In [55]:
XX_train.shape

(105, 4)

In [56]:
model=LogisticRegression()

In [57]:
model.fit(XX_train,yy_train)

In [58]:
model.predict(XX_test)

array([0, 2, 1, 0, 0, 2, 0, 1, 0, 0, 1, 2, 2, 2, 2, 0, 1, 0, 0, 1, 2, 1,
       2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2, 1, 0, 2, 2, 2, 1, 0, 0, 2,
       2], dtype=int64)

In [59]:
model.score(XX_test,yy_test)

1.0

In [60]:
XX_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [61]:
model.predict_proba(XX_test)

array([[8.12291178e-01, 1.83606736e-01, 4.10208642e-03],
       [5.79663777e-02, 5.80105033e-07, 9.42033042e-01],
       [2.09418668e-03, 9.97905797e-01, 1.58665132e-08],
       [7.73233773e-01, 2.19940972e-01, 6.82525506e-03],
       [7.51298153e-01, 2.47106340e-01, 1.59550636e-03],
       [4.99073496e-02, 4.58693716e-07, 9.50092192e-01],
       [9.04139542e-01, 1.80556712e-02, 7.78047866e-02],
       [1.47492440e-01, 8.52327659e-01, 1.79901334e-04],
       [7.80608953e-01, 2.16967314e-01, 2.42373284e-03],
       [9.40807075e-01, 3.01839271e-02, 2.90089976e-02],
       [2.25322602e-01, 7.74203912e-01, 4.73486419e-04],
       [3.72637051e-02, 1.84136754e-07, 9.62736111e-01],
       [3.07945944e-02, 9.66015328e-08, 9.69205309e-01],
       [4.39404973e-02, 2.65387583e-07, 9.56059237e-01],
       [2.34553715e-02, 1.89583953e-07, 9.76544439e-01],
       [6.63333544e-01, 3.31899594e-01, 4.76686200e-03],
       [2.67291369e-02, 9.73261194e-01, 9.66925720e-06],
       [9.45150660e-01, 2.71499

In [62]:
yy_pred = model.predict(XX_test)

In [63]:
confusion_matrix(yy_test,yy_pred)

array([[13,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 19]], dtype=int64)

In [77]:
lgreg1=LogisticRegression(multi_class='ovr',solver='liblinear')

In [78]:
parameters = {'penalty':['l1','l2'],'C':[1,2.0,3.0,4.0,5.0],'max_iter':[100,200,300,400,500,600]}

In [79]:
grid_class1=GridSearchCV(lgreg1,param_grid=parameters,scoring='accuracy',cv=5)

In [80]:
grid_class1.fit(XX_train,yy_train)



In [81]:
print(grid_class1.best_params_)

{'C': 1, 'max_iter': 100, 'penalty': 'l1'}


In [82]:
print(grid_class1.best_score_)

0.9428571428571428
