In [22]:
import numpy as np
import pandas as pd

In [23]:
from sklearn.datasets import load_iris

In [24]:
data = load_iris()

In [25]:
for lines in data['DESCR'].split('\n'):
    print(lines)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [26]:
x = data['data']

y = data['target']

In [27]:
data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [28]:
df = pd.DataFrame(x,columns = data.feature_names)
df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [29]:
df['Class'] = y
df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [31]:
for i in df.columns:
    q1 = np.quantile(df[i],0.25)
    q3 = np.quantile(df[i],0.75)

    iqr = q3-q1

    lb = q1 -  iqr*1.5
    ub = q3 +  iqr*1.5

    outlair = np.where(((df[i]<lb)|(df[i]>ub)))
    for index in outlair[0]:
        df[i].iloc[index]=None

df.head()


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[i].iloc[index]=None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i].iloc[index]=None
You are setting v

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [32]:
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     4
petal length (cm)    0
petal width (cm)     0
Class                0
dtype: int64

In [33]:
df.dropna(inplace=True)

In [34]:
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
Class                0
dtype: int64

In [35]:
x = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [78]:
from sklearn.model_selection import train_test_split as spliter

x_train,x_test,y_train,y_test  = spliter(x,y,train_size=0.8,random_state=100)

In [79]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(x_train,y_train)

In [80]:
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

In [81]:
from sklearn.metrics import confusion_matrix,accuracy_score

print("Training Evaluation")
print(confusion_matrix(y_train,y_train_pred))
print(accuracy_score(y_train,y_train_pred))

Training Evaluation
[[36  0  0]
 [ 0 38  2]
 [ 0  1 39]]
0.9741379310344828


In [82]:
print("Testing Evaluation")
print(confusion_matrix(y_test,y_test_pred))
print(accuracy_score(y_test,y_test_pred))

Testing Evaluation
[[11  0  0]
 [ 0  8  1]
 [ 0  0 10]]
0.9666666666666667


In [83]:
import pickle

pickle.dump(model,open('iris_model.pkl','wb'))