## Load data

In [1]:
import pandas as pd

In [2]:
# Read the iris measurements CSV; the path is relative to the notebook's working directory
df = pd.read_csv('./data/iris-data.csv')

In [3]:
# Preview the first five rows to check the column names and value formats
df.head(n=5)

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Summary statistics of the numeric columns.
# NOTE(review): the recorded output shows min sepal_length_cm = 0.055 and a
# petal_width_cm count of 145 (vs 150 for the other columns). The tiny sepal
# length looks like a unit/entry error and is not handled in the visible cells — confirm.
df.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,150.0,150.0,150.0,145.0
mean,5.644627,3.054667,3.758667,1.236552
std,1.312781,0.433123,1.76442,0.755058
min,0.055,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.7,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [5]:
# Column dtypes and non-null counts; used here to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length_cm    150 non-null float64
sepal_width_cm     150 non-null float64
petal_length_cm    150 non-null float64
petal_width_cm     145 non-null float64
class              150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [6]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [7]:
# Re-check the non-null counts to confirm the rows with missing values are gone
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 0 to 149
Data columns (total 5 columns):
sepal_length_cm    145 non-null float64
sepal_width_cm     145 non-null float64
petal_length_cm    145 non-null float64
petal_width_cm     145 non-null float64
class              145 non-null object
dtypes: float64(4), object(1)
memory usage: 6.8+ KB


## Label Encoding

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Count rows per class label to check for inconsistent spellings
df['class'].value_counts()

Iris-virginica     50
Iris-versicolor    45
Iris-setosa        44
versicolor          5
Iris-setossa        1
Name: class, dtype: int64

In [10]:
# Normalise the two misspelled species labels that show up in the value counts
label_fixes = {'versicolor': 'Iris-versicolor', 'Iris-setossa': 'Iris-setosa'}
df['class'] = df['class'].replace(label_fixes)

In [11]:
# Verify the label clean-up: only the three canonical species names should remain
df['class'].value_counts()

Iris-virginica     50
Iris-versicolor    50
Iris-setosa        45
Name: class, dtype: int64

In [12]:
# Encode the three species names as integer class ids.
# - One dict keeps each name next to its code instead of two parallel lists
#   that can silently drift out of alignment.
# - Values that are not keys of the dict pass through unchanged, so running
#   this cell a second time is a no-op.
# - The explicit astype pins the column to int64 rather than relying on
#   replace() to infer/downcast the dtype; it also raises if an unexpected
#   string label is still present.
CLASS_CODES = {'Iris-versicolor': 1, 'Iris-virginica': 2, 'Iris-setosa': 3}
df['class'] = df['class'].map(lambda label: CLASS_CODES.get(label, label)).astype('int64')

In [13]:
# Confirm the class column now holds the integer codes with the same per-class counts
df['class'].value_counts()

2    50
1    50
3    45
Name: class, dtype: int64

## Build Model

In [14]:
# Features: every column except the last one; target: the encoded class column
feature_columns = df.columns[:-1]
X = df[feature_columns]
y = df.loc[:, 'class']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up in each partition
size_report = 'Total Data：{}，Train Data：{}，Test Data：{}'
print(size_report.format(len(X), len(X_train), len(X_test)))

Total Data：145，Train Data：116，Test Data：29


In [17]:
from sklearn.linear_model import LogisticRegression
import warnings

warnings.filterwarnings('ignore')

In [18]:
# Logistic-regression classifier with every hyperparameter left at the library default.
# NOTE(review): the recorded repr shows solver='warn' and multi_class='warn', i.e.
# neither was chosen explicitly, so the fitted model depends on the defaults of the
# installed scikit-learn version — consider pinning them.
lr_model = LogisticRegression()

In [19]:
# Fit on the training split; the cell output is the repr of the fitted estimator
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
# Show the learned parameters: one intercept and one coefficient row per class
fitted_intercepts = lr_model.intercept_
fitted_coefs = lr_model.coef_
print(fitted_intercepts, fitted_coefs)

[ 1.4554607  -1.94265998  0.21477583] [[-0.24901748 -0.83125664  0.98114208 -1.63182733]
 [-0.07419992 -2.32858526  0.94885744  2.70297158]
 [ 0.42012122  1.30132264 -2.10957009 -0.96271683]]


In [21]:
# Mean accuracy of the fitted model on the held-out test split
acc = lr_model.score(X=X_test, y=y_test)
print(acc)

0.896551724137931
