In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
sns.set()
colors = sns.color_palette("husl")

## 基本使用

In [2]:
X = np.random.random(size=(10,4))
y = np.random.randint(0,2,size=10)

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
lr = LogisticRegression()
lr.fit(X,y)

LogisticRegression()

### 特征系数

In [5]:
lr.coef_

array([[ 0.08433738, -0.72127619,  0.52129521,  0.34242893]])

In [6]:
lr.predict(X)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

### 预测结果的概率

In [7]:
lr.predict_proba(X)

array([[0.63265013, 0.36734987],
       [0.52561221, 0.47438779],
       [0.47392214, 0.52607786],
       [0.63314558, 0.36685442],
       [0.58184723, 0.41815277],
       [0.71382006, 0.28617994],
       [0.55565724, 0.44434276],
       [0.58888641, 0.41111359],
       [0.65236125, 0.34763875],
       [0.64209985, 0.35790015]])

In [8]:
threshold = 0.5
(lr.predict_proba(X)[:,1] > threshold).astype(np.int8)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

## 手写数字识别

In [9]:
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [10]:
digits = load_digits()

data = digits.data
target = digits.target

In [11]:
X_train,X_test,y_train,y_test = train_test_split(data,target,test_size=0.2,random_state=1)

In [12]:
lr = LogisticRegression(max_iter=10000)
knn = KNeighborsClassifier(weights="distance")
lr_y_ = cross_val_score(lr,data,target,cv=5)
knn_y_ = cross_val_score(knn,data,target,cv=5)
lr.fit(X_train,y_train)

LogisticRegression(max_iter=10000)

In [13]:
print(f"lr:{lr_y_}\nknn:{knn_y_}\ntype:{type(lr_y_)}")
pd.DataFrame(data={
    "LR":lr,
    "KNN":knn
},index = [0,1,2,3,4]).plot()

lr:[0.925      0.875      0.93871866 0.93314763 0.89693593]
knn:[0.95277778 0.95555556 0.96657382 0.98050139 0.96100279]
type:<class 'numpy.ndarray'>


TypeError: no numeric data to plot

### 调优

In [None]:
# C
lr1 = LogisticRegression(penalty="l2",C=0.1,max_iter=10000)
lr1.fit(X_train,y_train)
print(f"lr:{lr.score(X_test,y_test)}\nlr1:{lr1.score(X_test,y_test)}")

### 网格搜索调参

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
lr = LogisticRegression(max_iter=1000)

In [None]:
# 要搜索的参数的集合
parm_grid = {
    "penalty":["l1","l2"],
    "C":[0.1,0.5,1,5,10]
}

# 构造网格搜索对象
gscv = GridSearchCV(estimator=lr,param_grid=parm_grid,cv=5)

# 开始搜索
gscv.fit(data,target)

In [None]:
# 获取最best_params_(gscv.best_params_)
print(gbest_estimator_t_estimator_)
print(gscv.predict(X_test))

In [None]:
best_model = gscv.best_estimator_
best_model.fit(X_train,y_train)
best_model.score(X_test,y_test)