In [1]:
import pandas as pd

# 创建一个包含类别数据的数据框
data = {'color': ['red', 'green', 'blue', 'red', 'green']}
df = pd.DataFrame(data)

# 查看原始数据框
print('Original DataFrame:')
print(df)

# 使用 pandas 的 get_dummies 函数进行独热编码
df_encoded = pd.get_dummies(df)

# 查看编码后的数据框
print('Encoded DataFrame:')
print(df_encoded)

Original DataFrame:
   color
0    red
1  green
2   blue
3    red
4  green
Encoded DataFrame:
   color_blue  color_green  color_red
0           0            0          1
1           0            1          0
2           1            0          0
3           0            0          1
4           0            1          0


In [2]:
pip install scikit-learn

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/, http://mirrors.aliyun.com/pypi/simple/, http://pypi.douban.com/simple, http://pypi.mirrors.ustc.edu.cn/simple/Note: you may need to restart the kernel to use updated packages.



In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np

# 生成一些样本数据
np.random.seed(0)
X = np.random.randn(100, 1)
y = 3 * X.squeeze() + 2 + np.random.randn(100)

# 创建一个线性回归模型
model = LinearRegression()

# 使用交叉验证来估计模型在新数据上的表现
scores = cross_val_score(model, X, y, cv=5)

# 输出结果
print(scores)

[0.7661709  0.95014867 0.89169925 0.85918076 0.87250909]


In [2]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 加载数据集
iris = load_iris()
X = iris.data
y = iris.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y)

# 调整参数并训练模型
clf = LogisticRegression(fit_intercept=True, C=1e15)
clf.fit(X_train, y_train)

# 预测结果并评估模型性能
y_pred = clf.predict(X_test)
score = clf.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.9473684210526315


In [6]:
y_pred

array([2, 2, 2, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 2, 0, 2, 1, 2, 2, 2, 1,
       1, 2, 0, 1, 0, 0, 2, 0, 0, 2, 1, 1, 1, 2, 1, 1])

In [7]:
# 极大似然估计法求解逻辑回归模型中的参数

import numpy as np
from scipy.optimize import minimize

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def likelihood_function(beta, X, y):
    p = sigmoid(np.dot(X, beta))
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

# 生成模拟数据
np.random.seed(0)
N = 100
X = np.random.randn(N, 2)
beta_true = np.array([1.0, -2.0])
y = (np.random.rand(N) < sigmoid(np.dot(X, beta_true))).astype(int)

# 求解参数
beta_initial = np.zeros(X.shape[1])
result = minimize(likelihood_function, beta_initial, args=(X,y), method='BFGS')
beta_hat = result.x

print("True parameters:", beta_true)
print("Estimated parameters:", beta_hat)

True parameters: [ 1. -2.]
Estimated parameters: [ 0.92896407 -1.7705747 ]


In [8]:
# 使用sklearn库中的LogisticRegression类进行逻辑回归
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 加载数据
iris = load_iris()
X = iris.data
y = iris.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y)

# 建立模型并拟合数据
clf = LogisticRegression()
clf.fit(X_train, y_train)

# 评估模型性能
print('Training score:', clf.score(X_train, y_train))
print('Testing score:', clf.score(X_test, y_test))

Training score: 0.9642857142857143
Testing score: 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
y_train

array([1, 0, 0, 2, 0, 0, 1, 0, 2, 1, 0, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 1, 1, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 1, 1, 2, 2,
       2, 1, 0, 2, 0, 1, 1, 2, 0, 1, 0, 1, 0, 1, 1, 1, 2, 1, 0, 2, 0, 1,
       2, 1, 2, 1, 0, 0, 2, 2, 1, 1, 0, 2, 1, 2, 1, 2, 2, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 2, 2, 0, 1, 0, 1, 1, 1, 0, 2, 0, 1, 1, 2, 0,
       2, 2])

In [10]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)
print(clf.predict(X[:2, :]))
print(clf.predict_proba(X[:2, :]))

[0 0]
[[9.81810819e-01 1.81891663e-02 1.44226885e-08]
 [9.71750652e-01 2.82493178e-02 3.01583887e-08]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# 加载数据集
X, y = load_iris(return_X_y=True)

# 创建逻辑回归模型
clf = LogisticRegression(random_state=0, multi_class='ovr')

# 训练模型
clf.fit(X, y)

# 预测新数据
print(clf.predict([[5.0, 3.5, 1.3, 0.3]]))

[0]


In [12]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# 加载数据集
X, y = load_iris(return_X_y=True)

# 创建逻辑回归模型
clf = LogisticRegression(random_state=0, multi_class='multinomial')

# 训练模型
clf.fit(X, y)

# 预测新数据
print(clf.predict([[5.0, 3.5, 1.3, 0.3]]))

[0]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# 训练数据
X_train = ['这是一篇关于体育的文章', '这是一篇关于政治的文章', '这是一篇关于科技的文章']
y_train = ['体育', '政治', '科技']

# 测试数据
X_test = ['这是一篇关于政治的文章']

# 创建管道，包括特征提取和分类器
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

# 训练模型
text_clf.fit(X_train, y_train)

# 预测测试数据类别
predicted = text_clf.predict(X_test)
print(predicted) # 输出: ['政治']

['政治']


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

clf = RandomForestClassifier(n_estimators=10)
clf.fit(X, y)

print(clf.predict([[5.0, 3.6, 1.4, 0.2]]))

[0]


In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# 生成一些数据
X, y = make_classification(n_features=4, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)

# 创建并训练 AdaBoost 分类器
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X, y)

# 对新数据进行预测
print(clf.predict([[0, 0, 0, 0]]))

[1]


In [26]:
X

array([[-1.30653407e+00, -1.85064099e+00,  8.95623122e-01,
         7.63804802e-02],
       [-1.67419581e+00,  1.13872896e+00,  8.57296491e-01,
         8.25029824e-01],
       [-1.89148284e-03,  1.06821044e+00,  7.73703613e-01,
        -1.39662042e+00],
       [-2.79308500e+00,  1.10863359e+00,  1.16755486e+00,
         1.93752881e+00],
       [-3.44987210e-01,  7.71514409e-01,  3.45715997e-01,
        -2.30839743e-01],
       [-7.01344426e-01, -1.11469020e+00,  1.13545112e+00,
        -5.37223024e-01],
       [-2.36436952e-01, -9.33557290e-01,  9.97855163e-01,
         7.27813500e-01],
       [-4.75372875e-01, -8.73535822e-01,  8.71225789e-01,
         4.77610182e-01],
       [-3.53679249e-02, -1.47533915e+00,  8.61462558e-01,
         2.11060505e+00],
       [-1.23685338e+00,  9.64350564e-01,  1.77547698e+00,
         8.75838928e-01],
       [ 3.67231814e-01, -1.01366961e+00,  4.73307772e-01,
         1.23289919e+00],
       [ 1.55880554e+00,  1.08480038e+00,  1.64661853e+00,
      

In [27]:
y

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1])