## 有监督学习和无监督学习
### t-SNE分解

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import manifold
%matplotlib inline

In [None]:
# Fetch version 1 of the OpenML "mnist_784" dataset as a (features, labels) pair.
data = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
# Features become a plain NumPy array; labels are cast to integers.
pixel_values = data[0].to_numpy()
targets = data[1].astype(int)

In [None]:
# Fold the first sample's flat pixel vector into a 28x28 grid and show it in grayscale.
single_img = np.reshape(pixel_values[0], (28, 28))
plt.imshow(single_img, cmap='gray')

In [None]:
# Embed the first 3000 samples into two t-SNE components; the seed is fixed at 42.
tsne = manifold.TSNE(random_state=42, n_components=2)
transformed_data = tsne.fit_transform(pixel_values[:3000])

In [None]:
# Put the two embedding coordinates next to the label of each of the 3000 samples.
tsne_df = pd.DataFrame(
    np.column_stack((transformed_data, targets[:3000])),
    columns=["x", "y", "targets"],
)
# column_stack yields one homogeneous array, so the labels arrive in the same
# dtype as the coordinates. Assign the column directly: on pandas >= 2.0
# `.loc[:, col] = ...` writes into the existing column in place and would keep
# its old dtype, silently discarding the integer cast.
tsne_df["targets"] = tsne_df["targets"].astype(int)

In [None]:
# Scatter the embedding, one colour per label value, and attach a legend.
grid = sns.FacetGrid(data=tsne_df, hue="targets", height=8)
grid.map(plt.scatter, "x", "y")
grid.add_legend()

## 交叉验证
使用红酒质量数据

In [None]:
import pandas as pd
# The red-wine quality file is read with ';' as the field separator.
df = pd.read_csv(
    "data/winequality/winequality-red.csv",
    sep=';',
)

In [None]:
# Re-label the scores 3..8 as consecutive class ids 0..5.
quality_mapping = {score: score - 3 for score in range(3, 9)}
# The mapping is applied to the last column (presumably "quality" - confirm
# against the CSV header).
df.iloc[:, -1] = df.iloc[:, -1].map(quality_mapping)

# Shuffle every row and renumber the index from zero.
df = df.sample(frac=1).reset_index(drop=True)

# First 1000 shuffled rows are used for training, the remainder for testing.
df_train = df.head(1000)
df_test = df.tail(len(df) - 1000)

In [None]:
# Fit a decision tree with scikit-learn.
from sklearn import tree
from sklearn import metrics
clf = tree.DecisionTreeClassifier(max_depth=7)
cols = df.columns
# Every column except the label is used as a feature.
feature_cols = [name for name in cols if name != 'quality']
clf.fit(df_train[feature_cols], df_train["quality"])


In [None]:
# Predict on both splits with the fitted tree.
train_predictions = clf.predict(df_train[feature_cols])
test_predictions = clf.predict(df_test[feature_cols])

# Accuracy on the training split.
train_accuracy = metrics.accuracy_score(
    y_true=df_train.quality, y_pred=train_predictions
)
display(train_accuracy)

# Accuracy on the test split.
test_accuracy = metrics.accuracy_score(
    y_true=df_test.quality, y_pred=test_predictions
)
display(test_accuracy)

演示过拟合

In [None]:
# 绘制图像，显示max_depth和结果准确度的关系
from sklearn import tree
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Use label size 20 for the tick labels on both axes.
matplotlib.rc("xtick", labelsize=20)
matplotlib.rc("ytick", labelsize=20)

%matplotlib inline

# Tree depths to sweep; the curves below are plotted against these values.
depths = range(1, 25)

# The feature list does not depend on the depth, so build it once.
feature_cols = [col for col in df.columns if col != 'quality']

# One measured accuracy per depth. No placeholder entry is stored, so every
# plotted point comes from a fitted model.
train_accuracies = []
test_accuracies = []

for depth in depths:
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf.fit(df_train[feature_cols], df_train.quality)
    train_predictions = clf.predict(df_train[feature_cols])
    test_predictions = clf.predict(df_test[feature_cols])

    # Accuracy on the training split.
    train_accuracy = metrics.accuracy_score(
        df_train.quality, train_predictions
    )
    # Accuracy on the test split.
    test_accuracy = metrics.accuracy_score(
        df_test.quality, test_predictions
    )
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

plt.figure(figsize=(10, 5))
sns.set_style("whitegrid")
# Pass the depths explicitly so the x position of each point is its max_depth.
plt.plot(depths, train_accuracies, label="train accuracy")
plt.plot(depths, test_accuracies, label="test accuracy")
plt.legend(loc="upper left", prop={'size': 15})
plt.xticks(range(0, 26, 5))
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy", size=20)
plt.show()

### 分层k折交叉验证

In [None]:
# stratified-kfold for regression
# 为回归问题进行分层K-折交叉验证

# 导入需要的库
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection

# 创建分折（folds）的函数
def create_folds(data, n_splits=5, target_col="target", random_state=None):
    """Assign a stratified k-fold index to each row of a regression dataset.

    The continuous target is cut into equal-width bins (bin count from
    Sturges' rule, ``floor(1 + log2(n_rows))``) and ``StratifiedKFold`` is
    run on the bin labels rather than on the raw target values.

    Args:
        data: DataFrame containing the target column. It is not modified.
        n_splits: Number of folds to create.
        target_col: Name of the continuous target column.
        random_state: Optional seed for the row shuffle.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an extra
        ``kfold`` column holding the validation fold of each row.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("create_folds needs at least one row")

    # Shuffle first: sample() returns a new frame, so the kfold column is
    # added to the copy and the caller's frame is left untouched.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1

    # Number of bins from Sturges' rule.
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Keep the bin labels in a local Series instead of a temporary "bins"
    # column, so an existing column of that name is never overwritten.
    bins = pd.cut(data[target_col], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # Stratify on the bin labels, not on the continuous target itself.
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[valid_idx, "kfold"] = fold

    return data

# 主程序开始
# Script entry point: build a synthetic regression set and assign folds to it.
if __name__ == "__main__":
    # 15000 samples, 100 features and a single target.
    X, y = datasets.make_regression(
        n_samples=15000,
        n_features=100,
        n_targets=1,
    )

    # Wrap the feature matrix in a frame with columns f_0, f_1, ...
    df = pd.DataFrame(X, columns=[f"f_{idx}" for idx in range(X.shape[1])])
    df["target"] = y

    # Replace the frame with the version returned by create_folds.
    df = create_folds(df)

## 文本处理
### imdb数据演示

In [None]:
from datasets import load_dataset
import pandas as pd

import os

# Load the IMDb dataset and merge its train and test splits into one frame.
dataset = load_dataset("imdb")
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df = pd.concat([df_train, df_test], ignore_index=True)

# Save as CSV. to_csv does not create missing parent directories, so make
# sure the target folder exists before writing.
os.makedirs("data/imdb", exist_ok=True)
df.to_csv("data/imdb/imdb.csv", index=False)
print("保存完成：imdb.csv")

In [None]:
import nltk
# Download only the tokenizer data used by word_tokenize (the only NLTK
# function called in the visible code). A bare nltk.download() opens the
# interactive downloader, which blocks unattended runs.
nltk.download("punkt")
# NOTE(review): recent NLTK releases appear to load the tokenizer from
# "punkt_tab" instead - confirm against the installed version. If the id is
# unknown to the index, download() just reports failure, so requesting both
# should be harmless.
nltk.download("punkt_tab")

In [5]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

# Script entry point: 5-fold stratified cross-validation of a bag-of-words
# naive Bayes classifier on the saved IMDb CSV.
if __name__ == '__main__':
    df = pd.read_csv("data/imdb/imdb.csv")
    df['kfold'] = -1

    # Shuffle the rows and renumber the index from zero.
    df = df.sample(frac=1).reset_index(drop=True)

    # Tag every row with the fold in which it is held out, stratified on label.
    y = df.label.values
    kf = model_selection.StratifiedKFold(n_splits=5)
    for fold_id, (_, valid_rows) in enumerate(kf.split(X=df, y=y)):
        df.loc[valid_rows, 'kfold'] = fold_id

    for fold_ in range(5):
        held_out = df.kfold == fold_
        train_df = df[~held_out].reset_index(drop=True)
        test_df = df[held_out].reset_index(drop=True)

        # The vocabulary is learned from the training rows only; the
        # held-out rows are encoded with that same vocabulary.
        count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
        xtrain = count_vec.fit_transform(train_df.text)
        xtest = count_vec.transform(test_df.text)

        # linear_model.LogisticRegression() can be swapped in here to compare.
        model = naive_bayes.MultinomialNB()
        model.fit(X=xtrain, y=train_df.label)

        preds = model.predict(xtest)
        accuracy = metrics.accuracy_score(test_df.label, preds)
        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")


Fold: 0
Accuracy = 0.8439

Fold: 1
Accuracy = 0.845

Fold: 2
Accuracy = 0.8431

Fold: 3
Accuracy = 0.844

Fold: 4
Accuracy = 0.8461



['This', 'is', 'a', 'test', '.']