In [126]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import jieba
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr
from sklearn.decomposition import PCA

In [11]:
def datesets_demo():
    """
    Show how a bundled scikit-learn dataset is loaded and inspected.

    Loads the iris dataset and prints, in order: the whole returned object,
    its description text, its feature names, and its feature matrix
    followed by that matrix's shape.

    :return: None
    """
    bunch = load_iris()

    # Each entry pairs a printed heading with the value(s) shown after it.
    sections = (
        ("鸢尾花数据集的返回值：\n", (bunch,)),
        ("鸢尾花的描述：\n", (bunch.DESCR,)),
        ("鸢尾花特征的名字：\n", (bunch.feature_names,)),
        ("鸢尾花的特征值:\n", (bunch.data, bunch.data.shape)),
    )
    for heading, values in sections:
        print(heading, *values)

In [24]:
def split_data():
    """
    Split the iris dataset into training and test parts.

    20% of the samples are held out for testing (test_size=0.2) and the
    shuffle is fixed with random_state=22. Only the training features and
    their shape are printed; the other three parts are not used.

    :return: None
    """
    dataset = load_iris()

    # train_test_split returns, in order:
    # training features, test features, training targets, test targets.
    parts = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.2,
        random_state=22,
    )
    train_features = parts[0]
    print("x_train训练集特征值:\n", train_features, train_features.shape)

In [38]:
def dict_demo():
    """
    Demonstrate feature extraction from a list of dictionaries.

    Three records with a 'city' string and a 'temperature' number are
    passed through DictVectorizer; the transformed array and the generated
    feature names are printed.

    :return: None
    """
    records = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]

    # sparse=False requests a dense array instead of a sparse matrix.
    vectorizer = DictVectorizer(sparse=False)
    encoded = vectorizer.fit_transform(records)

    print("data_new:\n", encoded)
    print("特征名字:\n", vectorizer.feature_names_)

In [None]:
def count_chinese_demo(data=None):
    """
    Count-vectorize Chinese text that is already split into
    space-separated words.

    :param data: iterable of strings to vectorize. When omitted, a small
        built-in pre-segmented sample is used so the demo runs on its own.
    :return: None
    """
    if data is None:
        # The body previously read an undefined name `data` and raised
        # NameError; fall back to a self-contained sample instead.
        data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]

    # 1. Instantiate the transformer.
    transfer = CountVectorizer()
    # 2. Learn the vocabulary and build the document-term matrix.
    data_new = transfer.fit_transform(data)
    print("文本特征抽取的结果：\n", data_new.toarray())

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back on older releases.
    # .tolist() keeps the printed value a plain list of str, as before.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print("返回特征名字：\n", feature_names)

In [47]:
def count_demo():
    """
    Extract word-count features from English text with CountVectorizer.

    Two short sentences are vectorized; the dense count matrix and the
    learned feature names are printed.

    :return: None
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]

    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)

    print("文本特征抽取的结果：\n", data_new.toarray())

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back on older releases.
    # .tolist() keeps the printed value a plain list of str, as before.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print("返回特征名字：\n", feature_names)



In [58]:
def text_chinese_count_demo2():
    """
    Extract word-count features from Chinese sentences.

    Each sentence is first segmented with cut_word (space-separated
    words), then the segmented sentences are count-vectorized. The
    segmented text, the dense count matrix and the feature names are
    printed.

    :return: None
    """

    data = ["一种还是一种今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的，这样当我们看到宇宙时，我们是在看它的过去。",
            "如果只用一种方式了解某样事物，你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    # Segment every sentence before vectorizing.
    data_new = [cut_word(sent) for sent in data]
    print(data_new)

    # 1. Instantiate the transformer.
    transfer = CountVectorizer()
    # 2. Learn the vocabulary and build the document-term matrix.
    data_final = transfer.fit_transform(data_new)
    print("文本特征抽取的结果：\n", data_final.toarray())

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back on older releases.
    # .tolist() keeps the printed value a plain list of str, as before.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print("返回特征名字：\n", feature_names)

    return None

In [98]:
def minmax_demo():
    """
    Min-max normalization demo.

    Reads the tab-separated file "dating.txt", rescales its columns into
    the range [2, 3] with MinMaxScaler and prints the result.

    :return: None
    """
    data = pd.read_csv("dating.txt", sep="\t")

    # feature_range is a keyword argument taking a (min, max) tuple; the
    # previous `feature_range(2, 3)` called an undefined name and raised
    # NameError before the scaler was even constructed.
    transfer = MinMaxScaler(feature_range=(2, 3))

    # NOTE(review): every column of the file is scaled here, whereas
    # stand_demo restricts itself to 'milage', 'Liters' and 'Consumtime' —
    # confirm whether the remaining column(s) should be excluded here too.
    data_new = transfer.fit_transform(data)

    print("data_new:\n", data_new)
    return None

In [56]:
def cut_word(text):
    """
    Segment a Chinese string into words separated by single spaces.

    Example: "我爱北京天安门" -> "我 爱 北京 天安门"

    :param text: the string to segment
    :return: the segmented words joined by " "
    """
    # jieba.cut produces the tokens; str.join consumes them directly.
    tokens = jieba.cut(text)
    return " ".join(tokens)

In [106]:
def stand_demo():
    """
    Standardization demo.

    Reads the tab-separated file "dating.txt", prints it, standardizes the
    'milage', 'Liters' and 'Consumtime' columns with StandardScaler, and
    prints the scaled values followed by the per-column mean and variance
    learned by the scaler.

    :return: None
    """
    feature_columns = ['milage', 'Liters', 'Consumtime']

    frame = pd.read_csv("dating.txt", sep="\t")
    print(frame)

    # Fit on the selected columns and transform them in one step.
    scaler = StandardScaler()
    standardized = scaler.fit_transform(frame[feature_columns])

    print("标准化的结果:\n", standardized)
    print("每一列特征的平均值：\n", scaler.mean_)
    print("每一列特征的方差：\n", scaler.var_)

In [124]:
def variance_demo():
    """
    Feature selection demo: drop low-variance features, then report a
    correlation.

    Reads "factor_returns.csv", discards the first column and the last two
    columns, removes features whose variance does not exceed 10 using
    VarianceThreshold, and finally prints the Pearson correlation result
    for the "pe_ratio" and "pb_ratio" columns.

    :return: None
    """
    # iloc[:, 1:-2] keeps everything except the first and the last two columns.
    factors = pd.read_csv("factor_returns.csv").iloc[:, 1:-2]
    print("data:\n", factors)

    selector = VarianceThreshold(threshold=10)
    selected = selector.fit_transform(factors)
    print("data_new:\n", selected, selected.shape)

    # The correlation is computed on the sliced frame, not on the selector output.
    correlation = pearsonr(factors["pe_ratio"], factors["pb_ratio"])
    print("相关系数:\n", correlation)

In [129]:
def pca_demo():
    """
    PCA dimensionality-reduction demo on a 3x4 sample matrix.

    Runs PCA twice on the same data and prints each result: first with an
    integer n_components (keep 2 components), then with a float
    n_components (keep enough components to retain 90% of the information).

    :return: None
    """
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]

    # Integer n_components: the number of components to keep.
    reduced_fixed = PCA(n_components=2).fit_transform(samples)
    print("data_new:\n", reduced_fixed)

    # Float n_components: the fraction of information to retain.
    reduced_by_ratio = PCA(n_components=0.9).fit_transform(samples)
    print("保留90%的信息，降维结果为：\n", reduced_by_ratio)


In [130]:
# Entry point: runs exactly one demo. The commented-out calls below are the
# other demos defined in this notebook; uncomment one to run it instead.
if __name__ == '__main__':
    # Code 1: using sklearn datasets
    # split_data()
    # dict_demo()
    # count_demo()
    # cut_word("我爱北京天安门")
    # text_chinese_count_demo2()
    # stand_demo()
    # minmax_demo()
    # variance_demo()
    pca_demo()

data_new:
 [[ 1.28620952e-15  3.82970843e+00]
 [ 5.74456265e+00 -1.91485422e+00]
 [-5.74456265e+00 -1.91485422e+00]]
保留90%的信息，降维结果为：
 [[ 1.28620952e-15  3.82970843e+00]
 [ 5.74456265e+00 -1.91485422e+00]
 [-5.74456265e+00 -1.91485422e+00]]
