In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import jieba
import numpy as np
from sklearn.impute import SimpleImputer

In [3]:
def dictvec(sparse=False):
    """
    Demonstrate feature extraction from dictionaries with DictVectorizer.

    Fits the vectorizer on three sample dicts, prints the encoded matrix and
    the generated feature names, then encodes two unseen samples with the
    already-fitted vectorizer and prints that result as well.

    :param sparse: forwarded to ``DictVectorizer(sparse=...)``. ``False``
        (the default) yields dense arrays. ``True`` yields a sparse matrix
        that records only the non-zero positions, which saves memory when
        the encoded matrix is mostly zeros.
    :return: None
    """
    separator = '-' * 50

    # Every sample is one dict; three samples are used for fitting.
    train_samples = [{'city': '北京', 'temperature': 100},
                     {'city': '上海', 'temperature': 60},
                     {'city': '深圳', 'temperature': 30}]
    # Unseen samples, encoded later with the columns learned during fitting.
    test_data = [{'city': '北京', 'temperature': 80},
                 {'city': '深圳', 'temperature': 40}]

    vectorizer = DictVectorizer(sparse=sparse)
    encoded_train = vectorizer.fit_transform(train_samples)
    print(encoded_train)
    print(separator)

    # Categorical (string) values each become their own feature column.
    # vectorizer.inverse_transform(encoded_train) would map the matrix back
    # to dicts if you want to inspect what each column stands for.
    print(vectorizer.get_feature_names_out())
    print(separator)

    x_test = vectorizer.transform(test_data)
    print(x_test)


dictvec()

[[  0.   1.   0. 100.]
 [  1.   0.   0.  60.]
 [  0.   0.   1.  30.]]
--------------------------------------------------
['city=上海' 'city=北京' 'city=深圳' 'temperature']
--------------------------------------------------
[[ 0.  1.  0. 80.]
 [ 0.  0.  1. 40.]]


In [4]:
def convec():
    """
    Demonstrate word counting with CountVectorizer on three English sentences.

    Prints, separated by dashed lines: the vocabulary, the sparse count
    matrix, its type, the dense form of the matrix, and the words recovered
    per document via inverse_transform.

    :return: None
    """
    corpus = [
        "life is  short,i like python life",
        "life is too long,i dislike python",
        "life is short",
    ]
    # min_df=2 drops words that occur in fewer than two documents.
    counter = CountVectorizer(min_df=2)
    counts = counter.fit_transform(corpus)

    reports = (
        counter.get_feature_names_out(),
        counts,
        type(counts),
        counts.toarray(),
        counter.inverse_transform(counts),
    )
    for position, report in enumerate(reports):
        # A dashed line goes between consecutive reports, not before the first.
        if position > 0:
            print('-' * 50)
        print(report)

convec()

['is' 'life' 'python' 'short']
--------------------------------------------------
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10 stored elements and shape (3, 4)>
  Coords	Values
  (0, 1)	2
  (0, 0)	1
  (0, 3)	1
  (0, 2)	1
  (1, 1)	1
  (1, 0)	1
  (1, 2)	1
  (2, 1)	1
  (2, 0)	1
  (2, 3)	1
--------------------------------------------------
<class 'scipy.sparse._csr.csr_matrix'>
--------------------------------------------------
[[1 2 1 1]
 [1 1 1 0]
 [1 1 0 1]]
--------------------------------------------------
[array(['life', 'is', 'short', 'python'], dtype='<U6'), array(['life', 'is', 'python'], dtype='<U6'), array(['life', 'is', 'short'], dtype='<U6')]


In [5]:
def cutword():
    """
    Segment three Chinese sentences with jieba.

    Prints the type returned by jieba.cut, then the token list of each
    sentence, and finally returns each sentence re-joined with single spaces
    between tokens.

    :return: tuple of three space-separated strings, one per sentence
    """
    sentences = (
        "今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年之前发出的，这样当我们看到宇宙时，我们是在看它的过去。",
        "如果只用一种方式了解某样事物，你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。",
    )
    separator = '-' * 50

    # jieba.cut hands back a generator; nothing is segmented until it is consumed.
    token_streams = [jieba.cut(sentence) for sentence in sentences]
    print(type(token_streams[0]))
    print(separator)

    # Materialise every generator into a list before printing any of them.
    token_lists = [list(stream) for stream in token_streams]
    for tokens in token_lists:
        print(tokens)
    print(separator)

    # Join each token list into one string, tokens separated by a space.
    c1, c2, c3 = (' '.join(tokens) for tokens in token_lists)
    return c1, c2, c3

In [6]:
def tdif():
    """
    Compute TF-IDF weights for the three segmented sentences from cutword().

    Prints, separated by dashed lines: the vocabulary, the type of the
    result matrix, the words recovered per document via inverse_transform,
    and the dense weight matrix.

    :return: None
    """
    first_doc, second_doc, third_doc = cutword()

    weigher = TfidfVectorizer(smooth_idf=True)
    weights = weigher.fit_transform([first_doc, second_doc, third_doc])

    reports = (
        weigher.get_feature_names_out(),
        type(weights),
        weigher.inverse_transform(weights),
        weights.toarray(),
    )
    for position, report in enumerate(reports):
        # A dashed line goes between consecutive reports, not before the first.
        if position > 0:
            print('-' * 50)
        print(report)

tdif()

Building prefix dict from the default dictionary ...


<class 'generator'>
--------------------------------------------------


Dumping model to file cache C:\Users\false\AppData\Local\Temp\jieba.cache
Loading model cost 0.506 seconds.
Prefix dict has been built successfully.


['今天', '很', '残酷', '，', '明天', '更', '残酷', '，', '后天', '很', '美好', '，', '但', '绝对', '大部分', '是', '死', '在', '明天', '晚上', '，', '所以', '每个', '人', '不要', '放弃', '今天', '。']
['我们', '看到', '的', '从', '很', '远', '星系', '来', '的', '光是在', '几百万年', '之前', '发出', '的', '，', '这样', '当', '我们', '看到', '宇宙', '时', '，', '我们', '是', '在', '看', '它', '的', '过去', '。']
['如果', '只用', '一种', '方式', '了解', '某样', '事物', '，', '你', '就', '不会', '真正', '了解', '它', '。', '了解', '事物', '真正', '含义', '的', '秘密', '取决于', '如何', '将', '其', '与', '我们', '所', '了解', '的', '事物', '相', '联系', '。']
--------------------------------------------------
['一种' '不会' '不要' '之前' '了解' '事物' '今天' '光是在' '几百万年' '发出' '取决于' '只用' '后天' '含义'
 '大部分' '如何' '如果' '宇宙' '我们' '所以' '放弃' '方式' '明天' '星系' '晚上' '某样' '残酷' '每个'
 '看到' '真正' '秘密' '绝对' '美好' '联系' '过去' '这样']
--------------------------------------------------
<class 'scipy.sparse._csr.csr_matrix'>
--------------------------------------------------
[array(['今天', '残酷', '明天', '后天', '美好', '绝对', '大部分', '晚上', '所以', '每个', '不要',
       '放弃'], dtype='<U4'),

In [7]:
def minmax():
    """
    Min-max normalisation demo.

    Rescales a 3x4 sample matrix into the range (0, 1) with MinMaxScaler
    and prints the rescaled matrix.

    :return: None
    """
    samples = [[90, 2, 10, 40],
               [60, 4, 15, 45],
               [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(0, 1))
    print(scaler.fit_transform(samples))

minmax()
    

[[1.         0.         0.         0.        ]
 [0.         1.         1.         0.83333333]
 [0.5        0.5        0.6        1.        ]]


In [8]:
def sd():
    """
    Standardisation demo with StandardScaler.

    Fits the scaler on a 3x4 sample matrix and prints the standardised
    matrix, the fitted ``mean_`` and ``var_``, and finally a second matrix
    transformed with the already-fitted scaler (no refit).

    :return: None
    """
    train_rows = [[90, 2, 10, 40],
                  [60, 4, 15, 45],
                  [75, 3, 13, 46]]
    new_rows = [[190, 22, 110, 404],
                [660, 46, 145, 455],
                [715, 376, 143, 416]]

    scaler = StandardScaler()
    print(scaler.fit_transform(train_rows))
    print(scaler.mean_)
    print(scaler.var_)
    # transform() reuses the statistics fitted on train_rows.
    print(scaler.transform(new_rows))
sd()

[[ 1.22474487 -1.22474487 -1.29777137 -1.3970014 ]
 [-1.22474487  1.22474487  1.13554995  0.50800051]
 [ 0.          0.          0.16222142  0.88900089]]
[75.          3.         12.66666667 43.66666667]
[150.           0.66666667   4.22222222   6.88888889]
[[  9.38971068  23.27015256  47.36865497 137.28713729]
 [ 47.76504998  52.66402947  64.40190419 156.71815672]
 [ 52.25578118 456.82983703  63.42857566 141.85914186]]


In [9]:
def Simpleinput():
    """
    Missing-value imputation demo with SimpleImputer.

    Replaces the np.nan entry of a 3x4 sample matrix using the 'mean'
    strategy and prints the completed matrix.

    :return: None
    """
    samples = [[90, np.nan, 10, 40],
               [60, 4, 15, 45],
               [75, 3, 13, 46]]
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    filled = imputer.fit_transform(samples)
    print(filled)
Simpleinput()

[[90.   3.5 10.  40. ]
 [60.   4.  15.  45. ]
 [75.   3.  13.  46. ]]


In [13]:
def var():
    """
    Variance-filter demo with VarianceThreshold(threshold=0.2).

    Prints the columns that survive the filter, the names of those
    columns, and their integer indices in the original matrix.

    :return: None
    """
    samples = [[0, 2, 0, 3],
               [0, 1, 4, 3],
               [0, 1, 1, 3]]
    selector = VarianceThreshold(threshold=0.2)
    kept_columns = selector.fit_transform(samples)
    print(kept_columns)
    print(selector.get_feature_names_out())
    # indices=True returns column positions instead of a boolean mask.
    print(selector.get_support(indices=True))
var()
    
    

[[2 0]
 [1 4]
 [1 1]]
['x1' 'x2']
[1 2]


In [23]:
def pca():
    """
    Principal component analysis demo.

    Reduces a 3x4 sample matrix with PCA(n_components=0.99) and prints:
    the summed per-column variance of the original data, the reduced data,
    the summed per-column variance of the reduced data, a dashed line, the
    explained-variance ratio of each kept component, and the sum of those
    ratios.

    :return: None
    """
    # Single source of truth for the samples: both the "before" variance and
    # the PCA fit use this array, so the two printed variances always
    # describe the same data.
    original = np.array([[2, 8, 4, 5],
                         [6, 3, 0, 8],
                         [5, 4, 9, 1]])
    reducer = PCA(n_components=0.99)

    print(np.var(original, axis=0).sum())
    reduced = reducer.fit_transform(original)

    print(reduced)
    print(np.var(reduced, axis=0).sum())
    print('-' * 50)
    print(reducer.explained_variance_ratio_)
    # Share of the total variance retained by the kept components.
    print(reducer.explained_variance_ratio_.sum())
pca()
    
    
    

29.333333333333336
[[-1.28620952e-15  3.82970843e+00]
 [-5.74456265e+00 -1.91485422e+00]
 [ 5.74456265e+00 -1.91485422e+00]]
29.333333333333332
--------------------------------------------------
[0.75 0.25]
1.0


In [None]:
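# load_* and fetch_* dataset helpers (sklearn.datasets): load_* returns small bundled datasets, fetch_* downloads larger ones on first use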

In [24]:
import time

from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score