In [1]:
import pandas as pd
# Toy frame with one column per feature type handled below:
# nominal (城市), yes/no (布尔列), ordinal (顺序列) and quantitative (定量列).
X = pd.DataFrame(
    data={
        '城市': ['东京', '东京', '伦敦', '西雅图', '旧金山', '东京'],
        '布尔列': ['yes', 'no', 'yes', 'no', 'no', 'yes'],
        '顺序列': ['有点喜欢', '喜欢', '有点喜欢', '喜欢', '有点喜欢', '不喜欢'],
        '定量列': [1, 11, -0.5, 10, 8, 20],
    }
)

In [2]:
# Display the frame to check its four columns (城市, 布尔列, 顺序列, 定量列) before encoding.
X

Unnamed: 0,城市,布尔列,顺序列,定量列
0,东京,yes,有点喜欢,1.0
1,东京,no,喜欢,11.0
2,伦敦,yes,有点喜欢,-0.5
3,西雅图,no,喜欢,10.0
4,旧金山,no,有点喜欢,8.0
5,东京,yes,不喜欢,20.0


In [3]:
# One-hot encode the two listed columns; every other column passes through unchanged.
pd.get_dummies(
    X,
    columns=['城市', '布尔列'],  # columns to expand into dummy indicator columns
    prefix_sep='_',              # separator between the prefix (column name) and the cell value
    dtype=int,                   # pandas >= 2.0 defaults to bool dummies; force 0/1 integers
)

Unnamed: 0,顺序列,定量列,城市_东京,城市_伦敦,城市_旧金山,城市_西雅图,布尔列_no,布尔列_yes
0,有点喜欢,1.0,1,0,0,0,0,1
1,喜欢,11.0,1,0,0,0,1,0
2,有点喜欢,-0.5,0,1,0,0,0,1
3,喜欢,10.0,0,0,0,1,1,0
4,有点喜欢,8.0,0,0,1,0,1,0
5,不喜欢,20.0,1,0,0,0,0,1


In [4]:
# Ordinal levels listed from lowest to highest: a label's position in this
# list is the integer code it will be mapped to.
ordering = [
    '不喜欢',    # 0 -> dislike
    '有点喜欢',  # 1 -> somewhat like
    '喜欢',      # 2 -> like
]
# Before mapping `ordering` onto the ordinal column, look at the raw column first.
print(X['顺序列'])

0    有点喜欢
1      喜欢
2    有点喜欢
3      喜欢
4    有点喜欢
5     不喜欢
Name: 顺序列, dtype: object


In [5]:
# Preview of the mapper applied in the next cell: it returns the position of label x in `ordering`.
# Evaluated on its own, the cell only displays the function object; nothing is bound or applied.
lambda x: ordering.index(x)

<function __main__.<lambda>(x)>

In [6]:
# Replace each ordinal label with its position in `ordering`
# (the bound method is passed directly as the element-wise mapper).
X['顺序列'].map(ordering.index)

0    1
1    2
2    1
3    2
4    1
5    0
Name: 顺序列, dtype: int64

In [7]:
# With no explicit labels, each value is tagged with the interval (bin) it falls into.
pd.cut(x=X['定量列'], bins=3)

0     (-0.52, 6.333]
1    (6.333, 13.167]
2     (-0.52, 6.333]
3    (6.333, 13.167]
4    (6.333, 13.167]
5     (13.167, 20.0]
Name: 定量列, dtype: category
Categories (3, interval[float64]): [(-0.52, 6.333] < (6.333, 13.167] < (13.167, 20.0]]

In [8]:
# labels=False drops the interval labels and returns only the integer bin index.
pd.cut(x=X['定量列'], bins=3, labels=False)

0    0
1    1
2    0
3    1
4    1
5    2
Name: 定量列, dtype: int64

In [9]:
# The CSV has no header row, so read it raw and then name the five columns explicitly.
df = pd.read_csv(filepath_or_buffer='./data/1.csv', header=None)
df.columns = [
    'index',
    'x',
    'y',
    'z',
    'activity',
]
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,index,x,y,z,activity
0,0.0,1502,2215,2153,1
1,1.0,1667,2072,2047,1
2,2.0,1611,1957,1906,1
3,3.0,1601,1939,1831,1
4,4.0,1643,1965,1879,1


In [10]:
from sklearn.preprocessing import PolynomialFeatures

In [11]:
# Degree-2 expansion: keeps the original terms and adds squares and pairwise products
# (interaction_only=False), without the constant bias column (include_bias=False).
poly = PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)

In [12]:
# Expand the x/y/z columns into degree-2 polynomial features.
# The column selection is passed inline instead of being bound to `X`, so the
# toy frame `X` used by the earlier encoding/binning cells is not overwritten
# and those cells can still be re-run afterwards.
X_poly = poly.fit_transform(df[['x', 'y', 'z']])
# (n_rows, n_generated_features)
X_poly.shape

(162501, 9)

In [13]:
# Label the expanded matrix with the generated feature names and show the first rows.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out()`; prefer the new accessor and fall back on older installs.
# NOTE(review): newer releases may derive the labels from the input frame's column
# names rather than x0/x1/x2 — confirm against the installed version.
pd.DataFrame(
    X_poly,
    columns=(
        poly.get_feature_names_out()
        if hasattr(poly, 'get_feature_names_out')
        else poly.get_feature_names()
    ),
).head()

Unnamed: 0,x0,x1,x2,x0^2,x0 x1,x0 x2,x1^2,x1 x2,x2^2
0,1502.0,2215.0,2153.0,2256004.0,3326930.0,3233806.0,4906225.0,4768895.0,4635409.0
1,1667.0,2072.0,2047.0,2778889.0,3454024.0,3412349.0,4293184.0,4241384.0,4190209.0
2,1611.0,1957.0,1906.0,2595321.0,3152727.0,3070566.0,3829849.0,3730042.0,3632836.0
3,1601.0,1939.0,1831.0,2563201.0,3104339.0,2931431.0,3759721.0,3550309.0,3352561.0
4,1643.0,1965.0,1879.0,2699449.0,3228495.0,3087197.0,3861225.0,3692235.0,3530641.0


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Four short sentences used to demonstrate bag-of-words counting.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one step.
dt = vectorizer.fit_transform(corpus)
# List the vocabulary terms ordered by their column index.
# `vectorizer.get_feature_names()` was removed in scikit-learn 1.2; sorting the
# term -> index mapping in `vocabulary_` by index yields the same list on every version.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [15]:
# Dense view of the count matrix: one row per sentence in `corpus`, one column per vocabulary term.
dt.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Four short sentences used to demonstrate TF-IDF weighting.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
# Learn the vocabulary and compute the TF-IDF matrix in one step.
X = vectorizer.fit_transform(corpus)
# List the vocabulary terms ordered by their column index.
# `vectorizer.get_feature_names()` was removed in scikit-learn 1.2; sorting the
# term -> index mapping in `vocabulary_` by index yields the same list on every version.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [2]:
# Convert the fit_transform result to a dense array so every TF-IDF weight is visible.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [26]:
# `vocabulary_` maps each term to its column index; take the (term, index) pairs and display them.
items=vectorizer.vocabulary_.items()
items

dict_items([('this', 8), ('is', 3), ('the', 6), ('first', 2), ('document', 1), ('second', 5), ('and', 0), ('third', 7), ('one', 4)])

In [19]:
# Invert the term -> column-index vocabulary into a column-index -> term lookup.
feature_dict = {
    column_index: term
    for term, column_index in vectorizer.vocabulary_.items()
}

In [20]:
# Display the dense TF-IDF array again; the output matches the earlier X.toarray() cell.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# A second, three-sentence corpus for the TF-IDF demonstration.
corpus = [
    'I love dogs.',
    'I hate dogs and knitting.',
    'Knitting is my hobby and passion.',
]
vectorizer = TfidfVectorizer()
# Learn the vocabulary and compute the TF-IDF matrix in one step.
X = vectorizer.fit_transform(corpus)
# List the vocabulary terms ordered by their column index.
# `vectorizer.get_feature_names()` was removed in scikit-learn 1.2; sorting the
# term -> index mapping in `vocabulary_` by index yields the same list on every version.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

['and', 'dogs', 'hate', 'hobby', 'is', 'knitting', 'love', 'my', 'passion']

In [14]:
# Dense TF-IDF weights: one row per sentence in `corpus`, one column per vocabulary term.
X.toarray()

array([[0.        , 0.60534851, 0.        , 0.        , 0.        ,
        0.        , 0.79596054, 0.        , 0.        ],
       [0.45985353, 0.45985353, 0.60465213, 0.        , 0.        ,
        0.45985353, 0.        , 0.        , 0.        ],
       [0.3349067 , 0.        , 0.        , 0.44036207, 0.44036207,
        0.3349067 , 0.        , 0.44036207, 0.44036207]])