In [1]:
import pandas as pd, numpy as np
from sklearn.feature_extraction import DictVectorizer
'''DictVectorizer的处理对象是符号化(非数字化)的但是具有一定结构的特征数据，如字典等，将符号转成数字0/1表示'''
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from random import randint

In [25]:
# Read the tips dataset from the CSV file (path is relative to the notebook's working directory).
tips = pd.read_csv('data/tips.csv')
# Show the first five rows as a quick sanity check (five is the default for head()).
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [26]:
# The regression target is the 'tip' column ...
target = tips['tip']
# ... and the feature frame is every remaining column.
data = tips.drop(columns='tip')
# Preview the first five feature rows.
data.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [27]:
# Names of the categorical (string-valued) columns that need one-hot encoding.
v = ['sex', 'smoker', 'day', 'time']
# Convert those columns to a list with one dict per row:
# [{column -> value}, ..., {column -> value}]
ls = data[v].to_dict(orient='records')
# Display only a short preview; echoing the whole list would print one dict
# per row of the dataset and bury the rest of the notebook.
ls[:5]

[{'sex': 'Female', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Female', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Female', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Male', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {'sex': 'Female', 'smoker': 'No', 'day': 'Sun', 'time': 'Dinner'},
 {

In [28]:
# DictVectorizer acts as an encoder: it transforms lists of feature-value
# mappings (dicts) into vectors. A dense integer array is requested here
# instead of the sparse float matrix it would otherwise return.
vector = DictVectorizer(dtype=int, sparse=False)

In [29]:
# Learn the feature -> column mapping from the records and encode them in one step.
d = vector.fit_transform(ls)
# Print the first three one-hot encoded rows.
print("one hot encoding:\n", d[:3])


one hot encoding:
 [[0 0 1 0 1 0 1 0 1 0]
 [0 0 1 0 0 1 1 0 1 0]
 [0 0 1 0 0 1 1 0 1 0]]


In [30]:
# Column order of the one-hot matrix produced by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns an ndarray, so it is
# converted to a plain list to keep `encode_order` the same type as before.
if hasattr(vector, 'get_feature_names_out'):
    encode_order = vector.get_feature_names_out().tolist()
else:
    # Fallback for scikit-learn versions older than 1.0.
    encode_order = vector.get_feature_names()
encode_order

['day=Fri',
 'day=Sat',
 'day=Sun',
 'day=Thur',
 'sex=Female',
 'sex=Male',
 'smoker=No',
 'smoker=Yes',
 'time=Dinner',
 'time=Lunch']

In [35]:
# Show the feature columns before the categorical ones are removed.
print(data.columns)
# Keep only the numeric columns. The result goes into a new frame instead of
# overwriting `data` (original author's note: writing the result back onto the
# original replaced it and made re-running this cell fail).
data_new = data.drop(columns=['sex', 'smoker', 'day', 'time'])
# Numeric features as a plain NumPy array.
X = data_new.to_numpy()
print('feature shape after removing categorical columns:')
print(X.shape, data_new.columns)

Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
feature shape after removing categorical columns:
(244, 2) Index(['total_bill', 'size'], dtype='object')


In [36]:
# Assemble the final feature matrix: numeric columns followed by the one-hot
# encoded categorical columns.
# X is rebuilt from `data_new` rather than from the previous value of X, so
# re-running this cell always yields the same matrix instead of appending the
# encoded columns a second time. np.hstack does the column-wise join in one
# vectorized call, replacing the per-row Python list concatenation.
X = np.hstack([data_new.values, d])
# Target values as a NumPy array, row-aligned with X.
y = target.values
print('feature shape after adding encoded data back:')
print(X.shape)

feature shape after adding encoded data back:
(244, 12)


In [37]:
# Hold out a test split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Ordinary least-squares regression with an intercept term.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression(fit_intercept=True).fit(X_train, y_train)
lr_model_name = type(lr_model).__name__

# Evaluate on the held-out rows: root of the mean squared error.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(rmse, '(rmse)')

0.9636287548943022 (rmse)


In [41]:
# Compare the actual and predicted tip for the first test-set row.
print('predict 1st test set element (actual/prediction):')
print(y_test[0], y_pred[0])

# Draw three random row indices from the full dataset (randint is inclusive
# at both ends, hence the -1).
rints = [randint(0, len(y) - 1) for _ in range(3)]
print('random integers:', rints)

# Predict the tip for the sampled rows and round to two decimals for display.
p = [X[idx] for idx in rints]
y_p = list(np.around(lr_model.predict(p), 2))
print(y_p, '(predicted)')
print([y[idx] for idx in rints], '(actual)')

predict 1st test set element (actual/prediction):
2.64 2.81211304380231
random integers: [37, 148, 159]
[2.76, 2.02, 3.04] (predicted)
[3.07, 1.73, 2.0] (actual)
