In [1]:
import numpy as np, pandas as pd
from sklearn.ensemble import RandomForestRegressor# 随机森林回归
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression# 线性回归
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
tips = pd.read_csv('data/tips.csv')
print("原始数据大小：",tips.shape)
print(tips.head(5))

原始数据大小： (244, 7)
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [3]:
target = tips['tip']
data = tips.iloc[:,tips.columns!='tip']
# data = tips.drop(['tip'], axis=1)# 不改变原有对象，当inplace=False，默认该删除操作不改变原数据，而是返回一个执行删除操作后的新dataframe；
data.head(5)

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [4]:
data = pd.get_dummies(data, columns=['sex', 'smoker','day', 'time'])
data.head(10)

Unnamed: 0,total_bill,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,2,1,0,1,0,0,0,1,0,1,0
1,10.34,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3,0,1,1,0,0,0,1,0,1,0
3,23.68,2,0,1,1,0,0,0,1,0,1,0
4,24.59,4,1,0,1,0,0,0,1,0,1,0
5,25.29,4,0,1,1,0,0,0,1,0,1,0
6,8.77,2,0,1,1,0,0,0,1,0,1,0
7,26.88,4,0,1,1,0,0,0,1,0,1,0
8,15.04,2,0,1,1,0,0,0,1,0,1,0
9,14.78,2,0,1,1,0,0,0,1,0,1,0


In [5]:
d = {'sex_Male':'male', 'sex_Female':'female','smoker_Yes':'smoker', 'smoker_No':'non-smoker',
     'day_Thur':'th', 'day_Fri':'fri', 'day_Sat':'sat','day_Sun':'sun', 'time_Lunch':'lunch',
     'time_Dinner':'dinner'}
data = data.rename(index=str, columns=d)
data.columns

Index(['total_bill', 'size', 'female', 'male', 'non-smoker', 'smoker', 'fri',
       'sat', 'sun', 'th', 'dinner', 'lunch'],
      dtype='object')

In [6]:
X = data.values
y = target.values# tip那一列
print ('X and y shapes (post conversion):')
print (X.shape, y.shape)

X and y shapes (post conversion):
(244, 12) (244,)


In [7]:
X_vector = np.array([30.00, 'NaN',1, 0, 1, 0, 0, 0, 0, 1, 1, 0])
y_vector = np.array([4.5])

In [8]:
X = np.vstack([X, X_vector])# 行增加
y = np.append(y, y_vector)# 目标数据也加一行，这里是加一个

In [9]:
X.shape

(245, 12)

In [10]:
print ('new X and y data point:')
print (X[244], y[244])# 刚刚添加的

new X and y data point:
['30.0' 'NaN' '1' '0' '1' '0' '0' '0' '0' '1' '1' '0'] 4.5


In [11]:
X_vectors = np.array([[24.99, 'NaN', 0, 1, 0, 1, 1, 0, 0, 0, 0, 1],
                     [19.99, 'NaN',1, 0, 1, 0, 0, 0, 0, 1, 1, 0]])
y_vectors = np.array([[3.5], [2.0]])
X = np.vstack([X, X_vectors])# 添加两行
y = np.append(y, y_vectors)
print('new X and y data points:')
print(X[245], y[245])
print(X[246], y[246])

new X and y data points:
['24.99' 'NaN' '0' '1' '0' '1' '1' '0' '0' '0' '0' '1'] 3.5
['19.99' 'NaN' '1' '0' '1' '0' '0' '0' '0' '1' '1' '0'] 2.0


In [12]:
# 数据缺失值补全
imputer = SimpleImputer()
imputer.fit(X)
X = imputer.transform(X)#尝试补全NAN

In [13]:
print ('new data shape:', X.shape)
print ('new records post imputation (features and targets):')
print (X[244],y[244])
print (X[245],y[245])
print (X[246], y[246])

new data shape: (247, 12)
new records post imputation (features and targets):
[30.          2.56967213  1.          0.          1.          0.
  0.          0.          0.          1.          1.          0.        ] 4.5
[24.99        2.56967213  0.          1.          0.          1.
  1.          0.          0.          0.          0.          1.        ] 3.5
[19.99        2.56967213  1.          0.          1.          0.
  0.          0.          0.          1.          1.          0.        ] 2.0


In [14]:
# 定义一个随机森林回归，然后fit
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
rfr.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [15]:
feature_importances = rfr.feature_importances_# 生成各个featurs的权重
feature = list(data.columns.values)
importance = sorted(zip(feature_importances,feature),reverse=True)

In [17]:
print("feature importance (first 6 features)")
[print(row)for i,row in enumerate(importance) if i<6]


feature importance (first 6 features)
(0.7597845511444516, 'total_bill')
(0.06437753343804929, 'size')
(0.036634219162666466, 'non-smoker')
(0.03360397711716997, 'smoker')
(0.026410154999617016, 'sat')
(0.021865644745990637, 'sun')


[None, None, None, None, None, None]

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
lr_model = LinearRegression()
lr_model_name = lr_model.__class__.__name__
lr_model.fit(X_train,y_train)
y_pred = lr_model.predict(X_test)

In [22]:
# MSE误差
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print('LinearRegression MSE:',rmse)


LinearRegression MSE: 0.9474705746817211


In [23]:
p1 = [X[244]]
p2 = [X[245], X[246]]
y1, y2 = lr_model.predict(p1), lr_model.predict(p2)
print (y[244], y1[0])
print (y[245], y2[0])
print (y[246], y2[1])


4.5 3.827512419066448
3.5 3.5664995107583293
2.0 2.9415950382447313


In [25]:
X_file = 'data/X_tips'
y_file = 'data/y_tips'
np.save(X_file, X)
np.save(y_file, y)