In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
# 数据导入
dataset = pd.read_csv("D:\Codes\data\smoking_trend.csv")
dataset.head()

Unnamed: 0,Country,Year,Data.Daily cigarettes,Data.Percentage.Male,Data.Percentage.Female,Data.Percentage.Total,Data.Smokers.Total,Data.Smokers.Female,Data.Smokers.Male
0,Afghanistan,1980,5.7,10.4,18.4,2.4,733520,81707,651813
1,Afghanistan,1981,5.8,10.5,18.4,2.3,720102,79276,640826
2,Afghanistan,1982,5.8,10.5,18.5,2.3,700415,76061,624355
3,Afghanistan,1983,5.9,10.5,18.6,2.3,676984,72411,604572
4,Afghanistan,1984,6.0,10.6,18.6,2.3,653812,68908,584905


In [27]:
# 数据清洗
# 空
dataset.isna().sum()

Country                   0
Year                      0
Data.Daily cigarettes     0
Data.Percentage.Male      0
Data.Percentage.Female    0
Data.Percentage.Total     0
Data.Smokers.Total        0
Data.Smokers.Female       0
Data.Smokers.Male         0
dtype: int64

In [28]:
# 重
dataset.duplicated().sum()

0

In [29]:
# 无用列
dataset.nunique()

Country                    188
Year                        33
Data.Daily cigarettes      554
Data.Percentage.Male       403
Data.Percentage.Female     571
Data.Percentage.Total      359
Data.Smokers.Total        6187
Data.Smokers.Female       6085
Data.Smokers.Male         6194
dtype: int64

In [30]:
# 异常
dataset.describe()

Unnamed: 0,Year,Data.Daily cigarettes,Data.Percentage.Male,Data.Percentage.Female,Data.Percentage.Total,Data.Smokers.Total,Data.Smokers.Female,Data.Smokers.Male
count,6204.0,6204.0,6204.0,6204.0,6204.0,6204.0,6204.0,6204.0
mean,1996.0,19.851854,18.988878,28.998179,9.149017,9312808.0,1687367.0,7625441.0
std,9.522672,13.754893,9.181818,12.971933,8.902388,66624580.0,11726040.0,55272140.0
min,1980.0,1.0,3.0,5.0,0.5,1711.0,297.0,1248.0
25%,1988.0,12.7,11.0,17.9,2.3,164767.8,20712.5,137469.8
50%,1996.0,19.1,18.5,27.9,4.9,675578.0,95724.5,512561.5
75%,2004.0,23.9,26.1,38.400002,15.0,2243216.0,502533.5,1643580.0
max,2012.0,135.89999,50.599998,65.599998,38.099998,967313700.0,161025500.0,807280600.0


In [31]:
# 类型
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6204 entries, 0 to 6203
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country                 6204 non-null   object 
 1   Year                    6204 non-null   int64  
 2   Data.Daily cigarettes   6204 non-null   float64
 3   Data.Percentage.Male    6204 non-null   float64
 4   Data.Percentage.Female  6204 non-null   float64
 5   Data.Percentage.Total   6204 non-null   float64
 6   Data.Smokers.Total      6204 non-null   int64  
 7   Data.Smokers.Female     6204 non-null   int64  
 8   Data.Smokers.Male       6204 non-null   int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 436.3+ KB


In [32]:
# 特征工程
from sklearn.model_selection import train_test_split
import pickle
columns = ['Country', 'Year', 'Data.Daily cigarettes']
data = dataset[columns]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6204 entries, 0 to 6203
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                6204 non-null   object 
 1   Year                   6204 non-null   int64  
 2   Data.Daily cigarettes  6204 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 145.5+ KB


In [33]:
# 特征处理
# 独热编码定类离散数据
columns = ['Country']
data_encoder = pd.get_dummies(data[columns], dtype=float)
data = pd.concat([data, data_encoder], axis=1)
data.drop(columns, axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6204 entries, 0 to 6203
Columns: 190 entries, Year to Country_Zimbabwe
dtypes: float64(189), int64(1)
memory usage: 9.0 MB


In [34]:
# 划分训练数据集和测试数据集
X = data.drop('Data.Daily cigarettes', axis=1)
y = data['Data.Daily cigarettes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4963, 189), (4963,), (1241, 189), (1241,))

In [35]:
# 数据建模
# 模型选择
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

regs = [
    LinearRegression(),
    SVR(),
]
for i, reg in enumerate(regs):
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print(str(i)+'-'*30)
    print("Cross Valication Score:", np.mean(cross_val_score(reg, X, y, cv=5)))

0------------------------------
Cross Valication Score: 0.02006328105218269
1------------------------------
Cross Valication Score: -0.0040714216622409925


In [36]:
# 模型优化
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
# 参数调优

# 集成学习
reg = LinearRegression()
reg_best = BaggingRegressor(base_estimator=reg, n_estimators=100, random_state=7)
reg_best.fit(X_train, y_train)



In [37]:
# 模型评估
from sklearn.metrics import mean_squared_error, r2_score
y_pred = reg_best.predict(X_test)
# mse, r2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MES:\t{}\nR2:\t{}".format(mse, r2))

MES:	15.21180627177811
R2:	0.9142949096570181


In [38]:
# 模型保存
with open('model.pkl', 'wb') as f:
    pickle.dump(reg_best, f)