# demo03_预测学生数学成绩

In [4]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as sp

In [7]:
# Load the raw student-performance table.
data = pd.read_csv(
    '../data/学生考试表现数据/StudentsPerformance.csv',
    engine='python')

# Label-encode every column up to and including 'test preparation course'.
# Each fitted encoder is kept in `encoders` (keyed by column name) so the
# integer codes can be mapped back to the original category labels later via
# `encoders[col].inverse_transform(...)` instead of being lost when the loop
# variable is overwritten.
encoders = {}
categorical_cols = data.loc[:, :'test preparation course'].columns
for col in categorical_cols:
    encoder = sp.LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder
    # The position of a label in classes_ is the integer it was encoded to.
    print(encoder.classes_)

# Keep the encoded features plus the target; any column after 'math score'
# is dropped.
data = data.loc[:, :'math score']
data.head()

['female' 'male']
['group A' 'group B' 'group C' 'group D' 'group E']
["associate's degree" "bachelor's degree" 'high school' "master's degree"
 'some college' 'some high school']
['free/reduced' 'standard']
['completed' 'none']


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
0,0,1,1,1,1,72
1,0,2,4,1,0,69
2,0,1,3,1,1,90
3,1,0,0,0,1,47
4,1,2,4,1,1,76


## 训练模型

In [8]:
import sklearn.preprocessing as sp
import sklearn.pipeline as pl
import sklearn.model_selection as ms
import sklearn.metrics as sm

In [9]:
# Assemble the inputs (all encoded feature columns) and the output
# (the math score).
x = data.loc[:, :'test preparation course']
y = data['math score']
# Hold out 10% of the rows as a test set; the seed keeps the split repeatable.
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.1, random_state=7)
# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

((900, 5), (900,), (100, 5), (100,))

In [10]:
# 岭回归
import sklearn.linear_model as lm
import sklearn.metrics as sm

# Ridge regression baseline with default hyper-parameters.
model = lm.Ridge()
model.fit(train_x, train_y)
# Evaluate: predict on both splits, then report the scores.
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
train_r2 = sm.r2_score(train_y, pred_train_y)
test_r2 = sm.r2_score(test_y, pred_test_y)
test_mae = sm.mean_absolute_error(test_y, pred_test_y)
print('Training r2:', train_r2)
print('Testing r2:', test_r2)
print('Testing MAE:', test_mae)

Training r2: 0.2311355957090051
Testing r2: 0.15025561846513014
Testing MAE: 10.546468385023935


In [14]:
# 多项式回归
import sklearn.linear_model as lm
import sklearn.metrics as sm
import sklearn.preprocessing as sp
import sklearn.pipeline as pl

# Polynomial regression: expand the features to degree 4, then fit Ridge on
# the expanded matrix.
poly_expander = sp.PolynomialFeatures(4)
ridge_step = lm.Ridge()
model = pl.make_pipeline(poly_expander, ridge_step)
model.fit(train_x, train_y)
# Evaluate on both splits.
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
for label, score in (
        ('Training r2:', sm.r2_score(train_y, pred_train_y)),
        ('Testing r2:', sm.r2_score(test_y, pred_test_y)),
        ('Testing MAE:', sm.mean_absolute_error(test_y, pred_test_y))):
    print(label, score)

Training r2: 0.26983230373905376
Testing r2: 0.16615873222887845
Testing MAE: 10.486880624198921


In [17]:
import sklearn.ensemble as se
import sklearn.tree as st

# AdaBoost over depth-limited regression trees.
# The base learner gets its own name so `model` always refers to the
# estimator that is actually fitted and evaluated.
base_tree = st.DecisionTreeRegressor(max_depth=10)
model = se.AdaBoostRegressor(
            base_tree, n_estimators=200, random_state=7)
model.fit(train_x, train_y)
# Evaluate on both splits.
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
print('Training r2:', sm.r2_score(train_y, pred_train_y))
print('Testing r2:', sm.r2_score(test_y, pred_test_y))
# Report MAE as well, so this model can be compared with the other
# evaluation cells on the same three metrics.
print('Testing MAE:', sm.mean_absolute_error(test_y, pred_test_y))

Training r2: 0.36096009163375153
Testing r2: -0.0569793610633067


In [18]:
# Gradient-boosted regression trees.
# random_state is pinned (same seed as the train/test split) so that
# re-running the cell reproduces the same scores.
model = se.GradientBoostingRegressor(
    max_depth=4, n_estimators=500, min_samples_split=3,
    random_state=7)
model.fit(train_x, train_y)
# Evaluate on both splits.
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
print('Training r2:', sm.r2_score(train_y, pred_train_y))
print('Testing r2:', sm.r2_score(test_y, pred_test_y))
print('Testing MAE:', sm.mean_absolute_error(test_y, pred_test_y))

Training r2: 0.37279728350574215
Testing r2: -0.02160164374415241
Testing MAE: 11.477396076567661


In [19]:
# Random forest regressor.
# A forest draws bootstrap samples and candidate features at random, so
# without a fixed random_state every run yields different scores. The seed
# matches the one used for the train/test split.
model = se.RandomForestRegressor(
    max_depth=10, n_estimators=200, min_samples_split=10,
    random_state=7)
model.fit(train_x, train_y)
# Evaluate on both splits.
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
print('Training r2:', sm.r2_score(train_y, pred_train_y))
print('Testing r2:', sm.r2_score(test_y, pred_test_y))
print('Testing MAE:', sm.mean_absolute_error(test_y, pred_test_y))

Training r2: 0.35671503869104393
Testing r2: 0.07885557870754312
Testing MAE: 10.796428702486686


In [23]:
# 决策树回归
import sklearn.metrics as sm
import sklearn.tree as st

# If the tree overfits: lower max_depth a little and raise
# min_samples_split a little.
model = st.DecisionTreeRegressor(max_depth=4, min_samples_split=5)
model.fit(train_x, train_y)
# Evaluate on both splits.
pred_train_y = model.predict(train_x)
pred_test_y = model.predict(test_x)
for label, score in (
        ('Training r2:', sm.r2_score(train_y, pred_train_y)),
        ('Testing r2:', sm.r2_score(test_y, pred_test_y)),
        ('Testing MAE:', sm.mean_absolute_error(test_y, pred_test_y))):
    print(label, score)

Training r2: 0.24319209298809463
Testing r2: 0.09276542667560894
Testing MAE: 10.980674408816935


In [34]:
# Put the held-out targets next to the predictions and rank the test rows by
# absolute error, to see which samples the model gets most wrong.
# NOTE: `pred_test_y` is whatever the most recently executed model cell left
# behind, so run the model of interest immediately before this cell.
r = pd.DataFrame({'test_y': test_y, 'pred_test_y': pred_test_y})
r['diff'] = np.abs(r['test_y'] - r['pred_test_y'])
r = r.sort_values(by='diff', ascending=False)
# Look up the ten worst-predicted rows in the encoded dataset. (A bare `r`
# in the middle of the cell displayed nothing -- only a cell's last
# expression is rendered -- so it has been removed.)
data.loc[r.head(10).index]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
689,1,4,4,0,1,93
339,0,3,5,0,0,35
299,1,3,0,0,1,90
979,0,2,0,1,1,91
786,0,4,5,0,1,72
962,0,4,0,1,1,100
34,1,4,4,1,1,97
125,0,1,2,1,1,87
277,0,4,2,1,0,59
377,0,3,3,0,0,85
