# 随机森林算法（泰坦尼克号预测）

In [21]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# 1.读取数据
data = pd.read_csv("./data/titanic/train.csv")
print(data.head())
print(data.info())

# 2.数据处理
x = data[['Pclass','Sex','Age']].copy()
y = data['Survived'].copy()
print(x.head(10))
x['Age'].fillna(x['Age'].mean(),inplace = True)
print(x.head())
x = pd.get_dummies(x)

# 数据集划分
x_train, x_test,y_train, y_test = train_test_split(x, y, test_size=0.2)

# 3.模型训练
# (1)决策树
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

# (2)随机森林
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

# （3）网格搜索交叉验证
params = {'n_estimators':[10,20],'max_depth':[2,3,4,5]}
model = GridSearchCV(estimator=rf,param_grid=params,cv=3)
model.fit(x_train, y_train)
print(model.best_estimator_)

rfs = RandomForestClassifier(max_depth=5, n_estimators=10)
rfs.fit(x_train, y_train)

# 4.模型评估
# （1）决策树
print("------决策树分数------")
print(tree.score(x_test, y_test))

# （2）随机森林
print("------随机森林分数------")
print(rf.score(x_test, y_test))

# （3）网络搜索交叉验证
print("------最好模型-------")
print(rfs.score(x_test, y_test))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x['Age'].fillna(x['Age'].mean(),inplace = True)


RandomForestClassifier(max_depth=4, n_estimators=10)
------决策树分数------
0.8268156424581006
------随机森林分数------
0.8268156424581006
------最好模型-------
0.8379888268156425


# AdaBoost算法实战葡萄酒数据

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
# 1.读取数据
data = pd.read_csv('./data/wine0501.csv')
print(data.info())
data = data [data['Class label'] != 1]
x = data[['Alcohol','Hue']].copy()
y = data['Class label'].copy()
print(y)

# 转换成二分类问题，用0，1表示
pre = LabelEncoder()
y = pre.fit_transform(y)
print(y)

x_train, x_test, y_train, y_test = train_test_split(x, y)
# 2.模型训练
ada = AdaBoostClassifier()
ada.fit(x_train, y_train)


# 3.模型评估
print('Adaboost预测值---->',ada.score(x_test, y_test))



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Class label                   178 non-null    int64  
 1   Alcohol                       178 non-null    float64
 2   Malic acid                    178 non-null    float64
 3   Ash                           178 non-null    float64
 4   Alcalinity of ash             178 non-null    float64
 5   Magnesium                     178 non-null    int64  
 6   Total phenols                 178 non-null    float64
 7   Flavanoids                    178 non-null    float64
 8   Nonflavanoid phenols          178 non-null    float64
 9   Proanthocyanins               178 non-null    float64
 10  Color intensity               178 non-null    float64
 11  Hue                           178 non-null    float64
 12  OD280/OD315 of diluted wines  178 non-null    float64
 13   Prol

# GBDT（泰坦尼克生存预测）

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# 读取数据
data = pd.read_csv('./data/titanic/train.csv')
print(data.head())
print(data.info())

# 数据处理
x = data[['Pclass','Sex','Age']].copy()
y = data['Survived'].copy()
print(x.head(10))
x['Age'].fillna(x['Age'].mean(),inplace = True)
print(x.head(10))
x = pd.get_dummies(x)
print(x.head(10))

x_train, x_test, y_train,y_test = train_test_split(x, y, test_size=0.2)

# 训练模型
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
print(model.score(x_test, y_test))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x['Age'].fillna(x['Age'].mean(),inplace = True)


# xgb案例：红酒品质分类（多分类）

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
# 数据处理

data = pd.read_csv('./data/红酒品质分类.csv')
print(data.info())
print(data.head())

x = data.iloc[:,:-1] # 除最后一列外，每列均是特征
y = data.iloc[:, -1]-3

x_train,x_test,y_train,y_test = train_test_split(x, y ,test_size=0.2,stratify=y)

pd.concat([x_train, y_train],axis = 1).to_csv('红酒品质分类_train.csv')
pd.concat([x_test, y_test],axis = 1).to_csv('红酒品质分类_test.csv')

# 数据获取
train_data = pd.read_csv('红酒品质分类_train.csv')
test_data = pd.read_csv('红酒品质分类_test.csv')

x_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
x_test = test_data.iloc[:,:-1]
y_test = test_data.iloc[:,-1]
class_weight = class_weight.compute_class_weight(class_weight = 'balanced',y=y_train,classes=y_train.unique())

#模型训练
model = XGBClassifier(n_estimators=5, objective="multi:softmax",use_label_encoder = False) # 从0开始
model.fit(x_train, y_train, sample_weight = class_weight)

y_pre = model.predict(x_test)

print(classification_report(y_test, y_pre))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         

XGBoostError: [18:51:57] C:\actions-runner\_work\xgboost\xgboost\src\common\quantile.cc:95: Check failed: h_weights.size() == batch.Size() (6 vs. 1279) : Invalid size of sample weight.