## 匯入資料

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
#忽略警示訊息
import warnings
warnings.filterwarnings('ignore')

data_path = '/Users/amber21_chang/Work/ML100Days/Day051-053/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_features.csv')

## 資料前處理

### 合併資料集

In [2]:
#合併資料集
train_Y = df_train['poi']
#test_Name = df_test['name']
train_dp = df_train.drop(['name', 'poi'] , axis=1)
test_dp = df_test.drop(['name'] , axis=1)
df = pd.concat([train_dp, test_dp])
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,michael.kopper@enron.com,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [3]:
#另存數值型特徵
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(num_features)} Numeric Features : {num_features}\n')

19 Numeric Features : ['bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value']



### NA值處理

In [4]:
#NA值處理(Day012)
def na_check(df_data):
    #計算各欄位NA值比例
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    #???data_na.drop(data_na[data_na == 0].index)
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    #將有NA值的特徵欄位與NA值比例組成新的DataFrame
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data)
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_messages,41.09589
from_poi_to_this_person,41.09589
from_this_person_to_poi,41.09589


In [5]:
#把有8成以上NA的特徵欄位直接捨棄
df = df.drop(['loan_advances', 'director_fees', 'restricted_stock_deferred'], axis=1)
#將object型態的email_address欄位捨去
df = df.drop(['email_address'], axis=1)

In [6]:
from sklearn.impute import SimpleImputer
#看完各欄位, 發先都有明顯偏態, 所以將NA值以中位數填補
imputer = SimpleImputer(strategy = 'median')
#填補器載入各欄位中位數
imputer.fit(df)
# #將中位數回填df資料中的NA值
# train = imputer.transform(train)

SimpleImputer(strategy='median')

In [7]:
#確認NA值比例
na_check(df)

Unnamed: 0,Missing Ratio
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
to_messages,41.09589
shared_receipt_with_poi,41.09589
from_this_person_to_poi,41.09589
from_poi_to_this_person,41.09589
from_messages,41.09589
other,36.30137


In [8]:
#將中位數回填df資料中的NA值
df = imputer.transform(df)
type(df)
##df變成array

numpy.ndarray

In [9]:
#再轉成dataframe
df = pd.DataFrame(df)

In [10]:
#再度查看有無NA值
na_check(df)

Unnamed: 0,Missing Ratio


In [11]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1750000.0,227449.0,-3504386.0,19794175.0,46950.0,18.0,42.0,4.0,1617011.0,174839.0,2748364.0,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,227449.0,-159792.0,19250000.0,29336.0,108.0,88.0,30.0,1920000.0,22122.0,6843672.0,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,227449.0,-4167.0,1624396.0,22884.0,39.0,13.0,14.0,442035.0,1573324.0,869220.0,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,227449.0,-159792.0,1310813.5,118134.0,41.0,35.0,8.0,602671.0,907502.0,985032.0,224305.0,740.5,1211.0,2652612.0,985032.0
4,1250000.0,227449.0,-262500.0,1310813.5,35818.0,144.0,199.0,25.0,375304.0,486.0,126027.0,240189.0,2188.0,2598.0,1639297.0,126027.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,1000000.0,227449.0,-159792.0,1465734.0,38559.0,40.0,23.0,8.0,369721.0,425688.0,378082.0,213625.0,1336.0,1607.0,2047593.0,1843816.0
142,1500000.0,227449.0,-159792.0,1835558.0,46950.0,92.0,28.0,23.0,554422.0,1852186.0,1293424.0,428780.0,2103.0,3187.0,4335388.0,3128982.0
143,769375.0,504610.0,-159792.0,2218275.0,46950.0,12.0,0.0,0.0,461912.0,52382.0,451740.0,259996.0,23.0,169.0,966522.0,2218275.0
144,200000.0,204075.0,-159792.0,2549361.0,57727.0,41.0,35.0,8.0,175000.0,2630.0,514847.0,236457.0,740.5,1211.0,875889.0,3064208.0


In [12]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,1670735.0,605488.6,-488925.4,4577776.0,87148.61,375.452055,52.609589,27.575342,906894.9,604447.2,1860645.0,456631.9,997.30137,1719.260274,4509042.0,5997096.0
std,8047011.0,2716204.0,2362197.0,26013510.0,430594.1,1437.174998,68.210867,78.357081,4011885.0,3679386.0,10883690.0,2191873.0,927.488807,2022.788673,26911500.0,36224260.0
min,70000.0,-102500.0,-27992890.0,3285.0,148.0,12.0,0.0,0.0,69223.0,2.0,-2604490.0,477.0,2.0,57.0,148.0,-44093.0
25%,712500.0,227449.0,-159792.0,757865.0,33848.5,36.0,25.75,6.0,442035.0,2686.5,360528.0,239800.5,591.5,904.25,554467.5,612908.0
50%,769375.0,227449.0,-159792.0,1310814.0,46950.0,41.0,35.0,8.0,442035.0,52382.0,451740.0,259996.0,740.5,1211.0,1101393.0,1102872.0
75%,800000.0,227449.0,-159792.0,1714221.0,53740.75,51.25,40.75,13.75,442035.0,150606.5,814528.0,270850.5,893.5,1585.75,1968287.0,2319991.0
max,97343620.0,32083400.0,-833.0,311764000.0,5235198.0,14368.0,528.0,609.0,48521930.0,42667590.0,130322300.0,26704230.0,5521.0,15149.0,309886600.0,434509500.0


### 處理離群值

In [13]:
print(df[0].mean(), df[0].std())

1670734.506849315 8047011.284904227


In [14]:
outlier_under0 = df[0].mean() - 3* df[0].std()
outlier_upper0 = df[0].mean() + 3* df[0].std()
#clip-比outlier_under0小用outlier_under0取代, 比outlier_upper0大用outlier_upper0取代
df[0] = df[0].clip(outlier_under0, outlier_upper0)

In [15]:
print(df[0].mean(), df[0].std())

1180790.3243942603 2325019.412694041


In [16]:
print(df[1].mean(), df[1].std())

605488.595890411 2716204.2385775135


In [17]:
outlier_under1 = df[1].mean() - 3* df[1].std()
outlier_upper1 = df[1].mean() + 3* df[1].std()
df[1] = df[1].clip(outlier_under1, outlier_upper1)

In [18]:
print(df[1].mean(), df[1].std())

445698.9062439928 988169.4899051384


In [19]:
print(df[2].mean(), df[2].std())

-488925.38356164383 2362196.9055702807


In [20]:
outlier_under2 = df[2].mean() - 3* df[2].std()
outlier_upper2 = df[2].mean() + 3* df[2].std()
df[2] = df[2].clip(outlier_under2, outlier_upper2)

In [21]:
print(df[2].mean(), df[2].std())

-349080.35000186635 830424.2242718436


In [22]:
print(df[3].mean(), df[3].std())

4577775.883561644 26013506.559099928


In [23]:
outlier_under3 = df[3].mean() - 3* df[3].std()
outlier_upper3 = df[3].mean() + 3* df[3].std()
df[3] = df[3].clip(outlier_under3, outlier_upper3)

In [24]:
print(df[3].mean(), df[3].std())

3008284.757266174 8089657.160968359


In [25]:
print(df[4].mean(), df[4].std())

87148.6095890411 430594.121648656


In [26]:
outlier_under4 = df[4].mean() - 3* df[4].std()
outlier_upper4 = df[4].mean() + 3* df[4].std()
df[4] = df[4].clip(outlier_under4, outlier_upper4)

In [27]:
print(df[4].mean(), df[4].std())

60735.82174339047 115939.12020805883


In [28]:
print(df[5].mean(), df[5].std())

375.45205479452056 1437.1749975376947


In [29]:
outlier_under5 = df[5].mean() - 3* df[5].std()
outlier_upper5 = df[5].mean() + 3* df[5].std()
df[5] = df[5].clip(outlier_under5, outlier_upper5)

In [30]:
print(df[5].mean(), df[5].std())

294.9517403754466 827.0626271157283


In [31]:
print(df[6].mean(), df[6].std())

52.60958904109589 68.21086709736782


In [32]:
outlier_under6 = df[6].mean() - 3* df[6].std()
outlier_upper6 = df[6].mean() + 3* df[6].std()
df[6] = df[6].clip(outlier_under6, outlier_upper6)

In [33]:
print(df[6].mean(), df[6].std())

50.427975210043826 56.81196505486255


In [34]:
print(df[7].mean(), df[7].std())

27.575342465753426 78.35708146299956


In [35]:
outlier_under7 = df[7].mean() - 3* df[7].std()
outlier_upper7 = df[7].mean() + 3* df[7].std()
df[7] = df[7].clip(outlier_under7, outlier_upper7)

In [36]:
print(df[7].mean(), df[7].std())

22.4903174480754 50.15912740704077


In [37]:
print(df[8].mean(), df[8].std())

906894.904109589 4011885.322810267


In [38]:
outlier_under8 = df[8].mean() - 3* df[8].std()
outlier_upper8 = df[8].mean() + 3* df[8].std()
df[8] = df[8].clip(outlier_under8, outlier_upper8)

In [39]:
print(df[8].mean(), df[8].std())

663200.5402228794 1182640.5332582819


In [40]:
print(df[9].mean(), df[9].std())

604447.1780821917 3679386.153667983


In [41]:
outlier_under9 = df[9].mean() - 3* df[9].std()
outlier_upper9 = df[9].mean() + 3* df[9].std()
df[9] = df[9].clip(outlier_under9, outlier_upper9)

In [42]:
print(df[9].mean(), df[9].std())

391947.2920485352 1459648.5044118532


In [43]:
print(df[10].mean(), df[10].std())

1860644.9657534247 10883692.955060324


In [44]:
outlier_under10 = df[10].mean() - 3* df[10].std()
outlier_upper10 = df[10].mean() + 3* df[10].std()
df[10] = df[10].clip(outlier_under10, outlier_upper10)

In [45]:
print(df[10].mean(), df[10].std())

1204408.1495269479 3400571.915732899


In [46]:
print(df[11].mean(), df[11].std())

456631.87671232875 2191873.3043484525


In [47]:
outlier_under11 = df[11].mean() - 3* df[11].std()
outlier_upper11 = df[11].mean() + 3* df[11].std()
df[11] = df[11].clip(outlier_under11, outlier_upper11)

In [48]:
print(df[11].mean(), df[11].std())

321892.3067791622 577011.1645433352


In [49]:
print(df[12].mean(), df[12].std())

997.3013698630137 927.4888074042616


In [50]:
outlier_under12 = df[12].mean() - 3* df[12].std()
outlier_upper12 = df[12].mean() + 3* df[12].std()
df[12] = df[12].clip(outlier_under12, outlier_upper12)

In [51]:
print(df[12].mean(), df[12].std())

978.048432659611 852.5591402294692


In [52]:
print(df[13].mean(), df[13].std())

1719.2602739726028 2022.788672735956


In [53]:
outlier_under13 = df[13].mean() - 3* df[13].std()
outlier_upper13 = df[13].mean() + 3* df[13].std()
df[13] = df[13].clip(outlier_under13, outlier_upper13)

In [54]:
print(df[13].mean(), df[13].std())

1629.8870217035746 1563.8077608132212


In [55]:
print(df[14].mean(), df[14].std())

4509041.534246575 26911497.027707174


In [56]:
outlier_under14 = df[14].mean() - 3* df[14].std()
outlier_upper14 = df[14].mean() + 3* df[14].std()
df[14] = df[14].clip(outlier_under14, outlier_upper14)

In [57]:
print(df[14].mean(), df[14].std())

2844936.6522927135 10027021.554808505


In [58]:
print(df[15].mean(), df[15].std())

5997096.5 36224264.89428919


In [59]:
outlier_under15 = df[15].mean() - 3* df[15].std()
outlier_upper15 = df[15].mean() + 3* df[15].std()
df[15] = df[15].clip(outlier_under15, outlier_upper15)

In [60]:
print(df[15].mean(), df[15].std())

3806414.172485394 11065584.47118006


In [61]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,1180790.0,445698.9,-349080.4,3008285.0,60735.82,294.95174,50.427975,22.490317,663200.5,391947.3,1204408.0,321892.3,978.048433,1629.887022,2844937.0,3806414.0
std,2325019.0,988169.5,830424.2,8089657.0,115939.1,827.062627,56.811965,50.159127,1182641.0,1459649.0,3400572.0,577011.2,852.55914,1563.807761,10027020.0,11065580.0
min,70000.0,-102500.0,-7575516.0,3285.0,148.0,12.0,0.0,0.0,69223.0,2.0,-2604490.0,477.0,2.0,57.0,148.0,-44093.0
25%,712500.0,227449.0,-159792.0,757865.0,33848.5,36.0,25.75,6.0,442035.0,2686.5,360528.0,239800.5,591.5,904.25,554467.5,612908.0
50%,769375.0,227449.0,-159792.0,1310814.0,46950.0,41.0,35.0,8.0,442035.0,52382.0,451740.0,259996.0,740.5,1211.0,1101393.0,1102872.0
75%,800000.0,227449.0,-159792.0,1714221.0,53740.75,51.25,40.75,13.75,442035.0,150606.5,814528.0,270850.5,893.5,1585.75,1968287.0,2319991.0
max,25811770.0,8754101.0,-833.0,82618300.0,1378931.0,4686.977047,257.24219,262.646587,12942550.0,11642610.0,34511720.0,7032252.0,3779.767792,7787.626292,85243530.0,114669900.0


### 去除偏態

In [62]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
import copy
df_fixed = copy.deepcopy(df)
mms = MinMaxScaler()
x_mms = mms.fit_transform(df_fixed)

In [63]:
#boxcox轉換
from scipy import stats
import copy
df_fixed[0] = stats.boxcox(df_fixed[0])[0]
#df_fixed[1] = stats.boxcox(df_fixed[1])[1]
#df_fixed[2] = stats.boxcox(df_fixed[2])[2]
#df_fixed[3] = stats.boxcox(df_fixed[3])[3]
#df_fixed[4] = stats.boxcox(df_fixed[4])[4]
#df_fixed[5] = stats.boxcox(df_fixed[5])[5]
#df_fixed[6] = stats.boxcox(df_fixed[6])[6]
#df_fixed[7] = stats.boxcox(df_fixed[7])[7]
#df_fixed[8] = stats.boxcox(df_fixed[8])[8]
#df_fixed[9] = stats.boxcox(df_fixed[9])[9]
#df_fixed[10] = stats.boxcox(df_fixed[10])[10]
#df_fixed[11] = stats.boxcox(df_fixed[11])[11]
#df_fixed[12] = stats.boxcox(df_fixed[12])[12]
#df_fixed[13] = stats.boxcox(df_fixed[13])[13]
#df_fixed[14] = stats.boxcox(df_fixed[14])[14]
#df_fixed[15] = stats.boxcox(df_fixed[15])[15]
##只有0可以順利做boxcox轉換
##1, 2, 10, 15有負數或0
##3, 4, 8, 9, 11, 14-IndexError: tuple index out of range

In [64]:
#次數以log1p轉換
#5=from_messages, 6=from_poi_to_this_person, 7=from_this_person_to_poi
#12=shared_receipt_with_poi, 13=to_messages
df_fixed[5] = np.log1p(df_fixed[5])[5]
df_fixed[6] = np.log1p(df_fixed[6])[6]
df_fixed[7] = np.log1p(df_fixed[7])[7]
df_fixed[12] = np.log1p(df_fixed[12])[12]
df_fixed[13] = np.log1p(df_fixed[13])[13]

In [65]:
#將做過轉換的欄位繪圖查看
# import seaborn as sns
# import matplotlib.pyplot as plt
# train_num = train_Y.shape[0]
# sns.distplot(df_fixed[5][:train_num])

# plt.show()

# df_fixed = MMEncoder.fit_transform(df_fixed)
# train_X = df_fixed[:train_num]
# estimator = LogisticRegression()
# print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
##RuntimeError: Selected KDE bandwidth is 0. Cannot estiamte density.

### 直接建模

In [66]:
train_num = train_Y.shape[0]
x_train = df_fixed[:train_num]
#test_Name = df_test['name']

In [67]:
test_Name = df_test['name']

In [68]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_train, train_Y, test_size=0.2)

#### DecisionTree

In [69]:
#建模
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=1)
#訓練模型
dtc.fit(x_train, y_train)
#預測測試集
y_pred1 = dtc.predict(x_test)
#計算精準度
from sklearn import metrics
acc = metrics.accuracy_score(y_test, y_pred1)
print("Accuracy: ", acc)
auc = metrics.roc_auc_score(y_test, y_pred1) # 使用 roc_auc_score 來評估。 **這邊特別注意 y_pred 必須要放機率值進去!**
print("AUC: ", auc)
##Accuracy:  0.8695652173913043
##AUC:  0.7833333333333333

Accuracy:  0.782608695652174
AUC:  0.6428571428571428


In [70]:
x_test_pred = df_fixed[train_num:]

In [71]:
#預測test資料集
y_pred_dtc = dtc.predict_proba(x_test_pred)
#將y_pred由array轉為Dataframe
y_pred_dtc = pd.DataFrame(y_pred_dtc)

In [72]:
#彙整y_pred_pd結果
import numpy as np
y_pred_dtc.columns = ['Not poi','poi']
y_pred_dtc['Name'] = df_test['name']
y_pred_dtc = y_pred_dtc[['Name', 'poi']]
print(y_pred_dtc)

                    Name  poi
0       BELDEN TIMOTHY N  0.0
1     BOWEN JR RAYMOND M  0.0
2         HANNON KEVIN P  1.0
3       DELAINEY DAVID W  0.0
4       CAUSEY RICHARD A  0.0
5       HICKERSON GARY J  1.0
6         FREVERT MARK A  1.0
7            CHAN RONNIE  0.0
8   DONAHUE JR JEFFREY M  0.0
9      REYNOLDS LAWRENCE  0.0
10      HORTON STANLEY C  0.0
11         LEWIS RICHARD  0.0
12    SHERRICK JEFFREY B  0.0
13        MEYER JEROME J  0.0
14   DERRICK JR. JAMES V  1.0
15              PIRO JIM  0.0
16   DETMERING TIMOTHY J  0.0
17     KISHKILL JOSEPH G  0.0
18        BAY FRANKLIN R  0.0
19        PRENTICE JAMES  0.0
20                 TOTAL  0.0
21        FALLON JAMES B  0.0
22            METTS MARK  0.0
23         WODRASKA JOHN  0.0
24   MORDAUNT KRISTINA M  0.0
25      CARTER REBECCA C  0.0
26      BHATNAGAR SANJAY  0.0
27      WALTERS GARETH W  0.0
28       BIBI PHILIPPE A  0.0
29       SHERRIFF JOHN R  0.0
30          GIBBS DANA R  0.0
31        LINDHOLM TOD A  0.0
32       M

In [73]:
#y_pred_dtc.to_csv('Midterm_mmsboxlogDecisionTree.csv', index=False)

#### RandomForest

In [74]:
##RandomForestClassifier()
#建模
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=3)
#訓練模型
rfc.fit(x_train, y_train)
#預測測試集
y_pred2 = rfc.predict(x_test)
#計算精準度
from sklearn import metrics
acc = metrics.accuracy_score(y_test, y_pred2)
print("Accuracy: ", acc)
auc = metrics.roc_auc_score(y_test, y_pred2) # 使用 roc_auc_score 來評估。 **這邊特別注意 y_pred 必須要放機率值進去!**
print("AUC: ", auc)
##Accuracy:  0.9130434782608695
##AUC:  0.6666666666666666

Accuracy:  0.7391304347826086
AUC:  0.5714285714285714


In [75]:
x_test_pred = df_fixed[train_num:]

In [76]:
#預測test資料集
y_pred_rfc = rfc.predict_proba(x_test_pred)
#將y_pred由array轉為Dataframe
y_pred_rfc = pd.DataFrame(y_pred_rfc)

In [77]:
#彙整y_pred_pd結果
import numpy as np
y_pred_rfc.columns = ['Not poi','poi']
y_pred_rfc['Name'] = df_test['name']
y_pred_rfc = y_pred_rfc[['Name', 'poi']]
print(y_pred_rfc)

                    Name   poi
0       BELDEN TIMOTHY N  0.20
1     BOWEN JR RAYMOND M  0.18
2         HANNON KEVIN P  0.26
3       DELAINEY DAVID W  0.01
4       CAUSEY RICHARD A  0.08
5       HICKERSON GARY J  0.18
6         FREVERT MARK A  0.40
7            CHAN RONNIE  0.06
8   DONAHUE JR JEFFREY M  0.07
9      REYNOLDS LAWRENCE  0.16
10      HORTON STANLEY C  0.00
11         LEWIS RICHARD  0.00
12    SHERRICK JEFFREY B  0.00
13        MEYER JEROME J  0.03
14   DERRICK JR. JAMES V  0.23
15              PIRO JIM  0.07
16   DETMERING TIMOTHY J  0.06
17     KISHKILL JOSEPH G  0.04
18        BAY FRANKLIN R  0.09
19        PRENTICE JAMES  0.00
20                 TOTAL  0.55
21        FALLON JAMES B  0.07
22            METTS MARK  0.00
23         WODRASKA JOHN  0.03
24   MORDAUNT KRISTINA M  0.01
25      CARTER REBECCA C  0.02
26      BHATNAGAR SANJAY  0.09
27      WALTERS GARETH W  0.03
28       BIBI PHILIPPE A  0.04
29       SHERRIFF JOHN R  0.08
30          GIBBS DANA R  0.00
31      

In [78]:
#y_pred_rfc.to_csv('Midterm_mmsboxlogRandomForest.csv', index=False)

#### GradientBoosting

In [79]:
##GradientBoostingClassifier()
#建模
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=5)
#訓練模型
gbc.fit(x_train, y_train)
#預測測試集
y_pred3 = gbc.predict(x_test)
#計算精準度
from sklearn import metrics
acc = metrics.accuracy_score(y_test, y_pred3)
print("Accuracy: ", acc)
auc = metrics.roc_auc_score(y_test, y_pred3) # 使用 roc_auc_score 來評估。 **這邊特別注意 y_pred 必須要放機率值進去!**
print("AUC: ", auc)
##Accuracy:  0.8695652173913043
##AUC:  0.6416666666666666

Accuracy:  0.782608695652174
AUC:  0.6428571428571428


In [80]:
x_test_pred = df_fixed[train_num:]

In [81]:
#預測test資料集
y_pred_gbc = gbc.predict_proba(x_test_pred)
#將y_pred由array轉為Dataframe
y_pred_gbc = pd.DataFrame(y_pred_gbc)

In [82]:
#彙整y_pred_pd結果
import numpy as np
y_pred_gbc.columns = ['Not poi','poi']
y_pred_gbc['Name'] = df_test['name']
y_pred_gbc = y_pred_gbc[['Name', 'poi']]
print(y_pred_gbc)

                    Name       poi
0       BELDEN TIMOTHY N  0.000569
1     BOWEN JR RAYMOND M  0.000558
2         HANNON KEVIN P  0.992778
3       DELAINEY DAVID W  0.000173
4       CAUSEY RICHARD A  0.000083
5       HICKERSON GARY J  0.910444
6         FREVERT MARK A  0.215447
7            CHAN RONNIE  0.003706
8   DONAHUE JR JEFFREY M  0.000039
9      REYNOLDS LAWRENCE  0.001006
10      HORTON STANLEY C  0.000053
11         LEWIS RICHARD  0.000039
12    SHERRICK JEFFREY B  0.000039
13        MEYER JEROME J  0.000324
14   DERRICK JR. JAMES V  0.944765
15              PIRO JIM  0.000153
16   DETMERING TIMOTHY J  0.000148
17     KISHKILL JOSEPH G  0.000315
18        BAY FRANKLIN R  0.000320
19        PRENTICE JAMES  0.000039
20                 TOTAL  0.465848
21        FALLON JAMES B  0.000173
22            METTS MARK  0.000039
23         WODRASKA JOHN  0.000035
24   MORDAUNT KRISTINA M  0.000482
25      CARTER REBECCA C  0.000045
26      BHATNAGAR SANJAY  0.000127
27      WALTERS GARE

In [83]:
#y_pred_gbc.to_csv('Midterm_mmsboxlogGradientBoosting.csv', index=False)

### Grid Search