# 資料說明:

機器學習百日馬拉松期中考 - Enron Fraud Dataset 安隆公司詐欺案資料集

安隆公司曾是一間能源公司，2001 年破產前是世界上最大的電力、天然氣及電信公司之一。擁有上千億資產的公司於 2002 年竟然在短短幾周內宣告破產，才揭露其財報在多年以來均是造假的醜聞。在本資料集中你將會扮演偵探的角色，透過高層經理人內部的 mail 來往的情報以及薪資、股票等財務特徵，訓練出一個機器學習模型來幫忙你找到可疑的詐欺犯罪者是誰! 我們已經先幫你找到幾位犯罪者 (Person-of-Interest, poi) 與清白的員工，請利用這些訓練資料來訓練屬於自己的詐欺犯機器學習模型吧!

# 資料來源:

https://www.kaggle.com/c/2020-ml100marathon-midterm/data

# 關鍵問題:

參考:
https://docs.google.com/document/d/1XxV5_gN0E4oEYvk4ovYwepjNxCAMdAFqcSacHMrd7U0/edit?usp=sharing


# STEP1:載入套件 & 讀取資料

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/train_data.csv')
test = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/test_features.csv')
sample_submission = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/sample_submission.csv')

train_Y = train['poi']
ids = test['name']
train = train.drop(['name', 'poi'] , axis=1)
test = test.drop(['name'] , axis=1)
df = pd.concat([train,test])

print(train.shape)
print(test.shape)
print(df.shape)
df.head()

(113, 20)
(33, 20)
(146, 20)


Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,michael.kopper@enron.com,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [3]:
#在開始胡搞瞎搞之前，先備份起來:
train_backup1 = train
test_backup = test

# STEP2:資料前處理

In [4]:
# 檢查 DataFrame 空缺值的狀態

def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data)
    
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_messages,41.09589
from_poi_to_this_person,41.09589
from_this_person_to_poi,41.09589


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 0 to 32
Data columns (total 20 columns):
bonus                        82 non-null float64
deferral_payments            39 non-null float64
deferred_income              49 non-null float64
director_fees                17 non-null float64
email_address                111 non-null object
exercised_stock_options      102 non-null float64
expenses                     95 non-null float64
from_messages                86 non-null float64
from_poi_to_this_person      86 non-null float64
from_this_person_to_poi      86 non-null float64
loan_advances                4 non-null float64
long_term_incentive          66 non-null float64
other                        93 non-null float64
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
salary                       95 non-null float64
shared_receipt_with_poi      86 non-null float64
to_messages                  86 non-null float64
total_payment

In [6]:
# 捨棄 email_addres 欄位
df.drop(labels = ["email_address"], axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 0 to 32
Data columns (total 19 columns):
bonus                        82 non-null float64
deferral_payments            39 non-null float64
deferred_income              49 non-null float64
director_fees                17 non-null float64
exercised_stock_options      102 non-null float64
expenses                     95 non-null float64
from_messages                86 non-null float64
from_poi_to_this_person      86 non-null float64
from_this_person_to_poi      86 non-null float64
loan_advances                4 non-null float64
long_term_incentive          66 non-null float64
other                        93 non-null float64
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
salary                       95 non-null float64
shared_receipt_with_poi      86 non-null float64
to_messages                  86 non-null float64
total_payments               125 non-null float64
total_stock_

In [7]:
#都先用0填補:
cols_to_fillin = []
for col in df.columns:
    if df[col].isnull().any()==True:
        cols_to_fillin.append(col)
        
#cols_to_fillin

for i in cols_to_fillin:
    df[i] = df[i].fillna(0)
    
na_check(df)

Unnamed: 0,Missing Ratio


In [None]:
# STEP3:EDA

In [None]:
# STEP4:特徵工程

In [8]:
print(df.nunique())
print('==================================')
print(df.info())

bonus                         42
deferral_payments             40
deferred_income               45
director_fees                 18
exercised_stock_options      102
expenses                      95
from_messages                 65
from_poi_to_this_person       57
from_this_person_to_poi       41
loan_advances                  5
long_term_incentive           53
other                         93
restricted_stock              98
restricted_stock_deferred     19
salary                        95
shared_receipt_with_poi       84
to_messages                   87
total_payments               126
total_stock_value            125
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 0 to 32
Data columns (total 19 columns):
bonus                        146 non-null float64
deferral_payments            146 non-null float64
deferred_income              146 non-null float64
director_fees                146 non-null float64
exercised_stock_options      146 non-null float64
expens

看起來都沒有類別變數了

In [10]:
# 將資料最大最小化
from sklearn.preprocessing import MinMaxScaler
df = MinMaxScaler().fit_transform(df)

# STEP5:使用模型

In [11]:
# 將前述轉換完畢資料 df , 重新切成 train_X, test_X
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

In [12]:
print(df.shape)
print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test.shape)

(146, 19)
(113, 19)
(33, 19)
(113,)
(33, 20)


In [13]:
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 讀取資料集
wine = datasets.load_wine()

# 切分訓練集/測試集
#x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2, random_state=4)

# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
clf = RandomForestClassifier(n_estimators=20, max_depth=4, oob_score=True)

# 訓練模型
clf.fit(train_X, train_Y)

# 預測測試集
y_pred = clf.predict_proba(test_X)[:,1]

#acc = metrics.accuracy_score(test, y_pred)
#print("Accuracy: ", acc)

print(y_pred)

[0.21502592 0.29680638 0.40022305 0.15684851 0.46349843 0.21347305
 0.73525641 0.05256949 0.05599756 0.04104581 0.02582579 0.00256949
 0.01029008 0.00256949 0.32274484 0.00256949 0.03297855 0.31162351
 0.14104631 0.00824176 0.88125    0.47651235 0.01449115 0.05888672
 0.01097855 0.01812891 0.10888672 0.00461031 0.19026408 0.40643606
 0.00971235 0.04292546 0.37140345]


In [14]:
sub = pd.DataFrame({'name': ids, 'poi': y_pred})
sub['poi'] = sub['poi'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('EnronFraud_rf2.csv', index=False)

In [16]:
# check:
sub = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/EnronFraud_rf2.csv')
sub

Unnamed: 0,name,poi
0,BELDEN TIMOTHY N,0
1,BOWEN JR RAYMOND M,0
2,HANNON KEVIN P,0
3,DELAINEY DAVID W,0
4,CAUSEY RICHARD A,0
5,HICKERSON GARY J,0
6,FREVERT MARK A,1
7,CHAN RONNIE,0
8,DONAHUE JR JEFFREY M,0
9,REYNOLDS LAWRENCE,0


# 0.46428