# 資料說明:

機器學習百日馬拉松期中考 - Enron Fraud Dataset 安隆公司詐欺案資料集

安隆公司曾是一間能源公司，2001 年破產前是世界上最大的電力、天然氣及電信公司之一。擁有上千億資產的公司於 2002 年竟然在短短幾周內宣告破產，才揭露其財報在多年以來均是造假的醜聞。在本資料集中你將會扮演偵探的角色，透過高層經理人內部的 mail 來往的情報以及薪資、股票等財務特徵，訓練出一個機器學習模型來幫忙你找到可疑的詐欺犯罪者是誰! 我們已經先幫你找到幾位犯罪者 (Person-of-Interest, poi) 與清白的員工，請利用這些訓練資料來訓練屬於自己的詐欺犯機器學習模型吧!

# 資料來源:

https://www.kaggle.com/c/2020-ml100marathon-midterm/data

# 關鍵問題:

參考:
https://docs.google.com/document/d/1XxV5_gN0E4oEYvk4ovYwepjNxCAMdAFqcSacHMrd7U0/edit?usp=sharing


# STEP1:載入套件 & 讀取資料

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [63]:
train = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/train_data.csv')
test = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/test_features.csv')
sample_submission = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/sample_submission.csv')

In [64]:
train_Y = train['poi']
ids = test['name']  #最後繳交檔案時需要用到
#train = train.drop(['name', 'poi'] , axis=1)
#test = test.drop(['name'] , axis=1)
df = pd.concat([train,test])

print(train.shape)
print(test.shape)
print(df.shape)
df.head()

(113, 22)
(33, 21)
(146, 22)


Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,name,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,...,RICE KENNETH D,174839.0,True,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,...,SKILLING JEFFREY K,22122.0,True,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,...,SHELBY REX,1573324.0,True,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,michael.kopper@enron.com,,118134.0,,,,...,KOPPER MICHAEL J,907502.0,True,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,25.0,...,CALGER CHRISTOPHER F,486.0,True,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [65]:
#在開始胡搞瞎搞之前，先備份起來:
train_backup1 = train
test_backup1 = test
df_backup1 = df

# STEP2:資料前處理

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 0 to 32
Data columns (total 22 columns):
bonus                        82 non-null float64
deferral_payments            39 non-null float64
deferred_income              49 non-null float64
director_fees                17 non-null float64
email_address                111 non-null object
exercised_stock_options      102 non-null float64
expenses                     95 non-null float64
from_messages                86 non-null float64
from_poi_to_this_person      86 non-null float64
from_this_person_to_poi      86 non-null float64
loan_advances                4 non-null float64
long_term_incentive          66 non-null float64
name                         146 non-null object
other                        93 non-null float64
poi                          113 non-null object
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
salary                       95 non-null float64
shared_receip

In [67]:
# Fill in the NaN payment and stock values with zero 
df['total_payments'] = df['total_payments'].fillna(0)
df['exercised_stock_options'] = df['exercised_stock_options'].fillna(0)

In [68]:
df_poi = df[df['poi'] == True]
df_nonpoi = df[df['poi']==False]

In [69]:
#fill in that value with the mean value of ‘to_messages’ for a POI
df_poi['to_messages'] = df_poi['to_messages'].mean()

#fill in that value with the mean value of ‘to_messages’ for a Non-POI
df_nonpoi['to_messages'] = df_nonpoi['to_messages'].mean()

df = df_poi.append(df_nonpoi)
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,name,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,...,RICE KENNETH D,174839.0,True,2748364.0,,420636.0,864.0,1995.666667,505050.0,22542539.0
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,...,SKILLING JEFFREY K,22122.0,True,6843672.0,,1111258.0,2042.0,1995.666667,8682716.0,26093672.0
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,...,SHELBY REX,1573324.0,True,869220.0,,211844.0,91.0,1995.666667,2003885.0,2493616.0
3,800000.0,,,,michael.kopper@enron.com,0.0,118134.0,,,,...,KOPPER MICHAEL J,907502.0,True,985032.0,,224305.0,,1995.666667,2652612.0,985032.0
4,1250000.0,,-262500.0,,christopher.calger@enron.com,0.0,35818.0,144.0,199.0,25.0,...,CALGER CHRISTOPHER F,486.0,True,126027.0,,240189.0,2188.0,1995.666667,1639297.0,126027.0


In [70]:
# 檢查 DataFrame 空缺值的狀態

def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data)
    
na_check(df)

Unnamed: 0,Missing Ratio
loan_advances,98.230088
restricted_stock_deferred,91.150442
director_fees,88.495575
deferral_payments,75.221239
deferred_income,69.911504
long_term_incentive,56.637168
bonus,46.017699
from_poi_to_this_person,42.477876
from_messages,42.477876
from_this_person_to_poi,42.477876


In [72]:
#離群值處理
IQR = df.quantile(q=0.75) - df.quantile(q=0.25)
first_quartile = df.quantile(q=0.25)
third_quartile = df.quantile(q=0.75)

IQR

bonus                          550000.00
deferral_payments              757637.75
deferred_income                525620.00
director_fees                   11242.00
exercised_stock_options       1624396.00
expenses                        40341.00
from_messages                     196.00
from_poi_to_this_person            78.00
from_this_person_to_poi            27.00
loan_advances                40562500.00
long_term_incentive            556809.00
other                          361124.00
restricted_stock               697568.50
restricted_stock_deferred      311668.50
salary                          82468.00
shared_receipt_with_poi          1724.00
to_messages                         0.00
total_payments                1662530.00
total_stock_value             2072464.25
dtype: float64

In [78]:
outliers = df[(df>(third_quartile + 1.5*IQR) ) | (df<(first_quartile - 1.5*IQR) )].count(axis=1)
outliers.sort_values(axis=0, ascending=False, inplace=True)
outliers.head(20)

8      9
1      8
33     7
0      6
58     6
47     6
14     5
78     4
75     4
26     4
87     4
12     4
6      4
42     3
21     3
71     3
5      3
86     3
112    3
7      2
dtype: int64

In [80]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bonus,61.0,1147436.0,1505189.0,70000.0,450000.0,750000.0,1000000.0,8000000.0
deferral_payments,28.0,634437.4,860364.6,-102500.0,76567.5,195190.0,834205.2,2964506.0
deferred_income,34.0,-462566.4,809539.2,-3504386.0,-552703.2,-117534.0,-27083.25,-1042.0
director_fees,13.0,89397.85,41143.39,3285.0,101250.0,108579.0,112492.0,125034.0
exercised_stock_options,113.0,2139748.0,5251192.0,0.0,0.0,493489.0,1624396.0,34348380.0
expenses,73.0,51040.55,47596.68,148.0,18834.0,41953.0,59175.0,228763.0
from_messages,65.0,711.3231,2074.498,12.0,19.0,45.0,215.0,14368.0
from_poi_to_this_person,65.0,64.8,91.86321,0.0,10.0,28.0,88.0,528.0
from_this_person_to_poi,65.0,40.09231,88.90141,0.0,0.0,7.0,27.0,411.0
loan_advances,2.0,40962500.0,57364040.0,400000.0,20681250.0,40962500.0,61243750.0,81525000.0


# STEP3:EDA

# STEP4:特徵工程

# STEP5:使用模型

In [11]:
# 將前述轉換完畢資料 df , 重新切成 train_X, test_X
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

In [12]:
print(df.shape)
print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test.shape)

(146, 19)
(113, 19)
(33, 19)
(113,)
(33, 20)


In [14]:
sub = pd.DataFrame({'name': ids, 'poi': y_pred})
sub['poi'] = sub['poi'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('EnronFraud_rf2.csv', index=False)

In [16]:
# check:
sub = pd.read_csv('C:/Users/USER/Desktop/Github/ML100Days/Midtern Exam_Kaggle_Project/EnronFraud_rf2.csv')
sub

Unnamed: 0,name,poi
0,BELDEN TIMOTHY N,0
1,BOWEN JR RAYMOND M,0
2,HANNON KEVIN P,0
3,DELAINEY DAVID W,0
4,CAUSEY RICHARD A,0
5,HICKERSON GARY J,0
6,FREVERT MARK A,1
7,CHAN RONNIE,0
8,DONAHUE JR JEFFREY M,0
9,REYNOLDS LAWRENCE,0


# 0.46428