# Enron accounting scandals analysis

The Enron scandal was one of the largest corporate fraud cases in history, leading to the downfall of the company and the conviction of several executives. The graph below illustrates the significant decline in Enron's stock price between August 2000 and December 2001, providing valuable insights into the company's downfall.

![enron stock price](https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/EnronStockPriceAugust2000toJanuary2001.svg/567px-EnronStockPriceAugust2000toJanuary2001.svg.png)

Some important nodes in Enron networks, implicated or not in the case, are the following:
- kenneth.lay: Kenneth Lay (CEO)
- andrew.fastow: Andrew Fastow (CFO)
- louise.kitchen: Louise Kitchen (Managing Director)
- j.kaminski: Vincent Kaminski (Managing Director for Research, raised strong objections to the financial practices of Andrew Fastow)
- a..shankman: Jeffrey Adam Shankman (Head of the global markets division, charged with White Collard Crime)
- michelle.cash: Michelle Cash (Assistant General Counsel)

In [1]:
import re
import pandas as pd
import altair as alt
import raphtory as rty
from raphtory import algorithms
from email.utils import parsedate_to_datetime, parsedate
from datetime import timezone, datetime
from time import mktime

## Ingesting the data

In [2]:
csv = pd.read_csv('emails.csv')
emails = csv['message']

In [3]:
def extract_cc_list(text):
    cc_cut = text.split("Cc: ")
    
    if len(cc_cut) > 1:
        cc_section = cc_cut[1].split(":")[0]
        cc_emails = re.findall(r'\S+@\S+', cc_section)
        return list(map(lambda email: email.split('@')[0], cc_emails))
    else:
        return 

def extract_bcc_list(text):
    bcc_cut = text.split("Bcc: ")

    if len(bcc_cut) > 1:
        bcc_section = bcc_cut[1].split(":")[0]
        bcc_emails = re.findall(r'\S+@\S+', bcc_section)
        return list(map(lambda email: email.split('@')[0], bcc_emails))
    else:
        return

extract_sender = lambda text: text.split("From: ")[1].split("@")[0]
extract_recipient = lambda text: text.split("To: ")[1].split("@")[0]
extract_date = lambda text: text.split("Date: ")[1].split("\n")[0]

enron = pd.DataFrame()
enron['src'] = emails.apply(extract_sender)
enron['dst'] = emails.apply(extract_recipient)
enron['time'] = emails.apply(extract_date)
enron['type'] = 'direct'
enron['message'] = emails

enron_cc = enron.loc[:, ['src', 'time', 'message']]
enron_cc['dst'] = emails.apply(extract_cc_list)     
enron_cc = enron_cc.explode('dst').dropna()
enron_cc['type'] = 'cc'

enron_bcc = enron.loc[:, ['src', 'time', 'message']]
enron_bcc['dst'] = emails.apply(extract_bcc_list)     
enron_bcc = enron_bcc.explode('dst').dropna()
enron_bcc['type'] = 'bcc'

enron = pd.concat([enron, enron_cc, enron_bcc])
enron

Unnamed: 0,src,dst,time,type,message
0,phillip.allen,tim.belden,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",direct,Message-ID: <18782981.1075855378110.JavaMail.e...
1,phillip.allen,john.lavorato,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",direct,Message-ID: <15464986.1075855378456.JavaMail.e...
2,phillip.allen,leah.arsdall,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",direct,Message-ID: <24216240.1075855687451.JavaMail.e...
3,phillip.allen,randall.gay,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",direct,Message-ID: <13505866.1075863688222.JavaMail.e...
4,phillip.allen,greg.piper,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",direct,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...,...,...,...
517323,john.zufferli,kimberly.hillis,"Wed, 13 Jun 2001 07:19:27 -0700 (PDT)",bcc,Message-ID: <15205398.1075842036502.JavaMail.e...
517335,john.zufferli,rob.milnthorp,"Thu, 24 Jan 2002 14:00:42 -0800 (PST)",bcc,Message-ID: <20351161.1075842028492.JavaMail.e...
517352,john.zufferli,rob.milnthorp,"Fri, 18 Jan 2002 06:52:25 -0800 (PST)",bcc,Message-ID: <12099366.1075842028905.JavaMail.e...
517371,john.zufferli,jzufferli,"Mon, 7 Jan 2002 13:01:08 -0800 (PST)",bcc,Message-ID: <537849.1075842029343.JavaMail.eva...


In [4]:
raw_graph = rty.Graph()
def ingest_edge(record):
    raw_graph.add_edge(record['time'], record['src'], record['dst'], {'message': record['message']}, record['type'])
enron.apply(ingest_edge, axis=1)
g = raw_graph.window(start='2000-01-01 00:00:00', end='2003-01-01 00:00:00')
g

Graph(number_of_edges=156543, number_of_vertices=52864, earliest_time=946684800000, latest_time=1041379200000)

In [5]:
# number of edges per layers:
{layer_name: g.layer(layer_name).num_edges() for layer_name in ['direct', 'cc', 'bcc']}


{'direct': 84070, 'cc': 88297, 'bcc': 83933}

## Analysing the top nodes of the graph

In [6]:
def get_top_n_vertices(g, n):
    vertices = pd.DataFrame()
    vertices['vertex'] = g.vertices()
    vertices['in_degree'] = vertices['vertex'].apply(lambda vertex: vertex.in_degree())
    top_vertices = vertices.sort_values('in_degree', ascending=False).head(n)['vertex'].to_list()
    return top_vertices

w = g.window('2001-09-01 00:00:00', '2002-01-27 00:00:00')
# We add Andrew Fastow in any case because it's one of the most important people in the scandal
top_nodes_in_w = get_top_n_vertices(w, 15) + [w.vertex('andrew.fastow')]
[v.name() for v in top_nodes_in_w]


  vertices['vertex'] = g.vertices()


['kenneth.lay',
 'louise.kitchen',
 'richard.shapiro',
 'michelle.cash',
 'sara.shackleton',
 'd..steffes',
 'rick.buy',
 'j.kaminski',
 'jeff.dasovich',
 'legal <.taylor',
 'j..kean',
 'don.baughman',
 'alewis',
 'a..shankman',
 'sally.beck',
 'andrew.fastow']

In [7]:
events = [
    ['2001-10-22', 'share price collapse'],
    # ['2001-11-08', 'overstated profits'],
    ['2001-12-02', 'bankruptcy'],
]
events = pd.DataFrame(events, columns=['time', 'event'])
event_rules = alt.Chart(events).mark_rule(color='gray').encode(
    x='time:T'
)
labels = alt.Chart(events).mark_text(dx=3, dy=-120, color='black', align='left').encode(
    x='time:T',
    text='event'
)
event_chart = alt.layer(event_rules, labels)

In [8]:
def algo_for_vertex(algo, vertex):
    rolling = vertex.rolling(step='1 day', window='1 week')
    t_index = [datetime.fromtimestamp(w.end() / 1000) for w in rolling]
    algo_result = [algo(w) for w in rolling]
    return pd.Series(algo_result, index=t_index)

def vertices_algo_for_layer(algo, algo_name, vertices, layer):
    algo_result = pd.DataFrame()
    for vertex in vertices:
        algo_result[vertex.name()] = algo_for_vertex(algo, vertex.layer(layer))
    top_names = [vertex.name() for vertex in vertices]
    source = pd.melt(algo_result.reset_index(names='time'), id_vars=['time'], value_vars=top_names, var_name='person', value_name=algo_name)
    selection = alt.selection_multi(fields=['person'], bind='legend')
    chart = alt.Chart(source).mark_line().encode(
        x="time:T",
        y=f"{algo_name}:Q",
        color="person",
        tooltip=['person', algo_name],
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2))
    ).properties(
        width=700,
        title=f'{layer} emails'
    ).add_selection(
        selection
    )
    return alt.layer(chart, event_chart)

In [9]:
alt.vconcat(
    vertices_algo_for_layer(lambda v: v.in_degree(), 'in_degree', top_nodes_in_w, 'direct'),
    vertices_algo_for_layer(lambda v: v.in_degree(), 'in_degree', top_nodes_in_w, 'cc'),
    vertices_algo_for_layer(lambda v: v.in_degree(), 'in_degree', top_nodes_in_w, 'bcc')
)

In [10]:
alt.vconcat(
    vertices_algo_for_layer(lambda v: v.out_degree(), 'out_degree', top_nodes_in_w, 'direct'),
    vertices_algo_for_layer(lambda v: v.out_degree(), 'out_degree', top_nodes_in_w, 'cc'),
    vertices_algo_for_layer(lambda v: v.out_degree(), 'out_degree', top_nodes_in_w, 'bcc')
)

In [11]:
alt.vconcat(
    vertices_algo_for_layer(lambda v: v.in_degree(), 'in_degree', top_nodes_in_w, 'direct'),
    vertices_algo_for_layer(lambda v: v.out_degree(), 'out_degree', top_nodes_in_w, 'direct'),
)

## Analysing the graph globally

In [12]:
def algo_for_all_layers(algo, algo_name, rolling):
    layers = ['direct', 'cc', 'bcc']
    algo_result = pd.DataFrame()
    for layer in layers:
        algo_result[layer] = [algo(w.layer(layer)) for w in rolling]
    algo_result['time'] = [datetime.fromtimestamp(w.end() / 1000) for w in rolling]
    
    source = pd.melt(algo_result, id_vars=['time'], value_vars=layers, var_name='layer', value_name=algo_name)
    selection = alt.selection_multi(fields=['layer'], bind='legend')
    chart = alt.Chart(source).mark_line().encode(
        x="time:T",
        y=f"{algo_name}:Q",
        color="layer",
        tooltip=['layer', algo_name],
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2))
    ).properties(
        width=700,
    ).add_selection(
        selection
    )
    return alt.layer(chart, event_chart)

In [13]:
w = g.window('2001-09-01 00:00:00', '2002-01-27 00:00:00')
rolling = w.rolling(step='1 day', window='1 week')
algo_for_all_layers(algorithms.average_degree, 'average_degree', rolling)

In [14]:
w = g.window('2001-09-01 00:00:00', '2002-01-27 00:00:00')
rolling = w.rolling(step='1 day', window='1 week')
algo_for_all_layers(algorithms.global_reciprocity, 'global_reciprocity', rolling)