# PPE Contracts, PSCs and Companies Analysis Notebook

In [None]:
# import dependencies
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Set matplotlib's default font size for every figure drawn in this notebook.
plt.rcParams.update({'font.size': '16'})

In [None]:
# Ingest all data and concatenate partitions.
# Each partition is read as JSON-lines: one JSON document per line.
# NOTE(review): the partition paths are empty strings in this copy of the
# notebook - fill in the real file paths before running.
partition_paths = ['', '', '', '']

pscs = []
for partition_path in partition_paths:
    # The context manager closes each partition as soon as it has been read,
    # instead of leaving four file handles open for the life of the kernel.
    with open(partition_path, 'r') as partition_file:
        pscs.extend(json.loads(line) for line in partition_file)

pscs

In [None]:
# Load the parsed records into a dataframe and discard the 'window' column,
# which is empty.
pscRawDataframe = pd.DataFrame.from_dict(pscs).drop(columns=['window'])
pscRawDataframe

In [None]:
# Expand every entry of the 'row' column into its four named columns, then
# discard any row that has a missing value.
row_columns = ['psc', 'time-company-share', 'other_pscs', 'company_number']
row_values = pscRawDataframe['row'].tolist()
splitUpDataframe = pd.DataFrame(row_values, columns=row_columns)
cleanDataframe = splitUpDataframe.dropna()
cleanDataframe

In [None]:
# Flatten the 'other_pscs' dictionaries: emit one output row per
# (key, value) entry, repeating the parent row's other fields.
new_rows = [
    {
        'psc': record['psc'],
        'time-company-share': record['time-company-share'],
        'company_number': record['company_number'],
        'other_psc': other_psc,
        'other_psc_count': other_psc_count,
    }
    for _, record in cleanDataframe.iterrows()
    for other_psc, other_psc_count in record['other_pscs'].items()
]
pscdf = pd.DataFrame(new_rows)

In [None]:
# company_number holds a list-like value per row; explode it so that each
# element gets a row of its own.
pscdf = pscdf.explode(column='company_number')

In [None]:
# Extract the time-company-share column into separate 'time' and
# 'share_ownership' columns.
# Every entry of the column is split on "-" into (time, company, share).
# Only the triplet whose company equals the row's company_number describes
# that row, so only matching triplets produce an output row. Emitting a row
# for every triplet would pair this company's time with the share recorded
# for a different company.
final_columns = [
    'psc',
    'time',
    'company_number',
    'other_psc',
    'share_ownership',
    'other_psc_count',
]
new_rows = []

for _, row in pscdf.iterrows():
    company_number = row['company_number']
    for triplet in row['time-company-share']:
        timeA, companyA, shareA = triplet.split("-")
        if companyA != company_number:
            # Triplet belongs to another company; skip it.
            continue
        new_rows.append({
            'psc': row['psc'],
            'time': timeA,
            'company_number': company_number,
            'other_psc': row['other_psc'],
            'share_ownership': shareA,
            'other_psc_count': row['other_psc_count'],
        })

# Passing the column list keeps the schema intact even when nothing matched.
finaldf = pd.DataFrame(new_rows, columns=final_columns).drop_duplicates()
finaldf

In [None]:
# Keep only the rows in which every column holds a value.
row_is_complete = finaldf.notna().all(axis=1)
finaldf = finaldf[row_is_complete]
finaldf

In [None]:
# Remove duplicates regardless of order: sort the identifying values within
# each row so that rows holding the same values in a different column order
# produce the same key, then keep only the first row for every key.
# Requires numpy (`np`), which the import cell must provide.
identity_columns = ['psc', 'time', 'company_number', 'other_psc']
order_insensitive_keys = pd.DataFrame(
    np.sort(finaldf[identity_columns], axis=1),
    index=finaldf.index,
)
# .copy() yields an independent frame rather than a slice of finaldf.
sortedFinalDf = finaldf[~order_insensitive_keys.duplicated()].copy()
sortedFinalDf

In [None]:
# Keep the rows whose company_number occurs on more than 5 rows.
# The de-duplicated frame is the input here (the name `df` was never defined
# in this notebook).
# NOTE(review): the original heading read "PSCs with greater than 5
# companies"; if that is the intent, group by 'psc' instead - confirm.
rows_per_company = sortedFinalDf.groupby("company_number")['psc'].transform('size')
greaterThanFiveDf = sortedFinalDf[rows_per_company > 5].copy()
greaterThanFiveDf

In [None]:
# Keep the rows whose company_number occurs on more than 1 row.
# The de-duplicated frame is the input here (the name `df` was never defined
# in this notebook).
# NOTE(review): the original heading read "PSCs with greater than 1
# company"; if that is the intent, group by 'psc' instead - confirm.
rows_per_company = sortedFinalDf.groupby("company_number")['psc'].transform('size')
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not trigger pandas' chained-assignment warning.
greaterThanOneDf = sortedFinalDf[rows_per_company > 1].copy()
greaterThanOneDf

In [None]:
# Interpret 'time' as epoch milliseconds, mark the result as UTC and then
# express it in the Europe/London time zone.
naive_timestamps = pd.to_datetime(greaterThanOneDf['time'], unit='ms')
utc_timestamps = naive_timestamps.dt.tz_localize('UTC')
greaterThanOneDf['time'] = utc_timestamps.dt.tz_convert('Europe/London')

In [None]:
# Store only the calendar date of each timestamp in a new 'date' column.
timestamps = pd.to_datetime(greaterThanOneDf['time'])
greaterThanOneDf['date'] = timestamps.dt.date

In [None]:
# The visualisation needs neither 'other_psc_count' nor 'time', so remove
# both columns.
unused_columns = ['other_psc_count', 'time']
greaterThanOneDf = greaterThanOneDf.drop(columns=unused_columns)
greaterThanOneDf

In [None]:
# Store the 'date' column as strings.
date_as_text = greaterThanOneDf['date'].astype(str)
greaterThanOneDf['date'] = date_as_text

In [None]:
greaterThanOneDf['company_number'] = greaterThanOneDf['company_number'].replace(['SC399884'], 'Pursuit Marketing Limited')

In [None]:
# Using pyvis to visualise a psc to company network graph. Each edge carries
# a tooltip with the share ownership and the date, and the share ownership is
# passed as the edge `value`, which pyvis uses to scale edge thickness.

from pyvis.network import Network

# Node images: one for both kinds of PSC node, one for company nodes.
PSC_ICON_URL = "https://www.seekpng.com/png/small/846-8465978_critres-de-choix-pour-la-solution-de-cration.png"
COMPANY_ICON_URL = "https://cdn-icons-png.flaticon.com/512/2083/2083417.png"
EDGE_COLOR = '#F6E1D3'

# 'white' is a valid CSS colour name; the previous value '#white' is not.
nt = Network('800px', '800px', bgcolor='white', font_color='black')

# Walk the five columns in lockstep. The columns are read directly so that
# the notebook-level `pscs` list from the ingest cell is not overwritten.
edge_data = zip(
    greaterThanOneDf['psc'],
    greaterThanOneDf['other_psc'],
    greaterThanOneDf['share_ownership'],
    greaterThanOneDf['company_number'],
    greaterThanOneDf['date'],
)

for src, dst, w, newdst, notified_on in edge_data:
    edge_title = f'Ownership: {w}%, Notified On: {notified_on}'

    nt.add_node(src, src, title=src, shape='image', image=PSC_ICON_URL)
    nt.add_node(dst, dst, title=dst, shape='image', image=PSC_ICON_URL)
    nt.add_node(newdst, newdst, title=newdst, shape='image', image=COMPANY_ICON_URL)
    # Both the psc and the other psc of the row are linked to the company.
    nt.add_edge(src, newdst, value=w, title=edge_title, color=EDGE_COLOR)
    nt.add_edge(dst, newdst, value=w, title=edge_title, color=EDGE_COLOR)

nt.show_buttons(filter_=['physics'])
nt.show('nx.html')

In [None]:
# Display the generated 'nx.html' page inside the notebook output.
# NOTE(review): IFrame is not imported in any visible cell; it presumably
# comes from IPython.display - confirm and add the import.
IFrame(src='nx.html', width=1000, height=1000)