# World Bank Database

In [10]:
import pymysql
import re
import pandas as pd
import numpy as np

import src
import env

# Acquire

In [2]:
# Pull the contract data into a DataFrame through the project's helper module.
df = src.get_contract_data()

# Show the first five rows as a quick sanity check of the load.
df.head(5)

Unnamed: 0,As of Date,Fiscal Year,Region,Borrower Country,Borrower Country Code,Project ID,Project Name,Procurement Type,Procurement Category,Procurement Method,...,WB Contract Number,Contract Description,Contract Signing Date,Supplier,Supplier Country,Supplier Country Code,Supplier State,Total Contract Amount (USD),Borrower Contract Reference Number,UN Supplier Flag
0,3/11/2021 0:00,2000,AFE,Angola,AO,P000044,FINANCIAL INSTITUTIO,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1207736,OIL SECTOR STUDY - CONTRCT WITH KPMG,11/20/2000 0:00,KPMG INTERNATIONAL,United Kingdom,GB,Not assigned,800000,KPMG - 11/20/2000,No
1,3/11/2021 0:00,2000,AFE,Angola,AO,P000044,FINANCIAL INSTITUTIO,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1207736,OIL SECTOR STUDY - CONTRCT WITH KPMG,11/20/2000 0:00,KPMG INTERNATIONAL,United Kingdom,GB,Not assigned,800000,KPMG - 11/20/2000,No
2,3/11/2021 0:00,2000,AFE,Madagascar,MG,P052186,MG-Microfinance,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1116602,APPUI AU DEVELOPPEMENT DU RESEAU OTIV (TOAMASINA),9/17/1999 0:00,DEVELOPMENT INTERNA.DESJARDINS,Canada,CA,Not assigned,1964922,1/01/10/99/AGEPMF/DID,No
3,3/11/2021 0:00,2000,AFE,Madagascar,MG,P052186,MG-Microfinance,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1116602,APPUI AU DEVELOPPEMENT DU RESEAU OTIV (TOAMASINA),9/17/1999 0:00,DEVELOPMENT INTERNA.DESJARDINS,Canada,CA,Not assigned,1964922,1/01/10/99/AGEPMF/DID,No
4,3/11/2021 0:00,2000,AFE,South Africa,ZA,P035923,ZA-GEF Cape Penninsula SIL (FY98),Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1116066,"INSTITUTIONAL, LEGAL, POLICY, FINANCIAL, SOCIA...",7/6/1999 0:00,CSIR,South Africa,ZA,Not assigned,178602,WWF-SA-3,No


In [3]:
# Print each column's non-null count and dtype for the freshly loaded frame,
# to spot missing values and columns that still need type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246724 entries, 0 to 246723
Data columns (total 22 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   As of Date                          246724 non-null  object
 1   Fiscal Year                         246724 non-null  int64 
 2   Region                              246724 non-null  object
 3   Borrower Country                    246724 non-null  object
 4   Borrower Country Code               246714 non-null  object
 5   Project ID                          246724 non-null  object
 6   Project Name                        246724 non-null  object
 7   Procurement Type                    246724 non-null  object
 8   Procurement Category                246724 non-null  object
 9   Procurement Method                  246724 non-null  object
 10  Product line                        246724 non-null  object
 11  Major Sector                        246

# Prepare

## Connect to MySQL Database

In [4]:
# Raw PyMySQL connection to the local server; used here only to make sure the
# `worldbank` schema exists and to select it.
# NOTE(review): neither `cnx` nor `cursor` is closed in this notebook — close
# them once no later cell needs them.
cnx = pymysql.connect(
    host='localhost',
    port=3306,
    user=env.user,
    password=env.password,
)

# Second handle from the project helper; the pandas calls below use it as `con`.
# NOTE(review): presumably this targets the `worldbank` database — confirm in src.
thread = src.get_connection(env.user, env.password)

cursor = cnx.cursor()

# Create the database on first run (no-op afterwards), then switch to it.
cursor.execute("CREATE DATABASE IF NOT EXISTS worldbank;")
cursor.execute("USE worldbank;")

0

## Commit to MySQL Database

In [5]:
# Connection smoke test: send only the first five rows to a `test` table,
# replacing the table if it already exists and omitting the DataFrame index.
df.head().to_sql(
    'test',
    con=thread,
    index=False,
    if_exists='replace',
)

## Load data from MySQL Database

In [6]:
# Round-trip check: fetch everything back from the `test` table just written.
pd.read_sql(sql="SELECT * FROM test;", con=thread)

Unnamed: 0,As of Date,Fiscal Year,Region,Borrower Country,Borrower Country Code,Project ID,Project Name,Procurement Type,Procurement Category,Procurement Method,...,WB Contract Number,Contract Description,Contract Signing Date,Supplier,Supplier Country,Supplier Country Code,Supplier State,Total Contract Amount (USD),Borrower Contract Reference Number,UN Supplier Flag
0,3/11/2021 0:00,2000,AFE,Angola,AO,P000044,FINANCIAL INSTITUTIO,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1207736,OIL SECTOR STUDY - CONTRCT WITH KPMG,11/20/2000 0:00,KPMG INTERNATIONAL,United Kingdom,GB,Not assigned,800000,KPMG - 11/20/2000,No
1,3/11/2021 0:00,2000,AFE,Angola,AO,P000044,FINANCIAL INSTITUTIO,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1207736,OIL SECTOR STUDY - CONTRCT WITH KPMG,11/20/2000 0:00,KPMG INTERNATIONAL,United Kingdom,GB,Not assigned,800000,KPMG - 11/20/2000,No
2,3/11/2021 0:00,2000,AFE,Madagascar,MG,P052186,MG-Microfinance,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1116602,APPUI AU DEVELOPPEMENT DU RESEAU OTIV (TOAMASINA),9/17/1999 0:00,DEVELOPMENT INTERNA.DESJARDINS,Canada,CA,Not assigned,1964922,1/01/10/99/AGEPMF/DID,No
3,3/11/2021 0:00,2000,AFE,Madagascar,MG,P052186,MG-Microfinance,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1116602,APPUI AU DEVELOPPEMENT DU RESEAU OTIV (TOAMASINA),9/17/1999 0:00,DEVELOPMENT INTERNA.DESJARDINS,Canada,CA,Not assigned,1964922,1/01/10/99/AGEPMF/DID,No
4,3/11/2021 0:00,2000,AFE,South Africa,ZA,P035923,ZA-GEF Cape Penninsula SIL (FY98),Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,...,1116066,"INSTITUTIONAL, LEGAL, POLICY, FINANCIAL, SOCIA...",7/6/1999 0:00,CSIR,South Africa,ZA,Not assigned,178602,WWF-SA-3,No


## Database Prep
- Clean column names
- Drop columns
- Create separate tables

In [9]:
# Snapshot the current column labels as a plain list so they can be cleaned below.
cols = [*df.columns]
print(cols)

['As of Date', 'Fiscal Year', 'Region', 'Borrower Country', 'Borrower Country Code', 'Project ID', 'Project Name', 'Procurement Type', 'Procurement Category', 'Procurement Method', 'Product line', 'Major Sector', 'WB Contract Number', 'Contract Description', 'Contract Signing Date', 'Supplier', 'Supplier Country', 'Supplier Country Code', 'Supplier State', 'Total Contract Amount (USD)', 'Borrower Contract Reference Number', 'UN Supplier Flag']


In [17]:
# Dry run: show the labels with every space removed, without touching `df` yet.
[label.replace(' ', '') for label in cols]

['AsofDate',
 'FiscalYear',
 'Region',
 'BorrowerCountry',
 'BorrowerCountryCode',
 'ProjectID',
 'ProjectName',
 'ProcurementType',
 'ProcurementCategory',
 'ProcurementMethod',
 'Productline',
 'MajorSector',
 'WBContractNumber',
 'ContractDescription',
 'ContractSigningDate',
 'Supplier',
 'SupplierCountry',
 'SupplierCountryCode',
 'SupplierState',
 'TotalContractAmount(USD)',
 'BorrowerContractReferenceNumber',
 'UNSupplierFlag']

In [18]:
# Apply the space-free labels previewed above to the DataFrame itself.
df.columns = list(map(lambda label: label.replace(' ', ''), cols))

In [20]:
# Re-inspect the frame after the rename: confirms the space-free column labels
# are in place and that non-null counts and dtypes are unchanged.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246724 entries, 0 to 246723
Data columns (total 22 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   AsofDate                         246724 non-null  object
 1   FiscalYear                       246724 non-null  int64 
 2   Region                           246724 non-null  object
 3   BorrowerCountry                  246724 non-null  object
 4   BorrowerCountryCode              246714 non-null  object
 5   ProjectID                        246724 non-null  object
 6   ProjectName                      246724 non-null  object
 7   ProcurementType                  246724 non-null  object
 8   ProcurementCategory              246724 non-null  object
 9   ProcurementMethod                246724 non-null  object
 10  Productline                      246724 non-null  object
 11  MajorSector                      246724 non-null  object
 12  WBContractNumber

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows a negative minimum for
# TotalContractAmount(USD) (-143) — worth checking whether that is valid data.
df.describe()

Unnamed: 0,FiscalYear,WBContractNumber,TotalContractAmount(USD)
count,246724.0,246724.0,246724.0
mean,2007.633372,1315400.0,997496.0
std,5.62728,129722.8,8679100.0
min,2000.0,1100552.0,-143.0
25%,2002.0,1225155.0,28225.0
50%,2007.0,1268570.0,95827.5
75%,2012.0,1325667.0,355224.2
max,2021.0,1648433.0,1956748000.0


In [25]:
# Profile the object (string) columns: one row per column, ordered so the most
# complete columns with the most dominant top value come first.
(
    df.describe(include='O')
    .T
    .sort_values(by=['count', 'freq'], ascending=False)
)

Unnamed: 0,count,unique,top,freq
AsofDate,246724,1,3/11/2021 0:00,246724
UNSupplierFlag,246724,2,No,244945
SupplierState,246724,51,Not assigned,241333
Productline,246724,18,IBRD/IDA,217662
ProcurementCategory,246724,4,CONSULTANT SERVICES,123814
MajorSector,246724,21,Public Admin,55922
ProcurementMethod,246724,30,International Competitive Bidding,53797
Region,246724,9,LCR,49902
ProcurementType,246724,68,Management /Technical Advice,27525
SupplierCountry,246724,213,India,14040


In [30]:
# `AsofDate` holds a single value for every row (see the object-column profile
# above), so it carries no information and is dropped.
# Non-inplace with errors='ignore' so the cell is safe to re-run: the original
# inplace drop raised KeyError on a second execution because the column was gone.
df = df.drop(columns=['AsofDate'], errors='ignore')

In [35]:
# Convert the signing date from text to datetime64 so it can be filtered and
# sorted chronologically.
df['ContractSigningDate'] = pd.to_datetime(df['ContractSigningDate'])

In [39]:
# Encode the flag as a boolean: 'No' -> False, any other value -> True.
# The dtype guard keeps the cell idempotent. Without it, a second run compares
# the already-boolean column against the string 'No', finds no matches, and
# silently turns every row into True.
if not pd.api.types.is_bool_dtype(df['UNSupplierFlag']):
    df['UNSupplierFlag'] = df['UNSupplierFlag'] != 'No'

In [43]:
# The 20 most frequent borrower reference numbers; surfaces placeholder values
# that are shared by many contracts.
(
    df['BorrowerContractReferenceNumber']
    .value_counts()
    .nlargest(20)
)

#                                   8390
UNKNOWN                              304
NONE                                 165
1                                    110
CONSULTING SERVICES                   91
AF MHSIP-7819-ME-CS-IC-SSS-10- B      88
RS-DILS-7510YF-CS-IC-09-C.2.5.9.      81
BIRF-7969-PE                          72
2                                     65
PIU CONSULTANTS                       61
WBR NO. 4                             54
WBR NO. 5                             52
WBR NO. 1                             50
WBR NO. 16                            49
3                                     49
WBR NO. 22                            48
PROCUREMENT SPECIALIST                48
WBR NO. 15                            46
WBR NO. 17                            46
SRB-PARIP-4071YF-IC-CS-07-17-DB-      45
Name: BorrowerContractReferenceNumber, dtype: int64

In [44]:
# Inspect the contracts whose reference number is the bare '#' placeholder.
df.loc[df['BorrowerContractReferenceNumber'] == '#']

Unnamed: 0,FiscalYear,Region,BorrowerCountry,BorrowerCountryCode,ProjectID,ProjectName,ProcurementType,ProcurementCategory,ProcurementMethod,Productline,...,WBContractNumber,ContractDescription,ContractSigningDate,Supplier,SupplierCountry,SupplierCountryCode,SupplierState,TotalContractAmount(USD),BorrowerContractReferenceNumber,UNSupplierFlag
4751,2000,AFE,Mozambique,MZ,P001797,CAPACITY BUILDING HUMAN DEV. PROJECT,Implementation Activity,CONSULTANT SERVICES,Quality And Cost-Based Selection,IBRD/IDA,...,1117942,TWINNING ARRANGEMENT - ECONOMICS FACULTY AT UE...,NaT,INSTITUTO SUPERIOR TECNICO,Portugal,PT,Not assigned,1500000,#,False
72662,2003,EAP,Vietnam,VN,P052037,VN-HCMC ENVMTL SANIT.,Construction Supervision,CONSULTANT SERVICES,Quality And Cost-Based Selection,IBRD/IDA,...,1234270,Package 2: CONSTRUCTION MANAGEMENT FOR NHIEU L...,2003-08-15,CAMP DRESSER AND MCKEE INTERNATIONAL INC.,United States,US,Massachusetts,14381140,#,False
94178,2005,LCR,Brazil,BR,P094715,BR GEF National Biod Mainstreaming,Miscellaneous,GOODS,National Competitive Bidding,GEF,...,1292076,"Provide airtickets to the Project, which inclu...",2004-08-10,SOLID VIAGENS E TURISMO,Brazil,BR,Not assigned,720131,#,False
103773,2005,AFE,Kenya,KE,P083250,Financial & Legal Sec TA,Project Management,CONSULTANT SERVICES,Consultant Qualification Selection,IBRD/IDA,...,1292834,To Assist the Financing of the Government's Mi...,2006-12-18,BUSINESS PLAN INTERNATIONAL (KENYA),Kenya,KE,Not assigned,2500000,#,False
104328,2005,AFE,Mozambique,MZ,P086169,MZ-Financial Sector TA Project,Management /Technical Advice,CONSULTANT SERVICES,Individual Consultant Selection,IBRD/IDA,...,1312715,Executive Secretary,2005-05-03,JULIA TEMBE,Mozambique,MZ,Not assigned,87764,#,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237805,2018,MNA,"Egypt, Arab Rep",EG,P113416,EG-Wind Power Development,"Equipment, Electrical",GOODS,International Competitive Bidding,IBRD/IDA,...,1561888,Supplying of thermal conductors with all acces...,2018-05-06,MIDAL CABLES LIMITED,Bahrain,BH,Not assigned,10106675,#,False
238137,2018,MNA,"Egypt, Arab Rep",EG,P113416,EG-Wind Power Development,"Equipment, Electrical",GOODS,International Competitive Bidding,IBRD/IDA,...,1561854,"Design, manufacture, deliver, install, complet...",2018-05-10,LARSEN & TOUBRO LIMITED,India,IN,Not assigned,19898970,#,False
238174,2018,MNA,"Egypt, Arab Rep",EG,P100047,EG-Ain Sokhna Power,"Equipment, Electrical",GOODS,International Competitive Bidding,IBRD/IDA,...,1558342,"engineering, designing, manufacture, factory t...",2018-02-08,LARSEN & TOUBRO LIMITED,India,IN,Not assigned,28787916,#,False
238405,2018,LCR,Haiti,HT,P127203,HT Rebuilding Energy Infrastr & Access,Construction Supervision,CONSULTANT SERVICES,Quality And Cost-Based Selection,IBRD/IDA,...,1557133,Consultant pour la mission de rehabilitation d...,2018-03-28,GROUPEMENT ZECO-ECCOMAR,Italy,IT,Not assigned,7939259,#,False


In [None]:
# MySQL DDL for the `test` table, kept for reference while designing the
# cleaned schema. It is stored as a string because raw SQL in a Python code
# cell is a SyntaxError and would break Restart & Run All.
# NOTE(review): presumably the SHOW CREATE TABLE output for the table written
# by the to_sql smoke test — confirm. The column names still contain spaces,
# and the date and flag columns are plain `text`.
TEST_TABLE_DDL = """
CREATE TABLE `test` (
  `As of Date` text,
  `Fiscal Year` bigint DEFAULT NULL,
  `Region` text,
  `Borrower Country` text,
  `Borrower Country Code` text,
  `Project ID` text,
  `Project Name` text,
  `Procurement Type` text,
  `Procurement Category` text,
  `Procurement Method` text,
  `Product line` text,
  `Major Sector` text,
  `WB Contract Number` bigint DEFAULT NULL,
  `Contract Description` text,
  `Contract Signing Date` text,
  `Supplier` text,
  `Supplier Country` text,
  `Supplier Country Code` text,
  `Supplier State` text,
  `Total Contract Amount (USD)` bigint DEFAULT NULL,
  `Borrower Contract Reference Number` text,
  `UN Supplier Flag` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
"""