# Contract Analysis

## Data Download

We use the Cook County, Illinois "Procurement - Awarded Contracts & Amendments" dataset, available [here](https://catalog.data.gov/dataset/procurement-awarded-contracts-amendments). A local copy of the CSV file is saved in this repository.

We focus solely on Environmental Services to get a realistic number of contracts with amendments (25 total contracts with numerous amendments).

In [0]:
import pandas as pd
# Load the procurement dataset into a DataFrame from a Parquet file
# (path is relative to the notebook's working directory).
# NOTE(review): the markdown above mentions a local CSV copy, while this
# reads a .parquet file -- confirm which one is the current source.
df = pd.read_parquet("cook_county_contracts.parquet")

In [0]:
# Build one boolean mask per commodity type (exact string match on the
# "Commodity Type" column), then slice the full dataset with each mask.
commodity_type = df["Commodity Type"]
is_environmental = commodity_type == "Environmental Services"
is_mining = commodity_type == "Mining and Well Drilling Machinery and Accessories"

env_df = df[is_environmental]
mining_df = df[is_mining]

# Render the Environmental Services subset in the notebook output.
display(env_df)

In [0]:
# Notebook parameters, declared as (widget name, default value, label).
widget_specs = (
    ("catalog", "shm", "Catalog"),
    ("schema", "contract", "Schema"),
    ("volume_name", "raw", "Volume Name"),
)
for widget_name, default_value, widget_label in widget_specs:
    dbutils.widgets.text(widget_name, default_value, widget_label)

# Read the current widget values back; they are used to build the output path.
catalog, schema, volume = (
    dbutils.widgets.get(widget_name) for widget_name, _, _ in widget_specs
)

In [0]:
import re
import os
from pathlib import Path

import subprocess  # imported in this cell so the cell stays runnable on its own

# Destination root for the downloaded contract documents.
base_output_dir = f"/Volumes/{catalog}/{schema}/{volume}"
Path(base_output_dir).mkdir(parents=True, exist_ok=True)

downloaded = failed = 0

# For every Environmental Services row that has both a category and a vendor,
# pull the first http(s) URL out of 'Category' and download it into a
# per-vendor folder. Rows whose 'Category' holds no URL are skipped silently.
for _, row in env_df[['Category', 'Vendor Name']].dropna(subset=['Category', 'Vendor Name']).iterrows():
    category = row['Category']
    match = re.search(r'(https?://[^\s\)]+)', str(category))
    if not match:
        continue

    url = match.group(1)
    # Folder name: upper-cased vendor name with everything except A-Z and
    # spaces stripped out.
    vendor = re.sub(r'[^A-Z ]', '', str(row['Vendor Name']).upper())
    filename = url.split("/")[-1]
    vendor_dir = os.path.join(base_output_dir, vendor)
    Path(vendor_dir).mkdir(parents=True, exist_ok=True)
    output_path = os.path.join(vendor_dir, filename)

    # Run wget with an argument list (no shell): the URL comes from the
    # dataset, and quotes/backticks in it must never be parsed as shell syntax.
    try:
        succeeded = subprocess.run(
            ["wget", "-q", "-O", output_path, url], check=False
        ).returncode == 0
    except OSError:
        # wget could not be launched at all; count it as a failed download
        # instead of aborting the whole loop.
        succeeded = False

    if succeeded:
        downloaded += 1
    else:
        failed += 1
        # `wget -O` creates/truncates the target before fetching, so a failed
        # download leaves an unusable file behind; remove it.
        if os.path.isfile(output_path):
            os.remove(output_path)

print(f"\nSummary: {downloaded} files downloaded, {failed} failed")

In [0]:
import subprocess  # imported in this cell so the cell stays runnable on its own

downloaded = failed = 0
# Mining contracts are skipped for now; set this to True to download them too.
load_mining_data = False
if load_mining_data:
    # For every mining row that has both a category and a vendor, pull the
    # first http(s) URL out of 'Category' and download it into a per-vendor
    # folder. Rows whose 'Category' holds no URL are skipped silently.
    for _, row in mining_df[['Category', 'Vendor Name']].dropna(subset=['Category', 'Vendor Name']).iterrows():
        category = row['Category']
        match = re.search(r'(https?://[^\s\)]+)', str(category))
        if not match:
            continue

        url = match.group(1)
        # Folder name: upper-cased vendor name with everything except A-Z and
        # spaces stripped out.
        vendor = re.sub(r'[^A-Z ]', '', str(row['Vendor Name']).upper())
        filename = url.split("/")[-1]
        vendor_dir = os.path.join(base_output_dir, vendor)
        Path(vendor_dir).mkdir(parents=True, exist_ok=True)
        output_path = os.path.join(vendor_dir, filename)

        # Run wget with an argument list (no shell): the URL comes from the
        # dataset, and quotes/backticks in it must never be parsed as shell
        # syntax.
        try:
            succeeded = subprocess.run(
                ["wget", "-q", "-O", output_path, url], check=False
            ).returncode == 0
        except OSError:
            # wget could not be launched at all; count it as a failed
            # download instead of aborting the whole loop.
            succeeded = False

        if succeeded:
            downloaded += 1
        else:
            failed += 1
            # `wget -O` creates/truncates the target before fetching, so a
            # failed download leaves an unusable file behind; remove it.
            if os.path.isfile(output_path):
                os.remove(output_path)

    print(f"\nSummary: {downloaded} files downloaded, {failed} failed")