In [11]:
import bs4,pandas,os,random,time,requests

SOAP_URL = "https://cgmix.uscg.mil/xml/PSIXData.asmx"
SOAP_HEADERS = {
    'Content-Type': 'text/xml; charset=utf-8',
}

LOAD_VESSELS = "data/source/IIR/IIR_vessels.csv"

DEFICIENCIES_SAVE_FILE = "data/source/PSIX/PSIX_deficiencies.json"
TONNAGE_SAVE_FILE = "data/source/PSIX/PSIX_tons.csv"
PARTICULARS_SAVE_FILE = "data/source/PSIX/PSIX_particulars.csv"

def find_item_in_soup(soup,item):
    return soup.find(item).text if soup.find(item) else None

def limit_to_unprocessed_records(df,savefile,key):
    if os.path.exists(savefile):
        current_savefile = pandas.read_csv(savefile)
        print("primary_vessel_id" in df.columns)
        return df[~df[key].isin(current_savefile[key])]
    else:
        return df

def save_csv(activity_id,savefile,fn):
    time.sleep(random.gauss(1,.1))
    try: 
      summary = fn(activity_id)
      df = pandas.DataFrame(summary)
      if os.path.exists(savefile):
          df.to_csv(savefile,mode="a",header=False,index=False)
      else:
          df.to_csv(savefile,mode="w",header=True,index=False)
    except:
      print(f"Failed to get data for activity {activity_id}")

def construct_deficiencies_payload(activity_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
        <getVesselDeficiencies xmlns="https://cgmix.uscg.mil">
        <ActivityNumber>{activity_id}</ActivityNumber>
        </getVesselDeficiencies>
    </soap:Body>
    </soap:Envelope>"""

def construct_tonnage_payload(vessel_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
        <getVesselTonnage xmlns="https://cgmix.uscg.mil">
        <VesselID>{vessel_id}</VesselID>
        </getVesselTonnage>
    </soap:Body>
    </soap:Envelope>"""

def construct_particulars_payload(vessel_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
        <getVesselParticulars xmlns="https://cgmix.uscg.mil">
        <VesselID>{vessel_id}</VesselID>
        </getVesselParticulars>
    </soap:Body>
    </soap:Envelope>"""

def get_tonnage(vessel_id):
    payload = construct_tonnage_payload(vessel_id)
    response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
    soup = bs4.BeautifulSoup(response.text,features="xml")
    tons = []
    for tonnage in soup.findAll("VesselTonnage"):
        measure_of_weight = find_item_in_soup(tonnage, "MeasureOfWeight")
        tonnage_type_id = find_item_in_soup(tonnage,"TonnageTypeLookupId")
        tonnage_type_name = find_item_in_soup(tonnage,"TonnageTypeLookupName")
        unit_of_measure_id = find_item_in_soup(tonnage,"TonnageUnitOfMeasurementFilterLookupId")
        unit_of_measure_name = find_item_in_soup(tonnage,"UnitOfMeasureLookupName")
        vessel_id = vessel_id
        tons.append({
            "measure_of_weight":measure_of_weight,
            "tonnage_type_id":tonnage_type_id,
            "tonnage_type_name":tonnage_type_name,
            "unit_of_measure_id":unit_of_measure_id,
            "unit_of_measure_name":unit_of_measure_name,
            "primary_vessel_id":vessel_id
        })
    return tons

def get_particulars(vessel_id):
    payload = construct_particulars_payload(vessel_id)
    response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
    soup = bs4.BeautifulSoup(response.text,features="xml")
    vessel_particulars = soup.find_all("VesselParticulars")
    particulars = []
    for v in vessel_particulars:
        name = find_item_in_soup(v,"VesselName")
        call_sign = find_item_in_soup(v,"VesselCallSign")
        service_type = find_item_in_soup(v,"ServiceType")
        service_sub = find_item_in_soup(v,"ServiceSubType")
        status_lookup = find_item_in_soup(v,"StatusLookupName")
        construct_complete_year = find_item_in_soup(v,"ConstructionCompletedYear")
        out_of_service_date = find_item_in_soup(v,"OutOfServiceDate")
        identification = find_item_in_soup(v,"Identification")
        id_type_id = find_item_in_soup(v,"IdentificationTypeLookupId")
        id_type_name = find_item_in_soup(v,"IdentificationTypeLookupName")
        country = find_item_in_soup(v,"CountryLookupName")
        country_id = find_item_in_soup(v,"CountryLookupId")
        particulars.append({"primary_vessel_id":vessel_id,
                            "vessel_name":name,
                            "call_sign":call_sign,
                            "service_type":service_type,
                            "service_sub":service_sub,
                            "status_lookup":status_lookup,
                            "construct_complete_year":construct_complete_year,
                            "out_of_service_date":out_of_service_date,
                            "identification":identification,
                            "id_type_id":id_type_id,
                            "id_type_name":id_type_name,
                            "country":country,
                            "country_id":country_id})
    return particulars
    
# def get_deficiencies(activity_id):
#     payload = construct_tonnage_payload(activity_id)
#     response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
#     soup = bs4.BeautifulSoup(response.text,features="xml")

vessels = pandas.read_csv(LOAD_VESSELS)
#tonnage_vessels_to_process = limit_to_unprocessed_records(vessels,TONNAGE_SAVE_FILE,"primary_vessel_id")
#tonnage_vessels_to_process["primary_vessel_id"].apply(save_csv,args=(TONNAGE_SAVE_FILE,get_tonnage))

particulars_vessels_to_process = limit_to_unprocessed_records(vessels,PARTICULARS_SAVE_FILE,"primary_vessel_id")
particulars_vessels_to_process["primary_vessel_id"].apply(save_csv,args=(PARTICULARS_SAVE_FILE,get_particulars))

Failed to get data for activity 517731
Failed to get data for activity 653127
Failed to get data for activity 512141


0       None
1       None
2       None
3       None
4       None
        ... 
9072    None
9073    None
9074    None
9075    None
9076    None
Name: primary_vessel_id, Length: 9077, dtype: object

In [7]:
vessels["primary_vessel_id"]

0         240646
1         275040
2        8912118
3       FL6036JR
4            NaN
          ...   
9072      537733
9073     1331497
9074     1119334
9075      536311
9076      560090
Name: primary_vessel_id, Length: 9077, dtype: object

In [10]:
pandas.read_csv(PARTICULARS_SAVE_FILE)

Unnamed: 0,vessel_name,call_sign,service_type,service_sub,status_lookup,construct_complete_year,out_of_service_date,identification,id_type_id,id_type_name,country,country_id
0,EPIPHANY,,Recreational,General,Active,1984,,673207,6,Official Number (U.S.),UNITED STATES,176948
1,IDLE TIME,,Recreational,General,Active,1986,,902747,6,Official Number (U.S.),UNITED STATES,176948


In [49]:
vessels = pandas.read_csv(LOAD_VESSELS)

requests.request("POST", SOAP_URL, data=construct_tonnage_payload(240646), headers=SOAP_HEADERS).text

'<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema"><soap:Body><getVesselTonnageResponse xmlns="https://cgmix.uscg.mil"><getVesselTonnageResult><xs:schema id="NewDataSet" xmlns="" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata"><xs:element name="NewDataSet" msdata:IsDataSet="true" msdata:UseCurrentLocale="true"><xs:complexType><xs:choice minOccurs="0" maxOccurs="unbounded"><xs:element name="VesselTonnage"><xs:complexType><xs:sequence><xs:element name="VesselId" type="xs:int" minOccurs="0" /><xs:element name="MeasureOfWeight" type="xs:int" minOccurs="0" /><xs:element name="TonnageTypeLookupId" type="xs:int" minOccurs="0" /><xs:element name="TonnageTypeLookupName" type="xs:string" minOccurs="0" /><xs:element name="TonnageUnitOfMeasurementFilterLookupId" type="xs:int" minOccurs="0" /><xs:

In [20]:
temp = activities.sample(100)
temp["xml"] = temp["activity_id"].apply(lambda x: requests.request("POST", SOAP_URL, data=construct_deficiencies_payload(x), headers=SOAP_HEADERS).text)
temp

Unnamed: 0,activity_id,title,start_dt,end_dt,page,xml
3155,7067287,SATURN Allision,9/28/2020,5/11/2021,316,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
3073,6934644,ITV JASON W. NYBERG/Vicksburg RR Bridge - Alli...,4/2/2020,5/18/2020,308,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
1684,3850436,UTV ISLAND PROGRESS allision w/Wappo Bridge,9/15/2010,1/27/2011,169,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
832,2893038,"M/V Martha Mac, Allision (Pekin RR Bridge)",3/18/2007,3/26/2009,84,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
2687,6294042,Barge CODY LIZ Allision w/ Bayou Boeuf RR Bridge,10/22/2017,1/10/2018,269,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
...,...,...,...,...,...,...
2094,4515746,GLADYS FORD / Allision,1/7/2013,1/16/2015,210,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
795,2868334,UTV BRIDGE ALLISION,1/30/2007,3/24/2009,80,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
2979,6821021,ITV JOHN VAUGHN/Bridge Allision,9/25/2019,11/12/2019,298,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."
2153,4589867,M/V HERBERT C. JACKSON / Allision,5/12/2013,1/9/2015,216,"<?xml version=""1.0"" encoding=""utf-8""?><soap:En..."


In [21]:
temp["len_xml"] = temp["xml"].apply(len)
temp[temp["len_xml"]!=1649]

Unnamed: 0,activity_id,title,start_dt,end_dt,page,xml,len_xml


In [11]:
import bs4,pandas,math,os,random,time,requests
SOAP_URL = "https://cgmix.uscg.mil/xml/IIRData.asmx"
SOAP_HEADERS = {
    'Content-Type': 'text/xml; charset=utf-8',
}

CASUALTIES_SAVE_FILE = "data/source/IIR/IIR_casualties.json"
WATERSEG_SAVE_FILE = "data/source/IIR/IIR_water_segments.csv"
BRIEF_SAVE_FILE = "data/source/IIR/IIR_briefs.csv"
DAMAGES_SAVE_FILE = "data/source/IIR/IIR_damages.csv"


def find_item_in_soup(soup,item):
    return soup.find(item).text if soup.find(item) else None

def limit_to_unprocessed_records(df,savefile):
    if os.path.exists(savefile):
        current_savefile = pandas.read_csv(savefile)
        return df[~df["activity_id"].isin(current_savefile["activity_id"])]
    else:
        return df

def save_csv(activity_id,savefile,fn):
    time.sleep(random.gauss(1,.1))
    try: 
      summary = fn(activity_id)
      df = pandas.DataFrame([summary])
      if os.path.exists(savefile):
          df.to_csv(savefile,mode="a",header=False,index=False)
      else:
          df.to_csv(savefile,mode="w",header=True,index=False)
    except:
      print(f"Failed to get data for activity {activity_id}")

def save_json(activity_id,savefile,fn):
    time.sleep(random.gauss(1,.1))
    try: 
      summary = fn(activity_id)
      df = pandas.DataFrame([summary])
      if os.path.exists(savefile):
          df.to_json(savefile,mode="a",lines=True,orient="records")
      else:
          df.to_json(savefile,mode="w",lines=True,orient="records")
    except:
      print(f"Failed to get data for activity {activity_id}")

def construct_water_segment_payload(activity_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
        <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
        <soap:Body>
            <getIIRWaterSegments xmlns="https://cgmix.uscg.mil/xml/">
            <ActivityId>{activity_id}</ActivityId>
            </getIIRWaterSegments>
        </soap:Body>
        </soap:Envelope>"""

def construct_brief_payload(activity_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
      <soap:Body>
        <getIIRIncidentBrief xmlns="https://cgmix.uscg.mil/xml/">
          <ActivityId>{activity_id}</ActivityId>
        </getIIRIncidentBrief>
      </soap:Body>
    </soap:Envelope>"""

def construct_damages_payload(activity_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
            <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
            <soap:Body>
                <getIIRVesselDamageSummary xmlns="https://cgmix.uscg.mil/xml/">
                <ActivityId>{activity_id}</ActivityId>
                </getIIRVesselDamageSummary>
            </soap:Body>
            </soap:Envelope>"""

def construct_casualty_payload(activity_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
        <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
        <soap:Body>
            <getIIRPersonalCasualtySummary xmlns="https://cgmix.uscg.mil/xml/">
            <ActivityId>{str(activity_id)}</ActivityId>
            </getIIRPersonalCasualtySummary>
        </soap:Body>
        </soap:Envelope>"""

def get_brief(activity_id):
    payload = construct_brief_payload(activity_id)
    response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
    brief = bs4.BeautifulSoup(response.text,features="xml").find("IncidentBrief").text
    return {"activity_id":activity_id,"brief":brief}

def get_waterseg(activity_id):
    payload = construct_water_segment_payload(activity_id)
    response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
    soup = bs4.BeautifulSoup(response.text,features="xml")
    segments = []
    for segment in soup.find_all("IIRWaterSegments"):
        waterway_role = find_item_in_soup(segment,"WaterwayRoleLookupName")
        description = find_item_in_soup(segment,"Description")
        lat = find_item_in_soup(segment,"Latitude")
        long = find_item_in_soup(segment,"Longitude")
        waterway_name = find_item_in_soup(segment,"WaterwayName")
        seg_dict = {"activity_id":activity_id,
            "waterway_role":waterway_role,
            "description":description,
            "lat":lat,
            "long":long,
            "waterway_name":waterway_name}
        segments.append(seg_dict)
    return segments

def get_damages(activity_id):
    payload = construct_damages_payload(activity_id)
    response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
    soup = bs4.BeautifulSoup(response.text,features="xml")
    cargo_damage = find_item_in_soup(soup,"CargoDamageInDollars")
    facility_damage = find_item_in_soup(soup,"FacilityDamageInDollars")
    vessel_damage = find_item_in_soup(soup,"VesselDamageInDollars")
    other_damage = find_item_in_soup(soup,"OtherDamageInDollars")
    return {"activity_id":activity_id,
            "cargo_damage":cargo_damage,
            "facility_damage":facility_damage,
            "vessel_damage":vessel_damage,
            "other_damage":other_damage}

def get_casualties(activity_id):
    payload = construct_casualty_payload(activity_id)
    response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
    soup = bs4.BeautifulSoup(response.text,features="xml")
    casualty_categories = soup.find_all("IIRPersonalCasualtySummary")
    return_dict = {"activity_id":activity_id}
    for cat in casualty_categories:
        category_name = find_item_in_soup(cat,"CasualtyStatusLookupName")
        category_count = find_item_in_soup(cat,"TotalPeopleAtRisk")
        return_dict[category_name] = category_count
    return return_dict      

activities = pandas.read_csv("data/source/IIR/IIR_search.csv")

waterseg_activities_to_process = limit_to_unprocessed_records(activities,WATERSEG_SAVE_FILE)
waterseg_activities_to_process["activity_id"].apply(save_csv,args=(WATERSEG_SAVE_FILE,get_waterseg))
damages_activities_to_process = limit_to_unprocessed_records(activities,DAMAGES_SAVE_FILE)
#damages_activities_to_process["activity_id"].apply(save_csv,args=(DAMAGES_SAVE_FILE,get_damages))
casualty_activities_to_process = limit_to_unprocessed_records(activities,CASUALTIES_SAVE_FILE)
#casualty_activities_to_process["activity_id"].apply(save_json,args=(CASUALTIES_SAVE_FILE,get_casualties))
brief_activities_to_process = limit_to_unprocessed_records(activities,BRIEF_SAVE_FILE)
#brief_activities_to_process["activity_id"].apply(save_json,args=(BRIEF_SAVE_FILE,get_brief))


Failed to get data for activity 2596680
Failed to get data for activity 2724374
Failed to get data for activity 3941491
Failed to get data for activity 6623967


0       None
1       None
2       None
3       None
4       None
        ... 
3627    None
3628    None
3629    None
3630    None
3631    None
Name: activity_id, Length: 3632, dtype: object

In [41]:
payload = construct_damages_payload(9656)
response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
soup = bs4.BeautifulSoup(response.text,features="xml")
soup

<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><soap:Body><getIIRVesselDamageSummaryResponse xmlns="https://cgmix.uscg.mil/xml/"><getIIRVesselDamageSummaryResult><xs:schema id="NewDataSet" xmlns="" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata" xmlns:xs="http://www.w3.org/2001/XMLSchema"><xs:element msdata:IsDataSet="true" msdata:UseCurrentLocale="true" name="NewDataSet"><xs:complexType><xs:choice maxOccurs="unbounded" minOccurs="0"><xs:element name="IIRVesselDamageSummary"><xs:complexType><xs:sequence><xs:element minOccurs="0" name="CargoDamageInDollars" type="xs:long"/><xs:element minOccurs="0" name="FacilityDamageInDollars" type="xs:long"/><xs:element minOccurs="0" name="VesselDamageInDollars" type="xs:long"/><xs:element minOccurs="0" name="OtherDamageInDollars" type="xs:long"/></xs:sequence></xs:complexType></xs:elemen

In [14]:
def construct_facilities_payload(activity_id):
    return f"""<?xml version="1.0" encoding="utf-8"?>
            <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
            <soap:Body>
                <getIIRInvolvedFacilities xmlns="https://cgmix.uscg.mil/xml/">
                <ActivityId>{activity_id}</ActivityId>
                </getIIRInvolvedFacilities>
            </soap:Body>
            </soap:Envelope>"""

payload = construct_facilities_payload(7865593)
response = requests.request("POST", SOAP_URL, data=payload,headers=SOAP_HEADERS)
soup = bs4.BeautifulSoup(response.text,features="xml")
soup

<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><soap:Body><getIIRInvolvedFacilitiesResponse xmlns="https://cgmix.uscg.mil/xml/"><getIIRInvolvedFacilitiesResult><xs:schema id="NewDataSet" xmlns="" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata" xmlns:xs="http://www.w3.org/2001/XMLSchema"><xs:element msdata:IsDataSet="true" msdata:UseCurrentLocale="true" name="NewDataSet"><xs:complexType><xs:choice maxOccurs="unbounded" minOccurs="0"><xs:element name="IIRInvolvedFacilities"><xs:complexType><xs:sequence><xs:element minOccurs="0" name="Name" type="xs:string"/><xs:element minOccurs="0" name="TypeLookupName" type="xs:string"/></xs:sequence></xs:complexType></xs:element></xs:choice></xs:complexType></xs:element></xs:schema><diffgr:diffgram xmlns:diffgr="urn:schemas-microsoft-com:xml-diffgram-v1" xmlns:msdata="urn:schemas-microsoft