# Data retrieval from MUL site for Mech Assist application
This code retrieves the whole list of units from http://masterunitlist.info/ and their Alpha Strike parameters.

Note: code was done with chatGPT asistance

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import pandas as pd
from tqdm.auto import tqdm

Define the URL 

In [2]:
# Define the URL for ALL units (mechs, vehicles, infantry, aerospace, buildings, etc)
search_url = "http://www.masterunitlist.info/Unit/Filter?Types=18&Types=19&Types=17&Types=21&Types=20&Types=23&Types=24&Types=81&Types=79&Types=97&Types=76"

Connect and get confirmation of connection

In [3]:
# Send an HTTP GET request to the URL
response = requests.get(search_url)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    print("Access granted!")
else:
    print("Failed to retrieve the webpage.")

Access granted!


Make a list of all mechs

In [4]:
# Find all units
unit_names = [
    a.text for a in soup.find_all('a', href=True) if a['href'].startswith("/Unit/Details/")
]

Make a dataframe to collect units parameters

In [5]:
dataset = pd.DataFrame(columns = ["Name", "Class", "Variant", "Role", "PV", "Type", "Size", "Move",\
                  "Short", "ShortMin", "Medium", "MediumMin", "Long", "LongMin", "Extreme", "ExtremeMin", "Threshold",\
                  "Overheat", "Armor", "Structure", "Specials", "ImageURL"])

Fill `dataset` with units' information:

In [6]:
MECH_TYPES = ["BM", "IM", "PM"]
AEROSPACE_TYPES = ["AF", "CF"]

for i, unit_name in enumerate(unit_names):
    unit_trail = unit_name.replace("+","%2b")
    if (i%50 == 0):
        percent_complete = (i + 1) / len(unit_names) * 100
        print(f"Parsing {i + 1} of {len(unit_names)} units - {percent_complete:.2f}% complete")

    # Define the URL with the query parameters
    url = f"https://masterunitlist.azurewebsites.net/Unit/QuickList?Name={unit_trail}"
    
    # Send an HTTP GET request with the parameters
    response = requests.get(url, stream=True)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:

        # Parse the JSON response
        data = response.json()

        # Identify which unit should be parsed as one request may return several units. For example:
        # https://masterunitlist.azurewebsites.net/Unit/QuickList?Name=Phoenix%20Hawk%20PXH-1k

        for item in data.get("Units"):
            if (item.get("Name")==unit_name):
                index = data.get("Units").index(item)

        # Extract the desired information
        parsed_unit = {
            "Name": data.get("Units")[index].get("Name"),
            "Class": data.get("Units")[index].get("Class"),
            "Variant": data.get("Units")[index].get("Variant"),
            "Role": data.get("Units")[index].get("Role").get("Name"),
            "PV": data.get("Units")[index].get("BFPointValue", 0),
            "Type": data.get("Units")[index].get("BFType", ""),
            "Size": data.get("Units")[index].get("BFSize", 0),
            "Move": data.get("Units")[index].get("BFMove", ""),
            "Short": data.get("Units")[index].get("BFDamageShort", 0),
            "ShortMin": data.get("Units")[index].get("BFDamageShortMin", 0),
            "Medium": data.get("Units")[index].get("BFDamageMedium", 0),
            "MediumMin": data.get("Units")[index].get("BFDamageMediumMin", 0),
            "Long": data.get("Units")[index].get("BFDamageLong", 0),
            "LongMin": data.get("Units")[index].get("BFDamageLongMin", 0),
            "Armor": data.get("Units")[index].get("BFArmor", 0),
            "Structure": data.get("Units")[index].get("BFStructure", 0),
            "Specials": data.get("Units")[index].get("BFAbilities", ""),
            "ImageURL": data.get("Units")[index].get("ImageUrl", "")
        }
        if parsed_unit["Type"] in MECH_TYPES:
            parsed_unit["Overheat"] = data.get("Units")[index].get("BFOverheat", 0)
        else:
            parsed_unit["Overheat"] = 0

        #REGARDING AEROSPACE: note the inconsistent errors on the API side for "Exteme"
        if parsed_unit["Type"] in AEROSPACE_TYPES:
            parsed_unit["Extreme"] = data.get("Units")[index].get("BFDamageExtreme", 0)
            parsed_unit["ExtremeMin"] = data.get("Units")[index].get("BFDamageExtemeMin", 0)
            parsed_unit["Threshold"] = data.get("Units")[index].get("BFThreshold", 0)
        else:
            parsed_unit["Extreme"] = 0
            parsed_unit["ExtremeMin"] = False
            parsed_unit["Threshold"] = 0
        
        # Add unit parameters to the dataframe
        dataset = pd.concat([dataset, pd.DataFrame([parsed_unit])], ignore_index=True)

print(f"Data has been saved to 'datase' DataFrame")

Parsing 1 of 8710 units - 0.01% complete
Parsing 51 of 8710 units - 0.59% complete
Parsing 101 of 8710 units - 1.16% complete
Parsing 151 of 8710 units - 1.73% complete
Parsing 201 of 8710 units - 2.31% complete
Parsing 251 of 8710 units - 2.88% complete
Parsing 301 of 8710 units - 3.46% complete
Parsing 351 of 8710 units - 4.03% complete
Parsing 401 of 8710 units - 4.60% complete
Parsing 451 of 8710 units - 5.18% complete
Parsing 501 of 8710 units - 5.75% complete
Parsing 551 of 8710 units - 6.33% complete
Parsing 601 of 8710 units - 6.90% complete
Parsing 651 of 8710 units - 7.47% complete
Parsing 701 of 8710 units - 8.05% complete
Parsing 751 of 8710 units - 8.62% complete
Parsing 801 of 8710 units - 9.20% complete
Parsing 851 of 8710 units - 9.77% complete
Parsing 901 of 8710 units - 10.34% complete
Parsing 951 of 8710 units - 10.92% complete
Parsing 1001 of 8710 units - 11.49% complete
Parsing 1051 of 8710 units - 12.07% complete
Parsing 1101 of 8710 units - 12.64% complete
Parsin

Dataset overview

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8710 entries, 0 to 8709
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        8710 non-null   object
 1   Class       8710 non-null   object
 2   Variant     8645 non-null   object
 3   Role        8693 non-null   object
 4   PV          8710 non-null   object
 5   Type        7649 non-null   object
 6   Size        8710 non-null   object
 7   Move        8710 non-null   object
 8   Short       8710 non-null   object
 9   ShortMin    8710 non-null   object
 10  Medium      8710 non-null   object
 11  MediumMin   8710 non-null   object
 12  Long        8710 non-null   object
 13  LongMin     8710 non-null   object
 14  Extreme     8710 non-null   object
 15  ExtremeMin  8710 non-null   object
 16  Threshold   8710 non-null   object
 17  Overheat    8710 non-null   object
 18  Armor       8710 non-null   object
 19  Structure   8710 non-null   object
 20  Specials

In [8]:
dataset.nunique()

Name          8709
Class         2132
Variant       4733
Role            15
PV              86
Type            18
Size             6
Move           200
Short           16
ShortMin         2
Medium          12
MediumMin        2
Long            10
LongMin          2
Extreme          6
ExtremeMin       2
Threshold        6
Overheat         5
Armor           24
Structure       16
Specials      3587
ImageURL      2009
dtype: int64

Clear the dataset and make a new one (to have an access to initial data).
All the units with 0 PV can be dropped as they shouldn't be used during a game

In [9]:
unitlist = dataset[dataset['PV'] != 0].reset_index(drop=True)
unitlist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7654 entries, 0 to 7653
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        7654 non-null   object
 1   Class       7654 non-null   object
 2   Variant     7611 non-null   object
 3   Role        7654 non-null   object
 4   PV          7654 non-null   object
 5   Type        7649 non-null   object
 6   Size        7654 non-null   object
 7   Move        7654 non-null   object
 8   Short       7654 non-null   object
 9   ShortMin    7654 non-null   object
 10  Medium      7654 non-null   object
 11  MediumMin   7654 non-null   object
 12  Long        7654 non-null   object
 13  LongMin     7654 non-null   object
 14  Extreme     7654 non-null   object
 15  ExtremeMin  7654 non-null   object
 16  Threshold   7654 non-null   object
 17  Overheat    7654 non-null   object
 18  Armor       7654 non-null   object
 19  Structure   7654 non-null   object
 20  Specials

Check all NA values and replace them if applicable

In [10]:
unitlist[unitlist["Variant"].isna()].head()

Unnamed: 0,Name,Class,Variant,Role,PV,Type,Size,Move,Short,ShortMin,...,Long,LongMin,Extreme,ExtremeMin,Threshold,Overheat,Armor,Structure,Specials,ImageURL
274,Koshi (Standard),Koshi (Standard),,Striker,30,BM,1,"14""/12""j",3,False,...,0,False,0,False,0,0,2,1,"CASE,JMPW1,LTAG,PRB,RCN,SRM2/2",https://i.ibb.co/4p5Q7X3/koshi-3150.png
482,Pack Hunter II,Pack Hunter II,,Striker,34,BM,1,"14""j",3,False,...,2,False,0,False,0,0,3,2,CASE,https://i.ibb.co/9g5VR4J/pack-hunter-ii-3085.png
616,Cougar-XR,Cougar-XR,,Missile Boat,36,BM,1,"10""/14""j",3,False,...,2,False,0,False,0,0,4,2,"CASE,IF1,JMPS1,RFA",https://i.ibb.co/G9JBR33/cougar-xr.png
617,Eyrie,Eyrie,,Striker,35,BM,1,"12""/14""j",4,False,...,0,True,0,False,0,0,4,2,"CASE,JMPS1,MEL",https://i.ibb.co/C1Gs4sr/eyrie-3145.png
655,Jaguar,Jaguar,,Striker,42,BM,1,"16""",4,False,...,1,False,0,False,0,1,4,2,"CASE,TUR(2/1/1)",https://i.ibb.co/0BTn75T/jaguar-3150.png


Model can be None

In [11]:
unitlist[unitlist["Specials"].isna()].head()

Unnamed: 0,Name,Class,Variant,Role,PV,Type,Size,Move,Short,ShortMin,...,Long,LongMin,Extreme,ExtremeMin,Threshold,Overheat,Armor,Structure,Specials,ImageURL
13,Prey Seeker PY-SR30,Prey Seeker,PY-SR30,Scout,17,BM,1,"24""",1,False,...,0,False,0,False,0,0,2,1,,https://i.ibb.co/WkLHSR4/prey-seeker-3150.png
27,Cossack C-SK1,Cossack,C-SK1,Striker,17,BM,1,"12""j",2,False,...,0,False,0,False,0,0,2,1,,https://i.ibb.co/jhmq1pX/cossack-3060.png
46,Fireball ALM-7D,Fireball,ALM-7D,Scout,17,BM,1,"22""",1,False,...,0,False,0,False,0,0,2,1,,https://i.ibb.co/p0KNM8N/fireball-3055u.png
47,Fireball ALM-8D,Fireball,ALM-8D,Scout,20,BM,1,"22""",2,False,...,0,False,0,False,0,0,2,1,,https://i.ibb.co/p0KNM8N/fireball-3055u.png
48,Fireball ALM-9D,Fireball,ALM-9D,Scout,19,BM,1,"22""",1,False,...,0,False,0,False,0,0,2,1,,https://i.ibb.co/p0KNM8N/fireball-3055u.png


Specials can be None

Add information for each unit about its era availability

In [12]:
# Define the list of eras
eras = [
    {"Name": "Star League (2571 - 2780)", "ID": "star-league"},
    {"Name": "Early Succession War (2781 - 2900)", "ID": "early-succession-war"},
    {"Name": "Late Succession War - LosTech (2901 - 3019)", "ID": "late-succession-war---lostech"},
    {"Name": "Late Succession War - Renaissance (3020 - 3049)", "ID": "late-succession-war---renaissance"},
    {"Name": "Clan Invasion (3050 - 3061)", "ID": "clan-invasion"},
    {"Name": "Civil War (3062 - 3067)", "ID": "civil-war"},
    {"Name": "Jihad (3068 - 3085)", "ID": "jihad"},
    {"Name": "Early Republic (3086 - 3100)", "ID": "early-republic"},
    {"Name": "Late Republic (3101 - 3130)", "ID": "late-republic"},
    {"Name": "Dark Ages (3131 - 3150)", "ID": "dark-age"},
    {"Name": "ilClan (3151 - 9999)", "ID": "ilclan"}
]

# Create era availabilty dataset to fill later
era_av = pd.DataFrame(columns=[era["Name"] for era in eras])

Fill `era_av` with units' information

In [13]:
# Define the URL for all battlemechs
print("start")
search_url = "http://www.masterunitlist.info/Unit/Filter?Types=18&Types=19&Types=17&Types=21&Types=20&Types=23&Types=24&Types=81&Types=79&Types=97&Types=76"

# Send an HTTP GET request to the URL
response = requests.get(search_url)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all unit URLs
    unit_urls = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith("/Unit/Details/")]
    
    for i, unit_url in enumerate(unit_urls):
        if i % 50 == 0:
            percent_complete = (i + 1) / len(unit_urls) * 100
            print(f"Parsing {i + 1} of {len(unit_urls)} units - {percent_complete:.2f}% complete")

        
        unit_details_url = f"http://www.masterunitlist.info{unit_url}"
        unit_response = requests.get(unit_details_url)
        unit_soup = BeautifulSoup(unit_response.text, 'html.parser')

        unit_era = {'Name' : f'{unit_soup.find("h2").get_text().strip()}'}


        # Make a dict for each unit
        for era in eras:
            faction_era_element = unit_soup.find(id=era["ID"])
            if faction_era_element != None:
                factions = [a.get_text().strip() for a in faction_era_element.find_all("a")]
                unit_era[f"{era['Name']}"] = ", ".join(factions)
            else:
                unit_era[f"{era['Name']}"] = "Unknown"


        # Add unit eras to the dataframe
        era_av = pd.concat([era_av, pd.DataFrame([unit_era])], ignore_index=True)
        
        
print(f"Data has been saved to 'era_av' dataset")



start
Parsing 1 of 8710 units - 0.01% complete
Parsing 51 of 8710 units - 0.59% complete
Parsing 101 of 8710 units - 1.16% complete
Parsing 151 of 8710 units - 1.73% complete
Parsing 201 of 8710 units - 2.31% complete
Parsing 251 of 8710 units - 2.88% complete
Parsing 301 of 8710 units - 3.46% complete
Parsing 351 of 8710 units - 4.03% complete
Parsing 401 of 8710 units - 4.60% complete
Parsing 451 of 8710 units - 5.18% complete
Parsing 501 of 8710 units - 5.75% complete
Parsing 551 of 8710 units - 6.33% complete
Parsing 601 of 8710 units - 6.90% complete
Parsing 651 of 8710 units - 7.47% complete
Parsing 701 of 8710 units - 8.05% complete
Parsing 751 of 8710 units - 8.62% complete
Parsing 801 of 8710 units - 9.20% complete
Parsing 851 of 8710 units - 9.77% complete
Parsing 901 of 8710 units - 10.34% complete
Parsing 951 of 8710 units - 10.92% complete
Parsing 1001 of 8710 units - 11.49% complete
Parsing 1051 of 8710 units - 12.07% complete
Parsing 1101 of 8710 units - 12.64% complete


In [14]:
era_av.head()

Unnamed: 0,Star League (2571 - 2780),Early Succession War (2781 - 2900),Late Succession War - LosTech (2901 - 3019),Late Succession War - Renaissance (3020 - 3049),Clan Invasion (3050 - 3061),Civil War (3062 - 3067),Jihad (3068 - 3085),Early Republic (3086 - 3100),Late Republic (3101 - 3130),Dark Ages (3131 - 3150),ilClan (3151 - 9999),Name
0,Unknown,Unknown,Unknown,Unknown,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Cameroon
1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Daemon
2,Unknown,Unknown,Unknown,Lyran Commonwealth,Lyran Alliance,Lyran Alliance,Lyran Alliance,Extinct,Extinct,Extinct,Extinct,Hatchetman HCT-3G
3,"Free Worlds League, Unique",Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Hermes II HER-2X
4,Unknown,Unknown,Unknown,Unknown,"Clan Jade Falcon, Unique",Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Phoenix Hawk LAM C


Join both datasets to get final one:

In [15]:
unitlist = unitlist.join(era_av.set_index('Name'), on='Name', how="left")
unitlist.head()

Unnamed: 0,Name,Class,Variant,Role,PV,Type,Size,Move,Short,ShortMin,...,Early Succession War (2781 - 2900),Late Succession War - LosTech (2901 - 3019),Late Succession War - Renaissance (3020 - 3049),Clan Invasion (3050 - 3061),Civil War (3062 - 3067),Jihad (3068 - 3085),Early Republic (3086 - 3100),Late Republic (3101 - 3130),Dark Ages (3131 - 3150),ilClan (3151 - 9999)
0,Celerity CLR-02-X-D,Celerity,CLR-02-X-D,Scout,15,BM,1,"40""",0,False,...,Unknown,Unknown,Unknown,ComStar,ComStar,ComStar,Extinct,Extinct,Extinct,Extinct
1,Celerity CLR-03-O,Celerity,CLR-03-O,Scout,15,BM,1,"40""",0,False,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary
2,Celerity CLR-03-OA,Celerity,CLR-03-OA,Scout,16,BM,1,"40""",0,True,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary
3,Celerity CLR-03-OB,Celerity,CLR-03-OB,Scout,16,BM,1,"40""",0,False,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary
4,Celerity CLR-03-OC,Celerity,CLR-03-OC,Scout,16,BM,1,"40""",0,False,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary


Write dataset into csv file:

In [16]:
path = f"unit_list.csv"
unitlist.to_csv(path, index=False)
print("done")

done
