In [None]:
import os
import json
import pycountry
import pandas as pd
import plotly.express as px

In [None]:
# Input JSON files read by later cells
path_companies = "data/scraped/yahoo/sectors/index_stocks.json"
path_sectors = "data/scraped/yahoo/sectors/all_sectors.json"
# CSV file the merged per-company table is written to
path_merged = "data/scraped/yahoo/sectors/companies_sectors_geo.csv"
# Directory that receives the exported figure
path_output_root = "data/analysis/yahoo/geo"
# exist_ok=True makes the call idempotent and removes the gap between
# "check that the directory is missing" and "create it"
os.makedirs(path_output_root, exist_ok=True)

In [None]:
# Load the company list and index it by ticker symbol; the context manager
# closes the file handle as soon as the JSON has been parsed
with open(path_companies, "r") as companies_file:
	companies = {c["symbol"]: c for c in json.load(companies_file)}
print(f"Loaded '{len(companies)}' companies")
# Keep only companies that have a "profile" entry containing a "geo_google" key
companies = {
	k: v
	for k, v in companies.items()
	if "profile" in v and "geo_google" in v["profile"]
}
print(f"Filtered to '{len(companies)}' companies with valid data")

In [None]:
# Derive a lowercase country string from the last line of each profile address
for company in companies.values():
	company_profile = company["profile"]
	# Everything from the first "<a" onwards is discarded so that HTML which
	# sneaked into the address is not mistaken for the last address line
	address_text = company_profile["address"].split("<a")[0].strip()
	last_line = address_text.split("\n")[-1]
	company_profile["country"] = last_line.strip().lower()

In [None]:
# Show the profile of one example company ("MSFT") and the country stored in its "geo_google" entry
msft_profile = companies["MSFT"]["profile"]
msft_profile_json = json.dumps(msft_profile, indent=4)
print(msft_profile_json)
msft_geo = msft_profile["geo_google"]
msft_country = msft_geo["country"]
print(f"Country name: '{msft_country}'")

In [None]:
# Cache of raw country string -> resolved pycountry name, shared by every lookup
country_fuzzy_memo = {}


def _resolve_country_name(raw_country) -> str:
	'''
		Resolves one raw country string to a pycountry country name

		Successful lookups are stored in `country_fuzzy_memo`.
		Returns "" when the input is not a non-empty string or the lookup fails.
	'''
	# A blank query carries no information, so it is never sent to the fuzzy
	# search; non-string values (e.g. None) are rejected the same way
	if not isinstance(raw_country, str) or not raw_country.strip():
		return ""
	if raw_country in country_fuzzy_memo:
		return country_fuzzy_memo[raw_country]
	try:
		country_name = pycountry.countries.search_fuzzy(raw_country)[0].name
	except Exception:
		# Best effort: any lookup failure means "unknown". Unlike a bare
		# `except:` this still lets KeyboardInterrupt / SystemExit through
		return ""
	country_fuzzy_memo[raw_country] = country_name
	return country_name


def try_get_country(company_profile: dict) -> str:
	'''
		Tries to get country from company profile

		The "country" value of the profile is tried first; if it cannot be
		resolved, the "country" value of the "geo_google" entry is tried.

		Returns the resolved country name, or "" if neither value resolves.
	'''
	country_name = _resolve_country_name(company_profile.get("country"))
	if country_name:
		return country_name
	geo_google = company_profile.get("geo_google")
	country_geo = geo_google.get("country") if isinstance(geo_google, dict) else None
	return _resolve_country_name(country_geo)


# Resolve the example company's country name, then print the 2-letter code,
# 3-letter code and name of the matching pycountry entry
# (direct alternative: country = pycountry.countries.search_fuzzy("United States")[0])
msft_country_name = try_get_country(msft_profile)
msft_country = pycountry.countries.get(name=msft_country_name)
msft_alpha_2, msft_alpha_3 = msft_country.alpha_2, msft_country.alpha_3
print(f"Country 2-letter code: '{msft_alpha_2}'")
print(f"Country 3-letter code: '{msft_alpha_3}'")
print(f"Country name: '{msft_country.name}'")

In [None]:
# Inspect the lookup cache: number of cached entries, then the whole mapping as JSON
print(f"Fuzzy country search memoization has '{len(country_fuzzy_memo)}' countries:")
memo_json = json.dumps(country_fuzzy_memo, indent=4)
print(memo_json)

In [None]:
# Resolve a country name for every company; symbols that cannot be resolved
# are collected separately
country_names = {}
countries_unknown = []
for i, k in enumerate(companies):
	# "\r" keeps the progress counter on a single output line
	print(f"Company {i}/{len(companies)}: {k}", end="\r")
	resolved_name = try_get_country(companies[k]["profile"])
	if not resolved_name:
		countries_unknown.append(k)
		continue
	country_names[k] = resolved_name
print("\n\n")
print(f"Found '{len(country_names)}' country names")
print(f"Found '{len(countries_unknown)}' unknown country names")


In [None]:
# List every unresolved company together with the raw country string parsed from its address
print("Unknown country names:")
for symbol in countries_unknown:
	raw_country = companies[symbol]['profile']['country']
	print(f"{symbol}: '{raw_country}'")

In [None]:
# Tally the companies per resolved country and print the tallies, largest first
country_counts = {}
for resolved_name in country_names.values():
	country_counts[resolved_name] = country_counts.get(resolved_name, 0) + 1
country_counts_sorted = list(country_counts.items())
country_counts_sorted.sort(key=lambda pair: pair[1], reverse=True)
print("Country counts:")
for country, count in country_counts_sorted:
	print(f"{country}: {count}")

In [None]:
# Store each resolved country name on its company as profile["country_valid"];
# only symbols present in both mappings are touched
for symbol in companies.keys() & country_names.keys():
	companies[symbol]["profile"]["country_valid"] = country_names[symbol]

In [None]:
# Load the "sectors" list from the sector tree file; the context manager
# closes the file handle as soon as the JSON has been parsed
with open(path_sectors, "r") as sectors_file:
	sectors = json.load(sectors_file)["sectors"]

In [None]:
# NOTE: sectors tree data has duplicates - use company profile sector and industry instead

# # this is a list of sectors - each sector has a list of industries and each industry has a list of companies and each company has a symbol
# # add "sector" and "industry" to each company
# # clear out "sector" and "industry" from each company
# for k, v in companies.items():
# 	del v["sector"]
# 	del v["industry"]

# company_industry = {}
# total_count = 0 # this includes duplicates

# # TODO: fix by adding "sector" and "industry" to each company from profile data
# # need to rerun the scraper to get this data

# for sector in sectors:
# 	for industry in sector["industries"]:
# 		# print(f"Industry {industry['name']} has {len(industry['companies'])} companies")
# 		total_count += len(industry['companies'])
# 		# if industry["name"] == "Software - Infrastructure":
# 		# 	here = 0
# 		for company in industry["companies"]:
# 			if company["symbol"] not in companies:
# 				continue
# 			# if company["symbol"] == "MSFT":
# 			# 	here = 0
# 			companies[company["symbol"]]["sector"] = sector["name"]
# 			companies[company["symbol"]]["industry"] = industry["name"]

# print(f"Total count: {total_count} (includes duplicates)")

In [None]:
# Clear out "sector" and "industry" from each company
for company in companies.values():
	# pop() with a default removes the key when present and does nothing
	# otherwise, so no bare `except:` is needed to tolerate a missing key
	company.pop("sector", None)
	company.pop("industry", None)

# Copy the whitespace-stripped "sector" and "industry" values from each
# company's profile onto the company record itself (when the profile has them)
for company in companies.values():
	company_profile = company["profile"]
	for field in ("sector", "industry"):
		if field in company_profile:
			company[field] = company_profile[field].strip()

# Collect the symbols that lack a sector, lack an industry, or lack both,
# and report how many fall into each group
companies_no_sector = [
	symbol for symbol, company in companies.items() if "sector" not in company
]
companies_no_industry = [
	symbol for symbol, company in companies.items() if "industry" not in company
]
# Lacking both == lacking a sector AND lacking an industry
companies_no_sector_industry = [
	symbol for symbol in companies_no_sector
	if "industry" not in companies[symbol]
]
print(f"Companies without sector: {len(companies_no_sector)}")
print(f"Companies without industry: {len(companies_no_industry)}")
print(
    f"Companies without sector or industry: {len(companies_no_sector_industry)}"
)


In [None]:
# NOTE: if any sectors / industries are missing, the sectors tree data can be used to fill them in - it is not perfect, but it is better than nothing

In [None]:
# Show the complete record of the example company ("MSFT") as indented JSON
msft = companies["MSFT"]
msft_json = json.dumps(msft, indent=4)
print(msft_json)

In [None]:
#

In [None]:
# Convert market cap to float
def get_market_cap(company: dict) -> float:
	'''
		Converts the "market_cap" string of a company to a number

		The string is a decimal number optionally followed by one magnitude
		letter: "K" (10^3), "M" (10^6), "B" (10^9) or "T" (10^12).
		A string without a letter is taken as the plain number.
		Surrounding whitespace and "," separators are ignored.

		Returns -1 when the value is missing, is not a string, cannot be
		parsed, or ends in an unknown letter (a message is printed for the
		latter, because the magnitude of the number is then unknown).
	'''
	multipliers = {"T": 10 ** 12, "B": 10 ** 9, "M": 10 ** 6, "K": 10 ** 3}
	market_cap_str = company.get("market_cap")
	if not isinstance(market_cap_str, str):
		return -1.0
	market_cap_str = market_cap_str.strip().replace(",", "")
	if not market_cap_str:
		return -1.0
	letter = market_cap_str[-1]
	if letter.isdigit() or letter == ".":
		# No magnitude letter: the whole string is the number, so the last
		# digit must not be cut off
		number_str, multiplier = market_cap_str, 1
	else:
		number_str, multiplier = market_cap_str[:-1], multipliers.get(letter)
	try:
		market_cap = float(number_str)
	except ValueError:
		# Not a number at all (e.g. "N/A")
		return -1.0
	if multiplier is None:
		print(f"Unknown market cap letter '{letter}'")
		return -1.0
	return market_cap * multiplier
	
# Attach the numeric market cap to every company record
for company in companies.values():
	company["market_cap_float"] = get_market_cap(company)

# Report the companies whose market cap is the -1 "could not be parsed" value
companies_without_market_cap = [
	symbol for symbol, company in companies.items()
	if company["market_cap_float"] == -1
]
print(f"Companies without market cap: '{len(companies_without_market_cap)}'")
for symbol in companies_without_market_cap:
	print(symbol)

In [None]:
# Create dataframe with company symbols, names, sectors, industries, market caps,
# employees, countries, country codes, latitudes, longitudes
df_columns = [
	"symbol", "name", "sector", "industry", "market_cap", "employees", "country", "country_code", "latitude", "longitude"
]

# Collect one record per company and build the frame once at the end;
# concatenating inside the loop copies the whole frame on every iteration
df_rows = []
for k, v in companies.items():
	if "profile" not in v or "geo_google" not in v["profile"]:
		continue
	profile = v["profile"]
	geo_google = profile["geo_google"]

	# -1 marks "unknown"; it is converted to NaN below
	employees = -1
	raw_employees = profile.get("employees")
	try:
		if isinstance(raw_employees, str):
			# tolerate thousands separators such as "221,000"
			raw_employees = raw_employees.replace(",", "")
		employees = int(raw_employees)
	except (TypeError, ValueError, OverflowError):
		pass

	# Companies whose country could not be resolved have no "country_valid";
	# they are kept with an empty country / country code instead of raising
	country_valid = profile.get("country_valid")
	country_entry = pycountry.countries.get(name=country_valid) if country_valid else None

	df_rows.append({
		"symbol": k,
		"name": v.get("name"),
		# sector / industry may be absent from the profile data
		"sector": v.get("sector"),
		"industry": v.get("industry"),
		"market_cap": v.get("market_cap_float", -1),
		"employees": employees,
		"country": country_valid,
		"country_code": country_entry.alpha_3 if country_entry is not None else None,
		"latitude": geo_google.get("latitude"),
		"longitude": geo_google.get("longitude")
	})

# Passing the column list keeps the column order (and the columns themselves
# when there are no rows)
df = pd.DataFrame(df_rows, columns=df_columns)

# Set market_cap, employees, latitude, longitude to numeric
for numeric_column in ("market_cap", "employees", "latitude", "longitude"):
	df[numeric_column] = pd.to_numeric(df[numeric_column])

# Change -1 employees to NaN
df["employees"] = df["employees"].replace(-1, float("nan"))
# Change -1 market_cap to NaN
df["market_cap"] = df["market_cap"].replace(-1, float("nan"))

df.head()


In [None]:
# Write the per-company table to the merged CSV file, without the row index
# NOTE: the plotting cell further down rebinds `df` to the per-country counts,
# so this cell must run before that one
df.to_csv(path_or_buf=path_merged, index=False)

In [None]:
# Use px.choropleth to plot country counts
# NOTE: this cell rebinds `df` from the per-company table to a per-country counts table
three_letter_codes = [pycountry.countries.get(name=country_name).alpha_3 for country_name in country_counts]
df = pd.DataFrame({
	"country": list(country_counts.keys()),
	"count": list(country_counts.values()),
	"code": three_letter_codes
})
# remove united states from df; its count is reported in the title instead
df_edit = df[df["country"] != "United States"]
# df_edit = df
# .get avoids a KeyError when no company resolved to the United States
us_company_count = country_counts.get("United States", 0)
fig = px.choropleth(df_edit,
					locations="code",
					color="count",
					hover_name="country",
					color_continuous_scale=px.colors.sequential.Plasma)
# title, figure size and margins in one layout update
fig.update_layout(
	title_text=f"Company counts by country (excluding United States which has {us_company_count} companies)",
	title_x=0.5,
	title_y=0.965,
	width=1280,
	height=720,
	margin=dict(l=10, r=0, t=0, b=0))
fig.show()

# Export the figure as PNG into the analysis output directory
path_output = os.path.join(path_output_root, "company_counts_by_country.png")
fig.write_image(path_output, scale=2)