In [None]:
import os
import math
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [None]:
# Input CSV that get_data() loads.
path_merged = "data/scraped/yahoo/sectors/companies_sectors_geo.csv"
# Directory that receives the exported map images.
path_output_root = "data/analysis/yahoo/geo"

# exist_ok=True makes re-running this cell a no-op and removes the gap between
# a separate existence check and the directory creation.
os.makedirs(path_output_root, exist_ok=True)

In [None]:
def get_data(path=None):
    """Load the company table from CSV into a new DataFrame.

    Args:
        path: Optional file path or file-like object forwarded to
            ``pd.read_csv``. When omitted (or None) the notebook-level
            ``path_merged`` is read, so existing ``get_data()`` calls keep
            their behaviour.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default
        parsing options.
    """
    # Resolve the default at call time so a changed path_merged is honoured.
    csv_source = path_merged if path is None else path
    return pd.read_csv(csv_source)


# Load the table and show its first five rows as the cell output.
df = get_data()
df.head(n=5)

In [None]:
# Build the plotting frame: reload the table, keep US companies inside the
# coordinate bounds below, report and fill missing values, then turn the
# employees and market_cap columns into marker-size values.

df = get_data()

# Replace the em-dash industry label with the plain-hyphen spelling.
df["industry"] = df["industry"].replace("Software—Application", "Software - Application")

# Keep only rows with country == "United States", latitude above 25.8 and
# longitude above -125. Rows with a missing value in any of these columns fail
# the comparison and are dropped as well.
# NOTE(review): the 25.8 latitude bound presumably also drops the southern tip
# of Florida (e.g. Miami at about 25.76 N) - confirm this is intended.
in_bounds = (
    (df["country"] == "United States")
    & (df["latitude"] > 25.8)
    & (df["longitude"] > -125)
)
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the loaded table (avoids SettingWithCopyWarning).
df = df[in_bounds].copy()

# Report how many of the remaining companies have no employee count (NaN).
print(
    f"Companies without employees data: {df['employees'].isna().sum()} / {len(df)}"
)

size_max = 14 * 3

# Placeholder values for missing data so those companies still get a visible
# marker: 100 employees and a market cap of 1,000,000.
df["employees"] = df["employees"].fillna(100)
df["market_cap"] = df["market_cap"].fillna(1000000)

employees_max = df["employees"].max()
company_max = df.loc[df["employees"].idxmax(), "symbol"]
print(f"Max employees: {employees_max} by '{company_max}'")
# Logarithmic scaling log(x) / log(max): the largest employer maps to 1.
# NOTE(review): math.log raises ValueError for an employee count of 0 -
# confirm the data contains none.
df["employees"] = df["employees"].apply(
    lambda x: math.log(x) / math.log(employees_max))

# An earlier log-based market-cap scaling was dropped in favour of the
# square-root scaling below.

# Treat market cap as the area of a circle and store a value proportional to
# its radius (square root); the largest company gets size_max * scale_factor.
market_cap_max = df["market_cap"].max()
company_max_market_cap = df.loc[df["market_cap"].idxmax(), "symbol"]
print(f"Max market cap: {market_cap_max} by '{company_max_market_cap}'")
scale_factor = 5
df["market_cap"] = df["market_cap"].apply(
    lambda x: math.sqrt(x) / math.sqrt(market_cap_max) * size_max * scale_factor)


df.head()

In [None]:
# Count companies per sector.
# rename_axis + reset_index(name=...) pins the column names to "sector" and
# "count". A bare reset_index() only produces a "count" column on
# pandas >= 2.0, so the sort below raised KeyError on older versions.
sector_counts = (
    df["sector"]
    .value_counts()
    .rename_axis("sector")
    .reset_index(name="count")
)
# Print the counts, largest sector first.
print(sector_counts.sort_values(by="count", ascending=False))

In [None]:
# Exclude these two sectors from df to improve plot readability.
excluded_sectors = ["Utilities", "Communication Services"]
df = df[~df["sector"].isin(excluded_sectors)]

In [None]:
# Draw every company left in df as one marker on a geo scatter plot:
# colour encodes the sector, marker size follows the scaled market_cap column.
# https://plotly.com/python/scatter-plots-on-maps/

size_max = 70  # upper bound for the marker size passed to plotly express

fig = px.scatter_geo(
    df,
    lat="latitude",
    lon="longitude",
    color="sector",
    size="market_cap",
    size_max=size_max,
    hover_name="name",
    projection="natural earth",
)

# All layout settings in one call: centred title, geo view, 1280x720 canvas,
# tight margins and the legend placed inside the plot at the bottom right.
fig.update_layout(
    title=dict(text="Contiguous United States Companies by Sector", x=0.5, y=0.98),
    geo=dict(
        lataxis_range=[0, 100],
        lonaxis_range=[-100, 100],
        projection_scale=1,
        center=dict(lat=40, lon=5),
        projection_rotation=dict(lat=0, lon=0, roll=0),
        fitbounds="locations",
    ),
    width=1280,
    height=720,
    margin=dict(l=10, r=10, t=10, b=0),
    legend=dict(x=0.83, y=0.1),
)
# Draw country borders.
fig.update_geos(showcountries=True, countrycolor="Black", countrywidth=1)
fig.show()

# Export the figure as PNG into the output directory (scale=2).
path_output = os.path.join(path_output_root, "company_locations_by_sector.png")
fig.write_image(path_output, scale=2)

In [None]:
# Count Technology-sector companies per industry.
# rename_axis + reset_index(name=...) pins the column names to "industry" and
# "count". A bare reset_index() only produces a "count" column on
# pandas >= 2.0, so the sort below raised KeyError on older versions.
tech_counts = (
    df.loc[df["sector"] == "Technology", "industry"]
    .value_counts()
    .rename_axis("industry")
    .reset_index(name="count")
)
# Print the counts, largest industry first.
print(tech_counts.sort_values(by="count", ascending=False))

In [None]:
# Drop the two industries listed below from df.
# "Consumer Electronics" stays in: its removal is disabled.
dropped_industries = ["Electronics & Computer Distribution", "Solar"]
df = df[~df["industry"].isin(dropped_industries)]

In [None]:
# Restrict df to a single sector and map its companies, coloured by industry.
sector = "Technology"
df_tech = df.loc[df["sector"] == sector]

# One marker per company; each industry gets its own colour from the
# qualitative Plotly palette (https://plotly.com/python/discrete-color/).
# size_max is reused from an earlier cell.
fig = px.scatter_geo(
    df_tech,
    lat="latitude",
    lon="longitude",
    color="industry",
    color_discrete_sequence=px.colors.qualitative.Plotly,
    size="market_cap",
    size_max=size_max,
    hover_name="name",
    projection="natural earth",
)

# All layout settings in one call: centred title, geo view settings for the US,
# 1280x720 canvas, tight margins and the legend inside the plot at the bottom right.
fig.update_layout(
    title=dict(
        text=f"Contiguous United States {sector} Sector Companies by Industry",
        x=0.5,
        y=0.975,
    ),
    geo=dict(
        lataxis_range=[20, 50],
        lonaxis_range=[-130, -60],
        projection_scale=1,
        center=dict(lat=40, lon=-100),
        projection_rotation=dict(lat=0, lon=0, roll=0),
        fitbounds="locations",
    ),
    width=1280,
    height=720,
    margin=dict(l=10, r=10, t=10, b=0),
    legend=dict(x=0.83, y=0.1),
)
# Draw country borders.
fig.update_geos(showcountries=True, countrycolor="Black", countrywidth=1)

fig.show()

# Export the figure as PNG into the output directory (scale=2).
path_output = os.path.join(path_output_root, "company_locations_by_industry.png")
fig.write_image(path_output, scale=2)