In [None]:
import pandas as pd
import altair as alt
import polars as pl
from matplotlib.axes import Axes
import seaborn as sns
import geopandas
from shapely import wkt
from shapely.geometry import LineString, MultiLineString
import folium
import geodatasets
from folium.plugins import TimestampedGeoJson



In [None]:
import altair as alt
import polars as pl

In [None]:
def filter_cities_by_country(
        ldf: pl.LazyFrame,
        country:str
        ) -> pl.LazyFrame:
    """Keep only the rows whose ``country`` column equals ``country``."""
    is_requested_country = pl.col('country') == country
    return ldf.filter(is_requested_country)

def join_on_city_id(
        left_ldf: pl.LazyFrame,
        right_ldf: pl.LazyFrame
        ) -> pl.LazyFrame:
    """Left-join ``right_ldf`` onto ``left_ldf``.

    Rows are matched where ``left_ldf.city_id`` equals ``right_ldf.id``.
    """
    joined = left_ldf.join(right_ldf, how='left', left_on='city_id', right_on='id')
    return joined

def drop_empty_country_rows(ldf: pl.LazyFrame) -> pl.LazyFrame:
    """Discard every row whose ``country`` value is null."""
    with_country_only = ldf.drop_nulls(subset='country')
    return with_country_only

def rename_buildstart_to_start(ldf: pl.LazyFrame) -> pl.LazyFrame:
    """Rename the ``buildstart`` column to ``start``."""
    column_map = {'buildstart': 'start'}
    return ldf.rename(column_map)

def rename_name_right_to_city(ldf: pl.LazyFrame) -> pl.LazyFrame:
    """Rename the ``name_right`` column to ``city``."""
    column_map = {'name_right': 'city'}
    return ldf.rename(column_map)

def drop_empty_start(ldf: pl.LazyFrame) -> pl.LazyFrame:
    """Keep only the rows whose ``start`` value is greater than zero."""
    has_positive_start = pl.col('start') > 0
    return ldf.filter(has_positive_start)

def change_start_to_datetime(ldf: pl.LazyFrame) -> pl.LazyFrame:
    """Add ``date_start``: a datetime for 1 January of the year in ``start``."""
    january_first = pl.datetime(pl.col('start'), 1, 1)
    return ldf.with_columns(january_first.alias('date_start'))

In [368]:
class CitiesStations():
    """Station build history for the cities of a single country.

    Stations are joined onto the cities of ``country`` through the
    module-level pipeline helpers. The result is kept as a lazy query in
    ``ldf`` and is only collected when a chart is requested.
    """

    def __init__(
            self,
            country: str,
            cities_path: str = 'data/cities.csv',
            stations_path: str = 'data/stations.csv',
            ):
        """Set up the lazy CSV scans and the joined query.

        Args:
            country: Value matched against the ``country`` column of the
                cities table.
            cities_path: CSV file holding the city rows.
            stations_path: CSV file holding the station rows.
        """
        self.cities: pl.LazyFrame = pl.scan_csv(cities_path)
        self.stations: pl.LazyFrame = pl.scan_csv(stations_path)
        self.country: str = country
        # Joined and cleaned station/city query (still lazy).
        self.ldf: pl.LazyFrame = self._build_ldf()
        # Most recently collected frame; None until something is collected.
        self.df: pl.DataFrame | None = None

    def _build_ldf(self) -> pl.LazyFrame:
        """Compose the lazy pipeline behind every chart.

        The country's cities are joined onto the stations (stations on the
        left), columns are renamed to ``city``/``start``, rows without a
        country or with a non-positive ``start`` are removed, and a
        ``date_start`` column is added.
        """
        return (
            filter_cities_by_country(self.cities, self.country)
            # Stations are the left side so each output row is a station.
            .pipe(lambda ldf: join_on_city_id(self.stations, ldf))
            .pipe(rename_name_right_to_city)
            .pipe(rename_buildstart_to_start)
            .pipe(drop_empty_country_rows)
            .pipe(drop_empty_start)
            .pipe(change_start_to_datetime)
        )

    def _collect_df(self, ldf: pl.LazyFrame, reset: bool = True) -> pl.DataFrame:
        """Collect ``ldf`` and remember the result in ``self.df``.

        Args:
            ldf: Lazy query to collect.
            reset: When True (default) ``ldf`` is always collected again.
                When False a previously stored frame is returned as is,
                even if it was collected from a different query than
                ``ldf``.

        Returns:
            The frame now stored in ``self.df``.
        """
        if reset or self.df is None:
            self.df = ldf.collect()
        return self.df

    def show_graph(self) -> None:
        """Render the query plan of the joined lazy frame."""
        self.ldf.show_graph()

    def _start_count(self) -> pl.LazyFrame:
        """Count rows per (city, start) and add a running total per city.

        Returns:
            Lazy frame sorted by city and start, with the group size in
            ``len`` and its per-city cumulative sum in ``cumulative``.
        """
        per_city_year = (
            self.ldf
            .group_by(['city', 'start'])
            .agg(pl.len())
            # The running total below relies on chronological order.
            .sort(['city', 'start'])
        )
        return per_city_year.with_columns(
            pl.col('len').cum_sum().over('city').alias('cumulative')
        )

    def line_chart(self, file: str | None = None) -> alt.Chart:
        """Line chart of the cumulative count over ``start``, one line per city.

        Args:
            file: Optional path; when given the chart is also saved there.

        Returns:
            The Altair chart.
        """
        ldf = self._start_count()
        df = self._collect_df(ldf)
        chart: alt.Chart = df.plot.line(
            x='start',
            y='cumulative',
            color='city'
            )
        if file:
            chart.save(file) # type: ignore
        return chart

    def histogram(
            self,
            stacked: bool = False,
            file: str | None = None,
            title: str | None = None,
            ) -> alt.Chart:
        """Histogram of ``start`` years (at most 20 bins).

        Args:
            stacked: When True the bars are coloured by city.
            file: Optional path; when given the chart is also saved there.
            title: Chart title. Defaults to the US-specific title, so pass
                one explicitly when ``country`` is not the United States.

        Returns:
            The Altair chart.
        """
        if title is None:
            title = 'Urban Rail Built in the US 1832-Present'

        df = self._collect_df(self.ldf)
        chart = alt.Chart(df).mark_bar( # type: ignore
        ).encode(
            x=alt.X(
                'start:Q',
                bin={'maxbins': 20},
                axis=alt.Axis(format='d', title='Year')
            ),
            y=alt.Y('count()', stack='zero',
                    axis=alt.Axis(title='Number of Stations Built')
            )
        ).properties(
            title=title
        )

        if stacked:
            chart = chart.encode(color=alt.Color('city:N'))

        if file:
            chart.save(file) # type: ignore
        return chart



In [366]:
# Build the lazy pipeline for the United States, then plot the per-city
# cumulative count against start year.
cs_usa = CitiesStations("United States")
cs_usa.line_chart()

In [367]:
# Histogram of start years without per-city colouring (stacked defaults to False).
cs_usa.histogram()

In [365]:
# Same histogram with the bars coloured by city.
cs_usa.histogram(stacked=True)

In [None]:
# Load the city table with pandas: the geospatial cells below work on
# pandas/geopandas frames, not on the polars LazyFrames used above.
cities = pd.read_csv('data/cities.csv')
# Parse the WKT text in `coords` into geometries and use them as the
# active geometry column of a GeoDataFrame.
cities['Coordinates'] = geopandas.GeoSeries.from_wkt(cities['coords'])
gdf = geopandas.GeoDataFrame(cities, geometry='Coordinates')

In [None]:
# Label the city geometries as EPSG:4326 and start a fresh base map.
gdf = gdf.set_crs("EPSG:4326")
m = folium.Map(location=[40.70, -73.94], zoom_start=10, tiles="CartoDB positron")

# One default marker per city; folium takes [lat, lon], i.e. [y, x].
for city_point in gdf["Coordinates"]:
    marker = folium.Marker(location=[city_point.y, city_point.x])
    marker.add_to(m)

In [None]:
# Write the city-marker map to an HTML file.
# NOTE(review): a later cell saves a different map to the same path, so a
# full run overwrites this file.
m.save('Images/map.html')

In [None]:
# Load the lines table; the remaining CSV tables are loaded in a later cell.
lines = pd.read_csv('data/lines.csv')

In [None]:
# Rename `name` up front so it does not clash with the cities' `name`
# column when the two tables are merged later.
lines = lines.rename(columns={'name': 'line_name'})

In [None]:
# Paths are listed in the same order as the names they are unpacked into.
lst = ['data/station_lines.csv', 'data/stations.csv', 'data/systems.csv', 'data/track_lines.csv', 'data/tracks.csv']
station_lines, stations, systems, track_lines, tracks = (
    pd.read_csv(csv_path) for csv_path in lst
)

In [None]:
# Preview only the first rows instead of rendering the whole table.
tracks.head()

In [None]:
# `drop` already returns a new frame, so `lines` itself stays untouched.
lines_drop = lines.drop(columns=['url_name', 'color', 'transport_mode_id'])

In [None]:
# Preview only the first rows instead of rendering the whole table.
lines_drop.head()

In [None]:
# `drop` returns a new frame, so `cities` itself stays untouched.
cities_drop = cities.drop(columns='url_name')
# Restrict to the rows whose country is exactly "United States".
cities_usa = cities_drop.loc[cities_drop['country'] == "United States"]

In [None]:
# Left merge keeps every line; lines whose city is not in `cities_usa`
# get NaN in the city columns.
cities_lines = pd.merge(
    lines_drop,
    cities_usa,
    how='left',
    left_on='city_id',
    right_on='id',
)

In [None]:
# Preview only the first rows of the merged table.
cities_lines.head()

In [None]:
# After the left merge, lines without a matching US city have no `name`;
# once it is renamed to `city`, those rows are dropped.
cities_lines = (
    cities_lines
    .rename(columns={'name':'city', 'city_id': 'id'})
    .dropna(subset='city')
)

In [None]:
# Preview only the first rows after renaming and filtering.
cities_lines.head()

In [None]:
# Preview only the first rows before the id columns are dropped.
cities_lines.head()

In [None]:
# Remove the merge-suffixed id columns and `system_id`; the renamed `id`
# column stays as the key for the next merge.
# NOTE(review): `drop` raises KeyError for missing labels by default, so
# running this cell a second time will fail.
cities_lines = cities_lines.drop(columns=['id_x', 'id_y', 'system_id'])

In [None]:
# Attach the city/line columns to every station through the city key.
# NOTE(review): `cities_lines` has one row per line, so a station is
# repeated once for every line in its city -- confirm this is intended.
cities_lines_stations = stations.merge(cities_lines, left_on='city_id', right_on='id', how='left')

In [None]:
# Stations that found no row in `cities_lines` have a NaN `city` after the
# left merge; drop them.
cities_lines_stations = cities_lines_stations.dropna(subset='city')

In [None]:
# Inspect the first rows in city order instead of rendering the whole table.
cities_lines_stations.sort_values(by='city_id').head()

In [None]:
# Start from the merged city/line/station table and give the city geometry
# an explicit name so it is not confused with the station geometry that is
# added in the next cell.
cls = cities_lines_stations.rename(columns={'Coordinates': 'City Coordinates'})

In [None]:
# Parse the WKT text in `geometry` into a second geometry column.
cls['Station Coordinates'] = geopandas.GeoSeries.from_wkt(cls['geometry'])
# NOTE(review): this rebinds `gdf`, which earlier held the cities-only
# frame; from here on its active geometry is the station location.
gdf = geopandas.GeoDataFrame(cls, geometry='Station Coordinates')

In [None]:
# Parse the WKT text in one vectorised call, the same way the city and
# station coordinates are parsed elsewhere in this notebook.
tracks['geometry'] = geopandas.GeoSeries.from_wkt(tracks['geometry'])

In [None]:
#tracks['tracks_coords'] = geopandas.GeoSeries.from_wkt(cls['geometry'])

In [None]:
#tracks = tracks.drop(columns='tracks_coords')


In [None]:
# Wrap the tracks table as a GeoDataFrame; no geometry column is named
# here, so this relies on the parsed `geometry` column being picked up.
tracks_gdf = geopandas.GeoDataFrame(tracks)

In [None]:
# Label the track geometries as EPSG:4326.
# NOTE(review): assumes the CSV coordinates are lon/lat degrees -- confirm.
tracks_gdf= tracks_gdf.set_crs("EPSG:4326")

In [None]:
# Preview only the first rows of the tracks GeoDataFrame.
tracks_gdf.head()

In [None]:
def build_map(gdf, column, m):
    """Add one default folium marker to ``m`` for every point in ``gdf[column]``.

    Args:
        gdf: GeoDataFrame holding the points. It is labelled EPSG:4326 via
            ``set_crs``.
        column: Name of the column that contains the point geometries.
        m: folium map that receives the markers; it is modified in place.

    Returns:
        The same map object ``m``.
    """
    gdf = gdf.set_crs("EPSG:4326")

    for point in gdf[column]:
        # A missing or empty geometry has no coordinates to place.
        if point is None or point.is_empty:
            continue
        # folium takes [lat, lon], which is [y, x] of the point.
        folium.Marker(
            location=[point.y, point.x],
        ).add_to(m)
    return m
   


def add_lines(gdf, column, m):
    """Draw every line geometry in ``gdf[column]`` on ``m`` as a purple polyline.

    Args:
        gdf: GeoDataFrame holding the geometries; it is converted to
            EPSG:4326 with ``to_crs`` first.
        column: Name of the column that contains the geometries.
        m: folium map that receives the polylines; it is modified in place.

    Returns:
        The same map object ``m``.

    Missing or empty geometries are skipped silently. Geometries that are
    neither ``LineString`` nor ``MultiLineString`` are skipped with a
    printed notice.
    """
    gdf = gdf.to_crs('EPSG:4326')

    for geom in gdf[column]:
        if geom is None or geom.is_empty:
            continue

        if isinstance(geom, LineString):
            parts = [geom]
        elif isinstance(geom, MultiLineString):
            # Multi-part geometries are not iterable in Shapely 2.x; the
            # component lines are exposed through `.geoms`.
            parts = geom.geoms
        else:
            print(f"Skipping unsupported geometry: {type(geom)}")
            continue

        for part in parts:
            # Coordinates are (lon, lat[, z]); folium takes (lat, lon).
            coords = [(lat, lon) for lon, lat, *_ in part.coords]
            if not coords:
                continue
            folium.PolyLine(locations=coords, color='purple').add_to(m)

    return m

def time_map(df,start,stop, column, m, kind):
    """Draw the rows of ``df`` whose ``buildstart`` year is in ``(start, stop]``.

    Args:
        df: Frame with a ``buildstart`` column, either numeric years or
            datetimes (compared by calendar year).
        start: Exclusive lower bound, as a year.
        stop: Inclusive upper bound, as a year.
        column: Geometry column handed to the drawing function.
        m: folium map to draw on.
        kind: ``'point'`` draws markers via ``build_map``; ``'line'`` draws
            polylines via ``add_lines``.

    Returns:
        The map returned by the drawing function.

    Raises:
        ValueError: If ``kind`` is neither ``'point'`` nor ``'line'``.
    """
    if kind not in ('point', 'line'):
        raise ValueError(f"kind must be 'point' or 'line', got {kind!r}")

    years = df['buildstart']
    # Other cells convert `buildstart` to datetime; comparing that column
    # with integer years would raise, so reduce it to the year first.
    if pd.api.types.is_datetime64_any_dtype(years):
        years = years.dt.year
    in_window = df[(years > start) & (years <= stop)]

    if kind == 'point':
        return build_map(in_window, column, m)
    return add_lines(in_window, column, m)
# Draw the network in 20-year windows. The same map object is reused, so
# each saved file also contains everything drawn for the earlier windows.
m = folium.Map(location=[40.70, -73.94], zoom_start=10, tiles="CartoDB positron")
start = 1830
stop = 2030
for period_start in range(start, stop + 1, 20):
    period_end = period_start + 20
    m = time_map(gdf, period_start, period_end, 'City Coordinates', m, 'point')
    m = time_map(gdf, period_start, period_end, 'Station Coordinates', m, 'point')
    m = time_map(tracks_gdf, period_start, period_end, 'geometry', m, 'line')
    # One file per window, named after the window's first year.
    m.save(f'Images/{period_start}.html')


In [None]:
# Turn the build-start year into a datetime column, in place on `gdf`.
# NOTE(review): `to_datetime` raises by default on values that do not match
# '%Y' -- confirm that 0 and missing years have been removed beforehand.
gdf['buildstart'] = pd.to_datetime(gdf['buildstart'], format='%Y')

In [None]:
# Preview only the first rows of the tracks GeoDataFrame.
tracks_gdf.head()

In [None]:
# Turn the build-start year into a datetime column, in place on `tracks_gdf`.
# NOTE(review): `to_datetime` raises by default on values that do not match
# '%Y' -- confirm that 0 and missing years have been removed beforehand.
tracks_gdf['buildstart'] = pd.to_datetime(tracks_gdf['buildstart'], format='%Y')

In [None]:
def build_features(gdf, column, m):
    """Placeholder: this function was declared without a body.

    A ``def`` with no body is a syntax error that stops the whole cell from
    running, so the stub raises explicitly until it is implemented.

    TODO: implement, or delete the stub if it is no longer needed.
    """
    raise NotImplementedError("build_features is not implemented yet")
def build_map(gdf, column, m):
    """Add one default folium marker to ``m`` for every point in ``gdf[column]``.

    Args:
        gdf: GeoDataFrame holding the points. It is labelled EPSG:4326 via
            ``set_crs``.
        column: Name of the column that contains the point geometries.
        m: folium map that receives the markers; it is modified in place.

    Returns:
        The same map object ``m``.
    """
    gdf = gdf.set_crs("EPSG:4326")

    for point in gdf[column]:
        # A missing or empty geometry has no coordinates to place.
        if point is None or point.is_empty:
            continue
        # folium takes [lat, lon], which is [y, x] of the point.
        folium.Marker(
            location=[point.y, point.x],
        ).add_to(m)
    return m
   


def add_lines(gdf, column, m):
    """Draw every line geometry in ``gdf[column]`` on ``m`` as a purple polyline.

    Args:
        gdf: GeoDataFrame holding the geometries; it is converted to
            EPSG:4326 with ``to_crs`` first.
        column: Name of the column that contains the geometries.
        m: folium map that receives the polylines; it is modified in place.

    Returns:
        The same map object ``m``.

    Missing or empty geometries are skipped silently. Geometries that are
    neither ``LineString`` nor ``MultiLineString`` are skipped with a
    printed notice.
    """
    gdf = gdf.to_crs('EPSG:4326')

    for geom in gdf[column]:
        if geom is None or geom.is_empty:
            continue

        if isinstance(geom, LineString):
            parts = [geom]
        elif isinstance(geom, MultiLineString):
            # Multi-part geometries are not iterable in Shapely 2.x; the
            # component lines are exposed through `.geoms`.
            parts = geom.geoms
        else:
            print(f"Skipping unsupported geometry: {type(geom)}")
            continue

        for part in parts:
            # Coordinates are (lon, lat[, z]); folium takes (lat, lon).
            coords = [(lat, lon) for lon, lat, *_ in part.coords]
            if not coords:
                continue
            folium.PolyLine(locations=coords, color='purple').add_to(m)

    return m

def time_map(df,start,stop, column, m, kind):
    """Draw the rows of ``df`` whose ``buildstart`` year is in ``(start, stop]``.

    Args:
        df: Frame with a ``buildstart`` column, either numeric years or
            datetimes (compared by calendar year).
        start: Exclusive lower bound, as a year.
        stop: Inclusive upper bound, as a year.
        column: Geometry column handed to the drawing function.
        m: folium map to draw on.
        kind: ``'point'`` draws markers via ``build_map``; ``'line'`` draws
            polylines via ``add_lines``.

    Returns:
        The map returned by the drawing function.

    Raises:
        ValueError: If ``kind`` is neither ``'point'`` nor ``'line'``.
    """
    if kind not in ('point', 'line'):
        raise ValueError(f"kind must be 'point' or 'line', got {kind!r}")

    years = df['buildstart']
    # The preceding cells convert `buildstart` to datetime; comparing that
    # column with integer years would raise, so reduce it to the year first.
    if pd.api.types.is_datetime64_any_dtype(years):
        years = years.dt.year
    in_window = df[(years > start) & (years <= stop)]

    if kind == 'point':
        return build_map(in_window, column, m)
    return add_lines(in_window, column, m)
# Draw the network in 20-year windows. The same map object is reused, so
# each saved file also contains everything drawn for the earlier windows.
m = folium.Map(location=[40.70, -73.94], zoom_start=10, tiles="CartoDB positron")
start = 1830
stop = 2030
for period_start in range(start, stop + 1, 20):
    period_end = period_start + 20
    m = time_map(gdf, period_start, period_end, 'City Coordinates', m, 'point')
    m = time_map(gdf, period_start, period_end, 'Station Coordinates', m, 'point')
    m = time_map(tracks_gdf, period_start, period_end, 'geometry', m, 'line')
    # One file per window, named after the window's first year.
    m.save(f'Images/{period_start}.html')

In [None]:
# Single map with every city, every station and every track, with no time filter.
m = folium.Map(location=[40.70, -73.94], zoom_start=10, tiles="CartoDB positron")
for point_column in ('City Coordinates', 'Station Coordinates'):
    m = build_map(gdf, point_column, m)
m = add_lines(tracks_gdf, 'geometry', m)

In [None]:
# Write the combined map to an HTML file.
# NOTE(review): an earlier cell saves the city-only map to this same path,
# so that file is replaced here.
m.save('Images/map.html')

In [None]:
# Rename the cities' key to `city_id`, the name `lines_drop` is joined on
# in the earlier merge.
cities_id = cities_usa.rename(columns={'id':'city_id'})

In [None]:
# Join each line to its US city on the shared key. The key is named
# explicitly so other same-named columns are never used as join keys.
cities_line = lines_drop.merge(cities_id, on='city_id')

In [None]:
# Preview only the first rows of the line/city table.
cities_line.head()

In [None]:
# Preview only the first rows of the raw stations table.
stations.head()

In [None]:
stations_drop = stations.copy()
#stations_usa = stations_drop[stations_drop['country'] == "United States"]
#stations_drop['cities'] = stations_drop[[stations_drop['city_id']== 147]] = 'Chicago'
stations_drop = stations_drop.dropna(subset='buildstart')
# Blank out build years recorded as 0 (presumably a placeholder for
# "unknown" -- TODO confirm). `mask` puts NaN wherever the condition holds
# and leaves every other column untouched.
stations_drop = stations_drop.assign(
    buildstart=stations_drop['buildstart'].mask(stations_drop['buildstart'] == 0)
)

In [None]:
# Preview only the first rows of the cleaned stations table.
stations_drop.head()