In [10]:
import pandas as pd
import featuretools as ft
import warnings
warnings.simplefilter('ignore')

In [2]:
data_root = "https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/"

# Read the four source tables from the repository, in a fixed order so the
# results can be unpacked into their individual names afterwards.
csv_names = ('car_train.csv', 'rides_info.csv', 'driver_info.csv', 'fix_info.csv')

loaded_tables = []
for csv_name in csv_names:
    path = data_root + csv_name
    loaded_tables.append(pd.read_csv(path))

car_info, rides_info, driver_info, fix_info = loaded_tables

In [16]:
# Create an empty EntitySet with id "car_data"; the tables and the
# relationships between them are registered in the following cells.
es = ft.EntitySet(id="car_data")

In [17]:
from woodwork.logical_types import Categorical, Double, Datetime, Age

# Register the four tables with the EntitySet. Each call names the column
# that acts as the table's primary key and pins the logical types that
# should not be left to automatic inference.

# Cars: keyed by car_id; the descriptive string columns are categorical.
es = es.add_dataframe(
    dataframe_name="cars",
    dataframe=car_info,
    index="car_id",
    logical_types={"car_type": Categorical, "fuel_type": Categorical, "model": Categorical},
)

# Rides: ride_id is dropped and a generated integer key named "index" is
# used instead. make_index=True states that explicitly rather than relying
# on featuretools' fallback, which only signals the situation with a warning
# (and warnings are silenced at the top of this notebook).
# NOTE: make_index=True raises if a column called "index" already exists.
es = es.add_dataframe(
    dataframe_name="rides",
    dataframe=rides_info.drop(columns=["ride_id"]),
    index="index",
    make_index=True,
    time_index="ride_date",
)

# Drivers: keyed by user_id.
es = es.add_dataframe(
    dataframe_name="drivers",
    dataframe=driver_info,
    index="user_id",
    logical_types={"sex": Categorical, "first_ride_date": Datetime, "age": Age},
)

# Fixes: no natural key is used, so an integer "index" key is generated here
# as well (same reasoning as for rides).
es = es.add_dataframe(
    dataframe_name="fixes",
    dataframe=fix_info,
    index="index",
    make_index=True,
    logical_types={"work_type": Categorical, "worker_id": Categorical},
)

In [18]:
# Show the EntitySet summary: the registered dataframes, before any
# relationships have been added.
es

Entityset: car_data
  DataFrames:
    cars [Rows: 2337, Columns: 10]
    rides [Rows: 739500, Columns: 14]
    drivers [Rows: 15153, Columns: 7]
    fixes [Rows: 146000, Columns: 7]
  Relationships:
    No relationships

In [19]:
# Parent/child links as (parent table, parent key, child table, child key):
# rides reference both a car and a driver, fixes reference a car.
relationship_specs = (
    ("cars", "car_id", "rides", "car_id"),
    ("drivers", "user_id", "rides", "user_id"),
    ("cars", "car_id", "fixes", "car_id"),
)

for parent_name, parent_key, child_name, child_key in relationship_specs:
    es = es.add_relationship(parent_name, parent_key, child_name, child_key)

In [20]:
# Show the EntitySet summary again to confirm the three relationships.
es

Entityset: car_data
  DataFrames:
    cars [Rows: 2337, Columns: 10]
    rides [Rows: 739500, Columns: 14]
    drivers [Rows: 15153, Columns: 7]
    fixes [Rows: 146000, Columns: 7]
  Relationships:
    rides.car_id -> cars.car_id
    rides.user_id -> drivers.user_id
    fixes.car_id -> cars.car_id

In [37]:
# Deep Feature Synthesis settings: build one row per car, stacking
# primitives up to two levels deep and aggregating child rows with the
# listed primitives.
dfs_settings = {
    "entityset": es,
    "target_dataframe_name": "cars",
    "max_depth": 2,
    "agg_primitives": ["mean", "sum", "count", "max"],
}

feature_matrix, feature_defs = ft.dfs(**dfs_settings)

In [38]:
# Preview the first rows of the generated per-car feature matrix.
feature_matrix.head()

Unnamed: 0_level_0,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,COUNT(rides),...,MAX(rides.drivers.user_rides),MAX(rides.drivers.user_time_accident),MEAN(rides.drivers.age),MEAN(rides.drivers.user_rating),MEAN(rides.drivers.user_rides),MEAN(rides.drivers.user_time_accident),SUM(rides.drivers.age),SUM(rides.drivers.user_rating),SUM(rides.drivers.user_rides),SUM(rides.drivers.user_time_accident)
car_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,108.53,another_bug,174,...,2626.0,77.0,33.511494,8.229885,828.034483,17.724138,5831.0,1432.0,144078.0,2056.0
O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,35.2,electro_bug,174,...,2821.0,23.0,34.988506,7.988506,924.804598,6.965517,6088.0,1390.0,160916.0,1212.0
d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,38.62,gear_stick,174,...,2617.0,25.0,32.83908,7.843103,940.04023,9.775862,5714.0,1364.7,163567.0,1701.0
u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,30.34,engine_fuel,174,...,2626.0,86.0,34.977011,8.524138,951.126437,19.991304,6086.0,1483.2,165496.0,2299.0
N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,30.45,engine_fuel,174,...,2481.0,72.0,34.298851,8.112069,896.229885,15.758333,5968.0,1411.5,155944.0,1891.0


In [39]:
# Keep only the cars whose model name contains "tesla", ignoring case.
# Note that this rebinds feature_matrix to the filtered subset.
model_lowercase = feature_matrix['model'].str.lower()
feature_matrix = feature_matrix[model_lowercase.str.contains('tesla')]

In [47]:
# Largest per-car total of fixes.work_duration among the rows currently in
# feature_matrix (the Tesla subset once the filter cell above has run).
feature_matrix['SUM(fixes.work_duration)'].max()

970.0

In [159]:
import geopandas as gpd
from sklearn.datasets import fetch_california_housing
from scipy.spatial.distance import euclidean
import requests, zipfile, io

In [160]:
county_fname = 'https://github.com/a-milenkin/Competitive_Data_Science/raw/main/data/ca-county-boundaries.zip'

# Download the zipped county boundaries. The timeout keeps the cell from
# hanging indefinitely on a stalled connection.
r2 = requests.get(county_fname, timeout=60)
# Fail right here on an HTTP error instead of later with a confusing
# BadZipFile when the response body is an error page rather than an archive.
r2.raise_for_status()

# Unpack the in-memory archive into ./ca; the context manager closes the
# archive handle once extraction is done.
with zipfile.ZipFile(io.BytesIO(r2.content)) as z2:
    z2.extractall("./ca")

In [161]:
# Load the extracted county boundaries and reproject them to EPSG:3857.
ca_counties = gpd.read_file('./ca/CA_Counties')
ca_counties = ca_counties.to_crs("EPSG:3857")
ca_counties.head(3)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,6,91,277310,6091,Sierra,Sierra County,6,H1,G4020,,,,A,2468694587,23299110,39.5769252,-120.5219926,"POLYGON ((-13431319.751 4821511.426, -13431312..."
1,6,67,277298,6067,Sacramento,Sacramento County,6,H1,G4020,472.0,40900.0,,A,2499183617,76073827,38.4500114,-121.3404409,"POLYGON ((-13490651.476 4680831.603, -13490511..."
2,6,83,277306,6083,Santa Barbara,Santa Barbara County,6,H1,G4020,,42200.0,,A,7084000598,2729814515,34.5370572,-120.0399729,"MULTIPOLYGON (((-13423116.772 4042044.149, -13..."


In [184]:
# California housing features as a pandas DataFrame.
housing = fetch_california_housing(as_frame=True).data

# One point per row from the Longitude/Latitude columns, declared as
# EPSG:4326 and then reprojected to EPSG:3857 to match the county polygons.
house_points = gpd.points_from_xy(housing['Longitude'], housing['Latitude'])
df = gpd.GeoDataFrame(housing, geometry=house_points, crs=4326)
df = df.to_crs(epsg=3857)

In [185]:
# Intersect the housing points with the county polygons. Only the county
# NAME and geometry are passed in, so NAME is the only county attribute
# carried into the result.
county_shapes = ca_counties[['NAME', 'geometry']]
gdf = gpd.overlay(df, county_shapes, how='intersection')

In [186]:
# Keep the rows whose county NAME starts with "Los".
name_starts_with_los = gdf['NAME'].str.startswith('Los')
df = gdf[name_starts_with_los]

In [187]:
# Renumber the rows 0..n-1 after filtering. Rebinding the name (rather than
# using inplace=True) avoids mutating a frame that was produced by boolean
# selection and keeps the cell safe to re-run.
df = df.reset_index(drop=True)

In [188]:
# Display the filtered, re-indexed housing frame.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,NAME,geometry
0,6.1946,27.0,5.861486,0.967342,2363.0,2.661036,34.27,-118.27,Los Angeles,POINT (-13165756.176 4065114.305)
1,4.9833,32.0,6.461078,1.239521,486.0,2.910180,34.26,-118.28,Los Angeles,POINT (-13166869.371 4063767.332)
2,4.0080,33.0,5.191176,1.165033,1845.0,3.014706,34.26,-118.29,Los Angeles,POINT (-13167982.566 4063767.332)
3,3.5568,37.0,4.928447,1.104712,1619.0,2.825480,34.26,-118.30,Los Angeles,POINT (-13169095.761 4063767.332)
4,4.8100,42.0,4.979499,0.963554,1145.0,2.608200,34.26,-118.30,Los Angeles,POINT (-13169095.761 4063767.332)
...,...,...,...,...,...,...,...,...,...,...
5819,5.4011,15.0,5.385744,1.702306,785.0,1.645702,33.75,-118.11,Los Angeles,POINT (-13147945.058 3995282.330)
5820,5.0926,24.0,5.121019,1.000000,592.0,1.885350,33.75,-118.11,Los Angeles,POINT (-13147945.058 3995282.330)
5821,3.3155,21.0,4.503788,1.064394,577.0,2.185606,33.81,-118.08,Los Angeles,POINT (-13144605.473 4003318.112)
5822,10.1597,16.0,7.606936,1.121387,450.0,2.601156,34.14,-118.83,Los Angeles,POINT (-13228095.091 4047616.133)


In [189]:
from shapely.geometry import Point
# Reference point given as (longitude, latitude) in EPSG:4326, repeated once
# per row of df and reprojected to EPSG:3857.
la_reference = Point(-118.266667, 34.033333)
la_repeated = [la_reference] * len(df)
LA_point = gpd.GeoSeries(la_repeated, crs=4326).to_crs("EPSG:3857")

In [190]:
# Pull the point geometries out of df as a standalone GeoSeries, tagged
# with the CRS they were reprojected to earlier (EPSG:3857).
gdf1 = gpd.GeoSeries(df['geometry'], crs='EPSG:3857')

In [191]:
# Row-wise distance from each housing point to the reference point. Both
# series carry a 0..n-1 index (df was re-indexed, LA_point was built from a
# plain list), so the element-wise pairing lines up.
# NOTE(review): both series are in EPSG:3857, so the values are Web Mercator
# planar units rather than true ground metres -- confirm this is acceptable
# or switch to a local projected CRS before interpreting the numbers.
dist = gdf1.distance(LA_point)

In [195]:
# Average of the per-row distances to the reference point (in EPSG:3857 units).
dist.mean()

24786.94145088475

Folium getting-started guide: https://python-visualization.github.io/folium/latest/getting_started.html