In [2]:
import pandas as pd
import geopandas as gpd
import requests
import re
import time

In [None]:
# import the ams-merged shp
gdf = gpd.read_file('ams_roads.shp')
gdf.head()

#### Fix the column road to make it better to genderise the road names

In [3]:
# Define a regex pattern that ensures the street type is not already separated
STREET_TYPES_PATTERN = re.compile(r'(?<!\s)(straat|brug|singel|burg|hof|plein|gracht|laan|weg|boulevard|steeg|gouw|post|pad|park|baan|plantsoen|toren|berg|veld|dreef|dorp|dijk|kade)\b', re.IGNORECASE)

def add_space(road_name):
    if isinstance(road_name, str):
        return STREET_TYPES_PATTERN.sub(r' \1', road_name)
    return road_name

In [4]:
gdf['road_name'] =gdf['road_name'].apply(add_space)

In [None]:
gdf.to_file('ams_roads.shp')

#### Extract the road names to csv

In [6]:
column_name = 'road_name'
df=gdf[[column_name]]
output_csv_path = "road_names1.csv"
df.to_csv(output_csv_path, index=False)

print(f"Column '{column_name}' saved to {output_csv_path}")

Column 'road_name' saved to road_names1.csv


In [6]:
df= pd.read_csv('road_names1.csv')
df

Unnamed: 0,road_name
0,'S-Gravelandse Veer
1,A. Moen straat
2,Akkerwinde weg
3,Akoleien straat
4,Alexander straat
...,...
5464,Zwarte gouw
5465,Zwartehand steeg
5466,Zwartlaken steeg
5467,Zwenkgras straat


#### Run api.genderise

In [None]:
url_template = "https://api.genderapi.io/api/?name={name}&country=NL&key=<your_API>"

cache = {}

def get_gender(name):
    if name in cache:
        return cache[name]
    
    url = url_template.format(name=name)
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an error for bad responses (4xx, 5xx)
        data = response.json()
        
        gender = data.get('gender', 'unknown')
        cache[name] = gender
        time.sleep(0.2)  # Prevents hitting API rate limits
        return gender
    
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return "unknown"

# Assuming df['road_name'] contains full names, extracting first names
df['gender'] = df['road_name'].apply(lambda x: get_gender(x.split()[0]) if isinstance(x, str) else "unknown")


In [None]:
df[df['gender'].isna()]

In [52]:
df.to_csv('df.csv')

In [None]:
# Lets look at the null values
data= pd.read_csv('df.csv')
data[data['gender'].isna()]

Unnamed: 0.1,Unnamed: 0,road_name,gender
0,0,'S-Gravelandse Veer,
1,1,A. Moen straat,
2,2,Akkerwinde weg,
3,3,Akoleien straat,
9,9,Anielewicz singel,
...,...,...,...
5462,5462,Zwanenburgwal,
5465,5465,Zwartehand steeg,
5466,5466,Zwartlaken steeg,
5467,5467,Zwenkgras straat,


In [None]:
# This from the website directly
roadapi = pd.read_csv('road_GenderAPI.csv')
roadapi[roadapi['gender'].isna()]

Unnamed: 0,road_name,gender,country,probability,found_name
3,Akoleien straat,,,,
10,Ankerplaats,,,,
18,Avenhorn straat,,,,
19,Avercamp straat,,,,
20,Avogadro straat,,,,
...,...,...,...,...,...
5460,Zwaluw straat,,,,
5462,Zwanenburgwal,,,,
5465,Zwartehand steeg,,,,
5466,Zwartlaken steeg,,,,


In [8]:
# Merge on road names (assuming 'road_name' is the common column)
merged_df= roadapi.merge(data[['road_name','gender']], how= 'left',on='road_name',suffixes=('','_from_other'))


In [9]:
# Fill missing gender values in df1 with those from df2
merged_df['gender'] = merged_df['gender'].fillna(merged_df['gender_from_other'])


In [10]:
merged_df[merged_df['gender'].isna()]

Unnamed: 0,road_name,gender,country,probability,found_name,gender_from_other
3,Akoleien straat,,,,,
10,Ankerplaats,,,,,
18,Avenhorn straat,,,,,
19,Avercamp straat,,,,,
20,Avogadro straat,,,,,
...,...,...,...,...,...,...
5460,Zwaluw straat,,,,,
5462,Zwanenburgwal,,,,,
5465,Zwartehand steeg,,,,,
5466,Zwartlaken steeg,,,,,


In [11]:
# Drop the extra column from data
merged_df.drop(columns=['gender_from_other'], inplace=True)

In [12]:
# Check for null values
merged_df[merged_df['gender'].isnull()]

Unnamed: 0,road_name,gender,country,probability,found_name
3,Akoleien straat,,,,
10,Ankerplaats,,,,
18,Avenhorn straat,,,,
19,Avercamp straat,,,,
20,Avogadro straat,,,,
...,...,...,...,...,...
5460,Zwaluw straat,,,,
5462,Zwanenburgwal,,,,
5465,Zwartehand steeg,,,,
5466,Zwartlaken steeg,,,,


In [13]:
# Drop unnecessary columns
merged_df.drop(columns=['country','probability','found_name'], inplace=True)


In [14]:
# Fill in the missing values with unknown
merged_df=merged_df.fillna(value='unknown')
merged_df

Unnamed: 0,road_name,gender
0,'S-Gravelandse Veer,male
1,A. Moen straat,male
2,Akkerwinde weg,male
3,Akoleien straat,unknown
4,Alexander straat,male
...,...,...
5464,Zwarte gouw,female
5465,Zwartehand steeg,unknown
5466,Zwartlaken steeg,unknown
5467,Zwenkgras straat,unknown


## Categorize the roads name after objects

#### We will look at the 'unknown' attribute of gender and try classify it according to plant, object, place or other

In [None]:
# open the cleaned data
df= pd.read_csv('/8%_project/gender_data/other.csv')
df

Unnamed: 0.1,Unnamed: 0,road_name,gender
0,0,Akoleien straat,Unknown
1,1,Ankerplaats,Unknown
2,2,Avenhorn straat,Unknown
3,3,Avercamp straat,Unknown
4,4,Avogadro straat,Unknown
...,...,...,...
1591,1591,Zwaluw straat,Unknown
1592,1592,Zwanenburgwal,Unknown
1593,1593,Zwartehand steeg,Unknown
1594,1594,Zwartlaken steeg,Unknown


In [1]:
def classify_street(name):
    plant_keywords = ["roos", "tulp", "anjer", "goudsbloem", "lel", "linde", "eik", "kastanje", "wilg", "populier", "ceder", "boom", "bloem", "plant", "gras", "bos", "heide", "palm"]
    object_keywords = ["gracht", "sloot", "spui", "dam", "kade", "sluis", "brug", "toren", "molen", "huis", "haven", "kerk", "laan", "klok", "poort", "veld", "schans", "dijk"]
    place_keywords = ["park", "dorp", "polder", "bergen", "zee", "oever", "eiland", "muider", "oost", "west", "zuid", "noord"]

    if any(keyword in name for keyword in plant_keywords):
        return "Plant"
    elif any(keyword in name for keyword in object_keywords):
        return "Object"
    elif any(keyword in name for keyword in place_keywords):
        return "Place"
    else:
        return "other"

In [None]:
# Create new column with the new information
df['newclass']=df['road_name'].apply(classify_street)

In [None]:
df= df.rename(columns={'newclass':'classification'})

In [None]:
#save the cleaned data
df.to_csv('/8%_project/gender_data/other.csv')

In [None]:
df2=pd.read_csv('/8%_project/gender_data/updated_data.csv')
df2


Unnamed: 0.1,Unnamed: 0,road_name,gender
0,0,'S-Gravelandse Veer,male
1,1,A. Moen straat,male
2,2,Akkerwinde weg,male
3,3,Akoleien straat,unknown
4,4,Alexander straat,male
...,...,...,...
5464,5464,Zwarte gouw,female
5465,5465,Zwartehand steeg,unknown
5466,5466,Zwartlaken steeg,unknown
5467,5467,Zwenkgras straat,unknown


In [None]:
# merge the datasets 
df_merge= pd.merge(df2,df, how= 'left',on='road_name')


In [None]:
# Replace null values with human
df_merged=df_merged.fillna(value='human')

In [None]:
df_merged=df_merged.drop(columns=['Unnamed: 0'])

In [None]:
df_merged.to_csv('classified.csv')

### Classify the merged geom amsterdam roads shp

In [None]:
merged_geom=gpd.read_file('/8%_project/gender_data/updated_ams_roads.shp')
merged_geom

Unnamed: 0,osm_id,road_name,gender,geometry
0,7371300.0,'S-Gravelandse Veer,male,"LINESTRING (4.8971 52.36776, 4.89697 52.36774,..."
1,243520855.0,A. Moen straat,male,"LINESTRING (4.93199 52.40749, 4.93309 52.40695..."
2,7375671.0,Akkerwinde weg,male,"MULTILINESTRING ((4.91259 52.39835, 4.91252 52..."
3,312679216.0,Akoleien straat,unknown,"LINESTRING (4.87804 52.3734, 4.87841 52.37285,..."
4,7046531.0,Alexander straat,male,"LINESTRING (4.9202 52.36419, 4.92022 52.36417,..."
...,...,...,...,...
5441,5039068.0,Zwarte gouw,female,"MULTILINESTRING ((4.96713 52.3967, 4.96704 52...."
5442,54732328.0,Zwartehand steeg,unknown,"MULTILINESTRING ((4.89162 52.37445, 4.89166 52..."
5443,124933460.0,Zwartlaken steeg,unknown,"LINESTRING (4.8974 52.37334, 4.89691 52.37348)"
5444,620159214.0,Zwenkgras straat,unknown,"LINESTRING (5.00088 52.34707, 5.00075 52.34694..."


In [None]:
df = pd.read_csv('/8%_project/gender_data/classified.csv')

In [8]:
# Merge the shp with the classifed roads
classified_merged_geom= merged_geom.merge(df[['road_name','classification']], how= 'inner',on='road_name')
classified_merged_geom

Unnamed: 0,osm_id,road_name,gender,geometry,classification
0,7371300.0,'S-Gravelandse Veer,male,"LINESTRING (4.8971 52.36776, 4.89697 52.36774,...",human
1,243520855.0,A. Moen straat,male,"LINESTRING (4.93199 52.40749, 4.93309 52.40695...",human
2,7375671.0,Akkerwinde weg,male,"MULTILINESTRING ((4.91259 52.39835, 4.91252 52...",human
3,312679216.0,Akoleien straat,unknown,"LINESTRING (4.87804 52.3734, 4.87841 52.37285,...",other
4,7046531.0,Alexander straat,male,"LINESTRING (4.9202 52.36419, 4.92022 52.36417,...",human
...,...,...,...,...,...
5439,5039068.0,Zwarte gouw,female,"MULTILINESTRING ((4.96713 52.3967, 4.96704 52....",other
5440,54732328.0,Zwartehand steeg,unknown,"MULTILINESTRING ((4.89162 52.37445, 4.89166 52...",other
5441,124933460.0,Zwartlaken steeg,unknown,"LINESTRING (4.8974 52.37334, 4.89691 52.37348)",other
5442,620159214.0,Zwenkgras straat,unknown,"LINESTRING (5.00088 52.34707, 5.00075 52.34694...",Plant


In [None]:
# Save the classified shp
classified_merged_geom.to_file('/8%_project/gender_data/classified.shp')