# Proto-persona reweighting process

This notebook is shorter version of a clustering approach for persona development as described here: https://github.com/TjarkGall/proto-persona-clustering

## Notebook set up

In [2]:
# Load packages (if you get errors, you can install them, for example, by using typing:
# conda install [package name] 
# pip install [package name] 
# in the terminal. 

import os
import random
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas.tools import sjoin
from shapely import wkt


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
# Activate autosaving every 180 seconds to ensure that no data is lost
%autosave 180

Autosaving every 180 seconds


In [4]:
# Move working directory one level up to be in overall folder
# ATTENTION: If this is run more than once, it always moves on level up
os.chdir("..")

# Print working directory. It should be the folder in which there is the data folder
os.getcwd()

'/Users/tjark/Documents/Python/CairoPopulation.nosync/tfc-git'

# Step 1: Data loading and preparation

#### 2022 population

In [6]:
df_pop2022 = pd.read_csv('data/interim/activitychains/population+home-act_100perc.csv')

In [7]:
df_pop2022 = df_pop2022.drop(labels=['Unnamed: 0'], axis=1)

In [8]:
df_pop2022.head(3)

Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf


#### 2022 population only keeping one row per person

In [8]:
df_pop2022_ind = df_pop2022[df_pop2022['act_no'] == 0]

In [9]:
df = df_pop2022_ind.drop(labels=['age','gender','car','activities','act_no','act_id','activity',
                            'distance', 'start', 'end'], axis=1)

In [10]:
# Parse the geometry objects using wkt.loads
df['geometry'] = df['home_loc'].apply(wkt.loads)

In [11]:
gdf = gpd.GeoDataFrame(df, geometry='geometry')

#### Areas including population density data

In [9]:
areas = gpd.read_file('data/raw/eg_admin_boundaries/tfc_adm2_bounds_gcr.geojson')

In [10]:
areas['pop2022'] = areas['dens2022-scaled']*areas['area_hect']
areas['pop2030-densif'] = areas['dens2030-densif']*areas['area_hect']
areas['pop2030-sprawl'] = areas['dens2030-sprawl']*areas['area_hect']

In [11]:
areas = areas.drop(labels=['gov_name','sec_name_a', 'name', 'density', 'area_hect', 
                           'population','pop_percent','dens2022-scaled','dens2030-densif',
                          'dens2030-sprawl'], axis=1)

# Step 2: Without car ownership rate changes

#### Join areas and individuals

In [15]:
join_result = sjoin(gdf, areas, how='left', op='within')

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  join_result = sjoin(gdf, areas, how='left', op='within')


In [26]:
join_result = join_result.drop(labels=['home_loc','geometry','index_right'], axis=1)

## Population scaling
This section scales the population df_j by its scaling factor resulting from the difference in population between the 2022 and 2030 population.

In [27]:
# Create copy of joining resutl
df_j = join_result.copy()
# Assign occ 1 to base value
df_j['occ'] = 1
# Define gids as array of all unique ids
gids = join_result['gid'].unique()

### Scaling for 2030 population with densification scenario

In [251]:
# Define empty df
df_scaled = pd.DataFrame()
# Iterate through each area
for g in gids:
    # Filter df to only include values within area h
    filtered_df = df_j[df_j['gid'] == g]
    # Calculate scaling factor base by dividing 2030 population with 2022 population for area g
    scaling_factor = (areas.loc[areas['gid'] == g, 'pop2030-densif'].item()) / areas.loc[areas['gid'] == g, 'pop2022'].item()
    # Temporary scoring scaling factor to assign it to last scaling iterations
    temp = int(scaling_factor)
    # Empty df for each iterations
    random_sample = pd.DataFrame()
    # Empty df for each iterations
    random_sample_temp = pd.DataFrame()
    # Loop through scaling if and while scaling factor is over 2
    while scaling_factor > 2:
        # Define sample at 100%
        sample_size = len(filtered_df)
        # Sample 100% without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Name occurrence depending on while iteration
        random_sample_temp['occ'] = int(scaling_factor)
        # Concat dfs
        random_sample = pd.concat([random_sample, random_sample_temp], ignore_index=True)
        # Decrease scaling factor by one for next iteration 
        scaling_factor -= 1
    # Set sample size based on remaining scaling factor -1 of filtered_df length
    sample_size = round(len(filtered_df)*(scaling_factor-1))
    # Sample without replacement
    random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
    # Assign new id to occ number based on prior defined temp variable
    random_sample_temp['occ'] = temp+1
    # Concat original df_scaled df, the while df, and the leftover sample df 
    df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
    
# Concat the original df with the scaled df
df_scaled = pd.concat([df_j, df_scaled], ignore_index=True)

In [253]:
df_factors = df_scaled.groupby('person_id').max()

### Attach scaling factor to population

In [293]:
df_pop_factors = df_pop2022.merge(df_factors, on='person_id', how='left')

In [302]:
# Duplicate each row by the factor in 'occ' column
duplicated_df = df_pop_factors.loc[df_pop_factors.index.repeat(df_pop_factors['occ'])].reset_index(drop=True)

# Add a column to indicate the number of duplicates
duplicated_df['no'] = duplicated_df.groupby(['person_id','act_no']).cumcount() + 1

# Convert 'person_id' and 'no' columns to strings
duplicated_df['person_id'] = duplicated_df['person_id'].astype(str)
duplicated_df['no'] = duplicated_df['no'].astype(str)

# Concatenate 'person_id' and 'no' columns
duplicated_df['person_id'] = duplicated_df['person_id'] + '.' + duplicated_df['no']

In [318]:
pop2030_densif = duplicated_df.drop(labels=['gid', 'occ', 'no'], axis=1)

In [319]:
pop2030_densif.head(3)

Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf


In [320]:
# Export 
pop2030_densif.to_csv('data/interim/activitychains/pop2030_densif_100perc.csv', index=False)

### Scaling for 2030 population with sprawl scenario

In [29]:
# Define empty df
df_scaled = pd.DataFrame()
under1 = []
# Iterate through each area
for g in gids:
    # Filter df to only include values within area h
    filtered_df = df_j[df_j['gid'] == g]
    # Calculate scaling factor base by dividing 2030 population with 2022 population for area g
    scaling_factor = (areas.loc[areas['gid'] == g, 'pop2030-sprawl'].item()) / areas.loc[areas['gid'] == g, 'pop2022'].item()
    # Temporary scoring scaling factor to assign it to last scaling iterations
    temp = int(scaling_factor)
    # Empty df for each iterations
    random_sample = pd.DataFrame()
    # Empty df for each iterations
    random_sample_temp = pd.DataFrame()
    if scaling_factor > 1:
        # Loop through scaling if and while scaling factor is over 2
        while scaling_factor > 2:
            # Define sample at 100%
            sample_size = len(filtered_df)
            # Sample 100% without replacement
            random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
            # Name occurrence depending on while iteration
            random_sample_temp['occ'] = int(scaling_factor)
            # Concat dfs
            random_sample = pd.concat([random_sample, random_sample_temp], ignore_index=True)
            # Decrease scaling factor by one for next iteration 
            scaling_factor -= 1
        # Set sample size based on remaining scaling factor -1 of filtered_df length
        sample_size = round(len(filtered_df)*(scaling_factor-1))
        # Sample without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Assign new id to occ number based on prior defined temp variable
        random_sample_temp['occ'] = temp+1
        # Concat original df_scaled df, the while df, and the leftover sample df 
        df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
    else: 
         # Set sample size based on remaining scaling factor -1 of filtered_df length
        sample_size = round(len(filtered_df)*(scaling_factor))
        # Sample without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Assign new id to occ number based on prior defined temp variable
        random_sample_temp['occ'] = temp+1
        # Concat original df_scaled df, the while df, and the leftover sample df 
        df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
        under1.append(g)

# Create a mask to filter the DataFrame
mask = df_j['gid'].apply(lambda x: x in under1)
# Apply the mask to create a new DataFrame
df_j_temp = df_j[~(mask)]

# Concat the original df with the scaled df
df_scaled = pd.concat([df_j_temp, df_scaled], ignore_index=True)

In [30]:
df_factors = df_scaled.groupby('person_id').max()

### Attach scaling factor to population

In [31]:
df_pop_factors = df_pop2022.merge(df_factors, on='person_id', how='inner')

In [32]:
# Duplicate each row by the factor in 'occ' column
duplicated_df = df_pop_factors.loc[df_pop_factors.index.repeat(df_pop_factors['occ'])].reset_index(drop=True)

# Add a column to indicate the number of duplicates
duplicated_df['no'] = duplicated_df.groupby(['person_id','act_no']).cumcount() + 1

# Convert 'person_id' and 'no' columns to strings
duplicated_df['person_id'] = duplicated_df['person_id'].astype(str)
duplicated_df['no'] = duplicated_df['no'].astype(str)

# Concatenate 'person_id' and 'no' columns
duplicated_df['person_id'] = duplicated_df['person_id'] + '.' + duplicated_df['no']

In [33]:
pop2030_sprawl = duplicated_df.drop(labels=['gid', 'occ', 'no'], axis=1)

In [34]:
pop2030_sprawl.head(3)

Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end,pop2022,pop2030-densif,pop2030-sprawl
0,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0,682752,731584,716408
1,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0,682752,731584,716408
2,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf,682752,731584,716408


In [38]:
# Export 
pop2030_sprawl.to_csv('data/interim/activitychains/pop2030_sprawl_100perc.csv', index=False)

At this stage, two of the four scenarios are prepared. Missing are the two with changes in the car ownership

# Step 3: With changing car ownership rates

The idea is to replicate the above process but add a step before which scales the individuals with cars proportionally to their occurrence before repeating the other steps.

In [13]:
# Set target percentage of car ownership
target_perc = 0.29

In [14]:
df_clustered = pd.read_csv('data/interim/clustering/person_ids_clustered.csv')

In [15]:
df = df_pop2022.merge(df_clustered, on='person_id', how='left')

In [16]:
df_grouped = df.groupby('person_id').first().reset_index()

In [17]:
current_perc = len(df_grouped[df_grouped['car'] == 1])/len(df_grouped)
print(current_perc)

0.20072404110403938


In [18]:
pop_no = len(df_grouped)
car_no = len(df_grouped[df_grouped['car'] == 1])

In [19]:
perc = current_perc
while perc < .29:
    pop_no += 1
    car_no += 1
    perc = car_no/pop_no
    
add_cars = car_no-len(df_grouped[df_grouped["car"] == 1])
print(f'Final number of people with cars is {car_no}')
print('––––––––––––––––––––––––––––––––––––––––')
print(f'This means addition {add_cars}')

Final number of people with cars is 4991292
––––––––––––––––––––––––––––––––––––––––
This means addition 1922440


In [20]:
new_car_people = df_grouped[df_grouped['car'] == 1].sample(n=add_cars, replace=False, random_state=42)

In [21]:
new_car_people_ids = new_car_people['person_id'].unique()

In [22]:
new_car_people_ids_set = set(new_car_people_ids)

In [23]:
df['person_id'] = df['person_id'].astype(int)

In [24]:
new_car_people_ids_set = {int(value) for value in new_car_people_ids_set}

In [25]:
# # Create a mask to filter the DataFrame
mask = df['person_id'].isin(new_car_people_ids_set)
print(mask.sum())

7058758


In [26]:
# Apply the mask to create a new DataFrame
df_temp = df[mask]

In [27]:
# Convert the 'person_id' column to string type
df_temp['person_id'] = df_temp['person_id'].astype(str)

# Add a suffix "b" to the 'person_id' column
df_temp['person_id'] = df_temp['person_id'] + "b"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['person_id'] = df_temp['person_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['person_id'] = df_temp['person_id'] + "b"


In [28]:
df_temp['person_id'].nunique() # should be add_cars, here 1922441

1922440

In [29]:
# Concat the original df with the car df
df_pop2022_car = pd.concat([df, df_temp], ignore_index=True)

In [31]:
df_pop2022_car.head(3)

Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end,CLUSTER
0,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0,5
1,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0,5
2,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf,5


## Repeat adpated previous steps for geographical scaling

#### 2022 population with car reweighting only keeping one row per person

In [32]:
df_ind = df_pop2022_car[df_pop2022_car['act_no'] == 0]

In [33]:
df = df_ind.drop(labels=['age','gender','car','activities','act_no','act_id','activity',
                            'distance', 'start', 'end'], axis=1)

In [34]:
# Parse the geometry objects using wkt.loads
df['geometry'] = df['home_loc'].apply(wkt.loads)

In [35]:
gdf = gpd.GeoDataFrame(df, geometry='geometry')

#### Join areas and individuals

In [36]:
join_result = sjoin(gdf, areas, how='left', op='within')

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  join_result = sjoin(gdf, areas, how='left', op='within')


In [37]:
join_result.head(3)

Unnamed: 0,person_id,home_loc,CLUSTER,geometry,index_right,gid,pop2022,pop2030-densif,pop2030-sprawl
0,0,POINT (31.331430700000002 29.845431899999987),5,POINT (31.33143 29.84543),0,2,682752,731584,716408
3,1,POINT (31.313874 29.8144589),2,POINT (31.31387 29.81446),0,2,682752,731584,716408
6,2,POINT (31.3075189 29.860689800000003),8,POINT (31.30752 29.86069),0,2,682752,731584,716408


In [38]:
join_result = join_result.drop(labels=['home_loc','geometry','index_right'], axis=1)

## Population scaling
This section scales the population df_j by its scaling factor resulting from the difference in population between the 2022 and 2030 population.

In [39]:
# Create copy of joining resutl
df_j = join_result.copy()
# Assign occ 1 to base value
df_j['occ'] = 1
# Define gids as array of all unique ids
gids = join_result['gid'].unique()

### Scaling for 2030 population with densification scenario

In [40]:
df_j_grouped = df_j.groupby('gid').count()

In [41]:
df_j_grouped = df_j_grouped.drop(labels=['person_id','CLUSTER','pop2030-densif',
                                         'pop2030-sprawl', 'occ'], axis=1)

In [42]:
df_j_grouped = df_j_grouped.reset_index()

In [131]:
# Define empty df
df_scaled = pd.DataFrame()
under1 = []
# Iterate through each area
for g in gids:
   # Filter df to only include values within area h
    filtered_df = df_j[df_j['gid'] == g]
    # Calculate scaling factor base by dividing 2030 population with 2022 population for area g
    scaling_factor = (areas.loc[areas['gid'] == g, 'pop2030-densif'].item()) / (df_j_grouped.loc[df_j_grouped['gid'] == g, 'pop2022'].item()/0.675)
    # Temporary scoring scaling factor to assign it to last scaling iterations
    temp = int(scaling_factor)
    # Empty df for each iterations
    random_sample = pd.DataFrame()
    # Empty df for each iterations
    random_sample_temp = pd.DataFrame()
    if scaling_factor > 1:
        # Loop through scaling if and while scaling factor is over 2
        while scaling_factor > 2:
            # Define sample at 100%
            sample_size = len(filtered_df)
            # Sample 100% without replacement
            random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
            # Name occurrence depending on while iteration
            random_sample_temp['occ'] = int(scaling_factor)
            # Concat dfs
            random_sample = pd.concat([random_sample, random_sample_temp], ignore_index=True)
            # Decrease scaling factor by one for next iteration 
            scaling_factor -= 1
        # Set sample size based on remaining scaling factor -1 of filtered_df length
        sample_size = round(len(filtered_df)*(scaling_factor-1))
        # Sample without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Assign new id to occ number based on prior defined temp variable
        random_sample_temp['occ'] = temp+1
        # Concat original df_scaled df, the while df, and the leftover sample df 
        df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
    else: 
        # Set sample size based on remaining scaling factor -1 of filtered_df length
        sample_size = round(len(filtered_df)*(scaling_factor))
        # Sample without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Assign new id to occ number based on prior defined temp variable
        random_sample_temp['occ'] = temp+1
        # Concat original df_scaled df, the while df, and the leftover sample df 
        df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
        under1.append(g)

# Create a mask to filter the DataFrame
mask = df_j['gid'].apply(lambda x: x in under1)
# Apply the mask to create a new DataFrame
df_j_temp = df_j[~(mask)]

# Concat the original df with the scaled df
df_scaled = pd.concat([df_j_temp, df_scaled], ignore_index=True)

In [132]:
len(df_scaled)

17188337

In [133]:
df_factors = df_scaled.groupby('person_id').max()

### Attach scaling factor to population

In [134]:
df_pop2022_car['person_id'].nunique()

17211351

In [135]:
df_factors = df_factors.reset_index()

In [137]:
df_pop_factors = df_pop2022_car.merge(df_factors, on='person_id', how='inner')

In [138]:
# Duplicate each row by the factor in 'occ' column
duplicated_df = df_pop_factors.loc[df_pop_factors.index.repeat(df_pop_factors['occ'])].reset_index(drop=True)

In [140]:
# Add a column to indicate the number of duplicates
duplicated_df['no'] = duplicated_df.groupby(['person_id','act_no']).cumcount() + 1

# Convert 'person_id' and 'no' columns to strings
duplicated_df['person_id'] = duplicated_df['person_id'].astype(str)
duplicated_df['no'] = duplicated_df['no'].astype(str)

# Concatenate 'person_id' and 'no' columns
duplicated_df['person_id'] = duplicated_df['person_id'] + '.' + duplicated_df['no']

In [141]:
pop2030_densif_car = duplicated_df.drop(labels=['CLUSTER_y', 'gid', 'pop2022', 
                                                'pop2030-densif', 'pop2030-sprawl',
                                                'occ', 'no'], axis=1)
pop2030_densif_car = pop2030_densif_car.rename(columns={'CLUSTER_x': 'CLUSTER'})

In [142]:
pop2030_densif_car_grouped = pop2030_densif_car.groupby('person_id').first()

In [143]:
print(len(pop2030_densif_car_grouped))
print(pop2030_densif_car_grouped['CLUSTER'].value_counts())

17188337
9    3934007
8    2694784
6    2524357
5    1949368
7    1448380
1    1441868
3    1306511
2    1176338
0     708332
4       4392
Name: CLUSTER, dtype: int64


In [144]:
pop2030_densif_car = pop2030_densif_car.drop(labels=['CLUSTER'], axis=1)

In [145]:
pop2030_densif_car.head(3)

Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf


In [146]:
pop2030_densif_car['person_id'].nunique()

17188337

In [147]:
# Export 
pop2030_densif_car.to_csv('data/interim/activitychains/pop2030_densif_car_100perc.csv', index=False)

### Repeating steps for sprawl scenario with car ownership rate change

In [148]:
# Define empty df
df_scaled = pd.DataFrame()
under1 = []
# Iterate through each area
for g in gids:
   # Filter df to only include values within area h
    filtered_df = df_j[df_j['gid'] == g]
    # Calculate scaling factor base by dividing 2030 population with 2022 population for area g
    scaling_factor = (areas.loc[areas['gid'] == g, 'pop2030-sprawl'].item()) / (df_j_grouped.loc[df_j_grouped['gid'] == g, 'pop2022'].item()/0.675)
    # Temporary scoring scaling factor to assign it to last scaling iterations
    temp = int(scaling_factor)
    # Empty df for each iterations
    random_sample = pd.DataFrame()
    # Empty df for each iterations
    random_sample_temp = pd.DataFrame()
    if scaling_factor > 1:
        # Loop through scaling if and while scaling factor is over 2
        while scaling_factor > 2:
            # Define sample at 100%
            sample_size = len(filtered_df)
            # Sample 100% without replacement
            random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
            # Name occurrence depending on while iteration
            random_sample_temp['occ'] = int(scaling_factor)
            # Concat dfs
            random_sample = pd.concat([random_sample, random_sample_temp], ignore_index=True)
            # Decrease scaling factor by one for next iteration 
            scaling_factor -= 1
        # Set sample size based on remaining scaling factor -1 of filtered_df length
        sample_size = round(len(filtered_df)*(scaling_factor-1))
        # Sample without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Assign new id to occ number based on prior defined temp variable
        random_sample_temp['occ'] = temp+1
        # Concat original df_scaled df, the while df, and the leftover sample df 
        df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
    else: 
        # Set sample size based on remaining scaling factor -1 of filtered_df length
        sample_size = round(len(filtered_df)*(scaling_factor))
        # Sample without replacement
        random_sample_temp = filtered_df.sample(n=sample_size, replace=False, random_state=42)
        # Assign new id to occ number based on prior defined temp variable
        random_sample_temp['occ'] = temp+1
        # Concat original df_scaled df, the while df, and the leftover sample df 
        df_scaled = pd.concat([df_scaled, random_sample, random_sample_temp], ignore_index=True)
        under1.append(g)

# Create a mask to filter the DataFrame
mask = df_j['gid'].apply(lambda x: x in under1)
# Apply the mask to create a new DataFrame
df_j_temp = df_j[~(mask)]

# Concat the original df with the scaled df
df_scaled = pd.concat([df_j_temp, df_scaled], ignore_index=True)

In [149]:
len(df_scaled)

17189369

In [150]:
df_factors = df_scaled.groupby('person_id').max()

### Attach scaling factor to population

In [151]:
df_pop2022_car['person_id'].nunique()

17211351

In [152]:
df_factors = df_factors.reset_index()

In [153]:
df_pop_factors = df_pop2022_car.merge(df_factors, on='person_id', how='inner')

In [155]:
# Duplicate each row by the factor in 'occ' column
duplicated_df = df_pop_factors.loc[df_pop_factors.index.repeat(df_pop_factors['occ'])].reset_index(drop=True)

In [156]:
# Add a column to indicate the number of duplicates
duplicated_df['no'] = duplicated_df.groupby(['person_id','act_no']).cumcount() + 1

# Convert 'person_id' and 'no' columns to strings
duplicated_df['person_id'] = duplicated_df['person_id'].astype(str)
duplicated_df['no'] = duplicated_df['no'].astype(str)

# Concatenate 'person_id' and 'no' columns
duplicated_df['person_id'] = duplicated_df['person_id'] + '.' + duplicated_df['no']

In [157]:
pop2030_sprawl_car = duplicated_df.drop(labels=['CLUSTER_y', 'gid', 'pop2022', 
                                                'pop2030-densif', 'pop2030-sprawl',
                                                'occ', 'no'], axis=1)
pop2030_sprawl_car = pop2030_sprawl_car.rename(columns={'CLUSTER_x': 'CLUSTER'})

In [158]:
pop2030_sprawl_car_grouped = pop2030_sprawl_car.groupby('person_id').first()

In [159]:
print(len(pop2030_sprawl_car_grouped))
print(pop2030_sprawl_car_grouped['CLUSTER'].value_counts())

17189369
9    3941460
8    2700034
6    2523523
5    1942833
7    1444597
1    1443080
3    1298339
2    1179395
0     711735
4       4373
Name: CLUSTER, dtype: int64


In [160]:
pop2030_sprawl_car = pop2030_sprawl_car.drop(labels=['CLUSTER'], axis=1)

In [162]:
pop2030_sprawl_car.head(3)

Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,0.1,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf


In [161]:
pop2030_sprawl_car['person_id'].nunique()

17189369

In [163]:
# Export 
pop2030_sprawl_car.to_csv('data/interim/activitychains/pop2030_sprawl_car_100perc.csv', index=False)