# Create populations samples outside of MATSim

### Import packages and data

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import gzip
import shutil


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


## Population part

In [35]:
df_raw = pd.read_csv('/Users/tjark/Documents/Python/CairoPopulation.nosync/tfc-git/data/interim/activitychains/population+home-act_100perc.csv')

In [5]:
# df_raw = df_raw.drop('Unnamed: 0', axis=1)

In [36]:
df_raw.head()

Unnamed: 0.1,Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,1,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,2,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf
3,3,1,26-35,female,POINT (31.313874 29.8144589),1,3,0,1.0,home,0.0,-inf,535.0
4,4,1,26-35,female,POINT (31.313874 29.8144589),1,3,1,1.1,work,14.9372,571.0,731.0


In [37]:
df_raw.head()

Unnamed: 0.1,Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,1,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,2,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf
3,3,1,26-35,female,POINT (31.313874 29.8144589),1,3,0,1.0,home,0.0,-inf,535.0
4,4,1,26-35,female,POINT (31.313874 29.8144589),1,3,1,1.1,work,14.9372,571.0,731.0


### Delete entries where missing first activtiy

In [38]:
# Check where person has no first activity with 0
all_person_ids = set(df_raw['person_id'].unique())
filtered_person_ids = set(df_raw[df_raw['act_no'] < 1]['person_id'].unique())

# Entries only in df_raw['person_id'] but not in df_raw[df_raw['act_no'] < 1]['person_id']
only_in_all = all_person_ids - filtered_person_ids

In [39]:
# Create a mask for locations where person is saved without 0 activity
mask0 = df_raw['person_id'].isin(only_in_all)
# delete entries where no first activity
df_raw = df_raw[~mask0]

In [40]:
len(df_raw)

53524392

In [41]:
df_raw.head(3)

Unnamed: 0.1,Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end
0,0,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,0,0.0,home,0.0,-inf,575.0
1,1,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,1,0.1,personal,5.902421,589.0,1132.0
2,2,0,>75,female,POINT (31.331430700000002 29.845431899999987),1,3,2,0.2,home,5.902421,1146.0,inf


### Verify NAs etc.

In [42]:
if len(df_raw[(df_raw['start'].isna() & df_raw['act_no'] != 0)]) == 0:
    # Replace NaN values in 'start' column with -inf
    # df_raw['start'].fillna(-np.inf, inplace=True)
    print('all good')
else:
    print('Ensure that NAs are only at start of first trip of day.')

all good


In [43]:
if len(df_raw[(df_raw['end'].isna() & (df_raw['act_no'] == (df_raw['activities']+1)))]) == 0:
    # Replace NaN values in 'start' column with -inf
    # df_raw['end'].fillna(np.inf, inplace=True)
    print('all good')
else:
    print('Ensure that NAs are only at end of last trip of day.')

all good


### Create reproducible sample indices at 0.1, 1, and 5%

In [44]:
person_ids = df_raw['person_id'].unique()

In [45]:
frac = 0.001  # Fraction to sample
random_state = 1234  # Random state for reproducibility

# Set the random seed
np.random.seed(random_state)

# Generate random indices
num_samples = int(len(person_ids) * frac)
sample_indices = np.random.choice(len(person_ids), num_samples, replace=False)

# Select the corresponding elements from the array
person_ids_0p1perc = person_ids[sample_indices]

In [46]:
# frac = 0.01  # Fraction to sample
# random_state = 1234  # Random state for reproducibility

# # Set the random seed
# np.random.seed(random_state)

# # Generate random indices
# num_samples = int(len(person_ids) * frac)
# sample_indices = np.random.choice(len(person_ids), num_samples, replace=False)

# # Select the corresponding elements from the array
# person_ids_1perc = person_ids[sample_indices]

In [47]:
# frac = 0.05  # Fraction to sample
# random_state = 1234  # Random state for reproducibility

# # Set the random seed
# np.random.seed(random_state)

# # Generate random indices
# num_samples = int(len(person_ids) * frac)
# sample_indices = np.random.choice(len(person_ids), num_samples, replace=False)

# # Select the corresponding elements from the array
# person_ids_5perc = person_ids[sample_indices]

### Sample underlying database

In [48]:
# 0.001 sample

# Create a boolean mask to check if 'person_id' is in 'person_ids_0p001'
mask1 = df_raw['person_id'].isin(person_ids_0p1perc)

# Apply the mask to filter the DataFrame
df1 = df_raw[mask1]

In [49]:
# # 0.01 sample

# # Create a boolean mask to check if 'person_id' is in 'person_ids_0p001'
# mask2 = df_raw['person_id'].isin(person_ids_1perc)

# # Apply the mask to filter the DataFrame
# df2 = df_raw[mask2]

In [50]:
# # 0.05 sample

# # Create a boolean mask to check if 'person_id' is in 'person_ids_0p001'
# mask3 = df_raw['person_id'].isin(person_ids_5perc)

# # Apply the mask to filter the DataFrame
# df3 = df_raw[mask3]

### Working

In [51]:
# For 0.1 % 
print(f'Verification for {df1}')
print('––––––––––––––––')
print(f"Same length for 0 and 1: {len(df1[df1['act_no'] == 0]) == len(df1[df1['act_no'] == 0])}")
df_temp = df1[df1['act_no'] == 0]
df_temp = df_temp.reset_index()
df1['person_id_old'] = df1['person_id']
df_temp = df_temp[['index','person_id']]
df1 = pd.merge(df1, df_temp, on='person_id')
df1['person_id'] = df1['index']
df1 = df1.drop('index', axis=1)

Verification for           Unnamed: 0  person_id    age  gender  \
3550            3550       1395  18-25  female   
3551            3551       1395  18-25  female   
3552            3552       1395  18-25  female   
7125            7125       2764  26-35  female   
7126            7126       2764  26-35  female   
...              ...        ...    ...     ...   
53504860    53504895   20644054    <18  female   
53504861    53504896   20644054    <18  female   
53518255    53518290   20649335    <18    male   
53518256    53518291   20649335    <18    male   
53518257    53518292   20649335    <18    male   

                                               home_loc  car  activities  \
3550              POINT (31.300326400000003 29.8105904)    0           3   
3551              POINT (31.300326400000003 29.8105904)    0           3   
3552              POINT (31.300326400000003 29.8105904)    0           3   
7125      POINT (31.305176800000005 29.840169799999998)    1           3   
71

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['person_id_old'] = df1['person_id']


In [52]:
# # For 1 % 
# print(f'Verification for {df2}')
# print('––––––––––––––––')
# print(f"Same length for 0 and 1: {len(df2[df2['act_no'] == 0]) == len(df2[df2['act_no'] == 0])}")
# df_temp = df2[df2['act_no'] == 0]
# df_temp = df_temp.reset_index()
# df2['person_id_old'] = df2['person_id']
# df_temp = df_temp[['index','person_id']]
# df2 = pd.merge(df2, df_temp, on='person_id')
# df2['person_id'] = df2['index']
# df2 = df2.drop('index', axis=1)

In [53]:
# # For 5 % 
# print(f'Verification for {df3}')
# print('––––––––––––––––')
# print(f"Same length for 0 and 1: {len(df3[df3['act_no'] == 0]) == len(df3[df3['act_no'] == 0])}")
# df_temp = df3[df3['act_no'] == 0]
# df_temp = df_temp.reset_index()
# df3['person_id_old'] = df3['person_id']
# df_temp = df_temp[['index','person_id']]
# df3 = pd.merge(df3, df_temp, on='person_id')
# df3['person_id'] = df3['index']
# df3 = df3.drop('index', axis=1)

In [54]:
df1.head(3)

Unnamed: 0.1,Unnamed: 0,person_id,age,gender,home_loc,car,activities,act_no,act_id,activity,distance,start,end,person_id_old
0,3550,3550,18-25,female,POINT (31.300326400000003 29.8105904),0,3,0,1395.0,home,0.0,-inf,468.0,1395
1,3551,3550,18-25,female,POINT (31.300326400000003 29.8105904),0,3,1,1395.1,work,25.669545,530.0,1000.0,1395
2,3552,3550,18-25,female,POINT (31.300326400000003 29.8105904),0,3,2,1395.2,home,25.669545,1062.0,inf,1395


In [55]:
# df2.head(3)

In [56]:
# df3.head(3)

In [57]:
df1['start'] = df1['start'].replace(-np.inf, np.nan)
df1['end'] = df1['end'].replace(np.inf, np.nan)

In [58]:
# df2['start'] = df2['start'].replace(-np.inf, np.nan)
# df2['end'] = df2['end'].replace(np.inf, np.nan)

In [59]:
# df3['start'] = df3['start'].replace(-np.inf, np.nan)
# df3['end'] = df3['end'].replace(np.inf, np.nan)

In [2]:
df1.head(3)

In [149]:
# df1 = df1.drop(['pop2022','pop2030-densif','pop2030-sprawl'], axis=1)

### Lines below were needed for s0a 1 and 5% only

In [None]:
# df = data[(data['start'].isna()) & (data['act_no'] != 0)]['person_id']
# df = df.reset_index()
# df.drop(columns=['index'], inplace=True)
# data = data[~data['person_id'].isin(df['person_id'])]
# data.drop(columns=['Unnamed: 0'], inplace=True)

### Export files

In [67]:
df1.to_csv('/Users/tjark/Documents/MATSim/Cairo.nosync/ile-de-france/data/data_cairo/population+home-act_0p1perc.csv')

In [61]:
# df2.to_csv('/Users/tjark/Documents/MATSim/Cairo.nosync/ile-de-france/data/data_cairo/pop2030_sprawl_car_1perc.csv')

In [34]:
# df3.to_csv('/Users/tjark/Documents/MATSim/Cairo.nosync/ile-de-france/data/data_cairo/pop2030_sprawl_car_5perc.csv')

In [109]:
# df_raw.to_csv('/Users/tjark/Documents/MATSim/Cairo.nosync/ile-de-france/data/data_cairo/population+home-act_100perc.csv')