## SEN1721 2025 Group assignment 3
#### This notebook aims to get you started with the assignment

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path

# Biogeme
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, PanelLikelihoodTrajectory

# Import custom estimation functions for Biogeme
from bio_estimation_fcns import estimate_mnl, estimate_LC, print_results, create_collage

# Configure how pandas prints DataFrames: no limit on columns, at most 100 rows
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Set the seeds for reproducibility
import random
from random import random as rand

SEED = 42

# Python's built-in generator (this is the one behind `rand`)
random.seed(SEED)

# NumPy's global generator. pandas' DataFrame.sample() draws from it when no
# random_state is passed, so seeding only `random` would leave the image
# samples shown further down different on every run.
np.random.seed(SEED)

In [None]:
# Define the paths to the data sets.
# The paths are joined with pathlib's "/" operator instead of string concatenation,
# so separators are handled by pathlib (also when the working directory is the root).
working_dir = Path.cwd()

# Path to the folder of the choice data set and segmentation data set
data_path = working_dir / 'data' # Make sure to change this to your own path
print(data_path)

# Path to the folder of the images
img_path = working_dir / 'data' / 'raw_images' # Make sure to change this to your own path
print(img_path)

### 1. Load the choice data

In [None]:
# Read the choice data set into a DataFrame
choice_file = data_path / 'SEN1721_LC_group_assignment_2025.csv'
df = pd.read_csv(choice_file)

In [None]:
# Preview the first rows of the choice data
df.head()

#### 1.1 Descriptive analysis of choice data

In [None]:
# Summarise the panel structure: how many distinct RIDs there are, and how
# the number of rows (choice tasks) per RID is distributed
n_individuals = df['RID'].nunique()
print(f"There are {n_individuals} individuals in the dataset")

rows_per_individual = df['RID'].value_counts()
print("The number of choice tasks per individual is:")
print(rows_per_individual.value_counts())

In [None]:
# Summary statistics (pandas `describe`) of the choice data
df.describe()

### 2. Load the image segmentation & depth data

In [None]:
# Read the image segmentation & depth data and preview the first rows
image_file = data_path / 'images_SEN1721.csv'
df_img = pd.read_csv(image_file)
df_img.head()

#### 2.1 Descriptive analysis of segmentation data

In [None]:
# Summary statistics (pandas `describe`) of the image segmentation & depth data
df_img.describe()

In [None]:
# Show a random sample of images whose value for a segmentation column is at or
# above a chosen quantile of that column
N_images = 30 # Define the number of images to show
cols = 5 # Define the number of columns in the collage (cols x rows <= N_images)
rows = 5 # Define the number of rows in the collage (cols x rows <= N_images)

# Define a list of segmentation to show
cols_show = ['BUILDING','GRASS']# ,'WATER','ROAD','SKY','TREES','PERSON','TRUCK'] # Modify to show different segmentations

# Define the threshold level for the quantile
quantile_threshold_level = 0.99 # Modify this threshold level to show different images

# Loop over the list
for col in cols_show:

    # Set the threshold level
    threshold = df_img[col].quantile(quantile_threshold_level)

    # Filter the images based on the threshold condition
    df_cond = df_img[df_img[col] >= threshold]
    print(f'There are {len(df_cond)} images satisfying the condition {col} >= {threshold:0.3f}')

    # sample() without replacement needs at least N_images rows. Having exactly
    # N_images rows is enough, so only strictly fewer rows is a problem.
    if len(df_cond) < N_images:
        print(f'There are not enough images to show: {len(df_cond)} < {N_images}')
        continue

    # The collage grid must not have more cells than there are sampled images
    if (cols * rows) > N_images:
        print(f'cols * rows = {cols * rows} is greater than N_images = {N_images}. Adjust the number of columns and rows')
        continue

    # Randomly sample N_images from df_cond
    df_sample = df_cond.sample(N_images)

    # Show the images in a collage
    create_collage(img_path,
                   df_sample['IMG'],
                   txt = df_sample['IMG'],
                   cols = cols,
                   rows = rows)

#### 2.2 Descriptive analysis of depth data

In [None]:
# Collect the names of all columns of df_img that contain the substring 'DIST'
col_depth = [
    column_name
    for column_name in df_img.columns
    if 'DIST' in column_name
]
col_depth

In [None]:
# Draw one histogram per depth feature, side by side on a shared y-axis
cols_depth_fig = ['MIN_DISTANCE', 'MEAN_DISTANCE', 'MAX_DISTANCE']
fig, ax = plt.subplots(1, 3, figsize=(12, 4), sharey=True)

for panel, feature in zip(ax, cols_depth_fig):
    panel.hist(df_img[feature], bins=50, edgecolor='black', rwidth=0.80)
    panel.set_xlabel(feature)
    panel.set_ylabel('Count')
    # Grid lines are drawn below the bars
    panel.grid()
    panel.set_axisbelow(True)

plt.show()

In [None]:
# Show a random sample of the images flagged (value 1) in a depth indicator column
N_images = 30 # Define the number of images to show
cols = 5 # Define the number of columns in the collage (cols x rows <= N_images)
rows = 5 # Define the number of rows in the collage (cols x rows <= N_images)

# Define a list of depth statistics to show
cols_depth_collage = ['MIN_MAX_DIST', 'MAX_MAX_DIST']

# Loop over the list
for col in cols_depth_collage:

    # Filter the images based on the condition
    df_cond = df_img[df_img[col] == 1]

    # sample() without replacement raises a ValueError when asked for more rows
    # than are available, so report the shortage and move on instead.
    if len(df_cond) < N_images:
        print(f'There are not enough images to show: {len(df_cond)} < {N_images}')
        continue

    # The collage grid must not have more cells than there are sampled images
    if (cols * rows) > N_images:
        print(f'cols * rows = {cols * rows} is greater than N_images = {N_images}. Adjust the number of columns and rows')
        continue

    # Randomly sample N_images from df_cond
    df_sample = df_cond.sample(N_images)

    # Show the images in a collage
    print(f'Random sample of {N_images} images from the 10% quantile of images with the {col}')
    create_collage(img_path,
                   df_sample['IMG'],
                   txt = df_sample['IMG'],
                   cols = cols,
                   rows = rows)

### 3. Estimate discrete choice models

In [None]:
# Create the Biogeme database object
# Keep only the columns that are not of dtype 'object'
non_object_mask = df.dtypes != 'object'
cols = df.columns[non_object_mask]
biodata = db.Database('data', df.loc[:, cols])

# Create the Biogeme variables: every database column becomes a global name,
# so that it can be referred to directly in the utility functions
globals().update({c: biodata.variables[c] for c in cols})

#### 3.1 Linear-additive RUM-MNL model:
Note that in the utility function, we divide the monthly cost (C) and the travel time (TT) by 225 and 15 respectively. These are the maximum values for these variables in the data. Hence they are rescaled such that they are between -1 and 1. This makes the estimation process more stable. <br>
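However, it is important to remember that the estimated coefficients must be divided by 225 and 15, respectively, to obtain the effect per unit of cost and per unit of travel time, since $\beta \cdot (C/225) = (\beta/225) \cdot C$.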

In [None]:
# Give a name to the model (it is passed on to estimate_mnl)
model_name = 'Linear-additive RUM-MNL, with min-max depth combinations'

# Define the model parameters, using the function "Beta()", in which you must define:
# the name, the starting value, the lower bound, the upper bound, and a status flag
# (0 = the parameter is estimated, 1 = it is kept fixed at its starting value)
B_hhc = Beta('B_hhc', 0, None, None, 0)
B_tti = Beta('B_tti', 0, None, None, 0)
B_max_min_dist = Beta('B_max_min_dist', 0, None, None, 0)
B_min_max_dist = Beta('B_min_max_dist', 0, None, None, 0)
B_max_max_dist = Beta('B_max_max_dist', 0, None, None, 0)

# Define the utility functions (C and TT are rescaled by 225 and 15)
V1 = (
    B_hhc * (C1/225)
    + B_tti * (TT1/15)
    + B_max_min_dist * MAX_MIN_DIST1
    + B_min_max_dist * MIN_MAX_DIST1
    + B_max_max_dist * MAX_MAX_DIST1
)
V2 = (
    B_hhc * (C2/225)
    + B_tti * (TT2/15)
    + B_max_min_dist * MAX_MIN_DIST2
    + B_min_max_dist * MIN_MAX_DIST2
    + B_max_max_dist * MAX_MAX_DIST2
)

# Associate utility functions with the numbering of alternatives
V = {1: V1, 2: V2}

# Associate the availability conditions with the alternatives (both always available)
AV = {1: 1, 2: 1}

# Estimate the model and print the results
results = estimate_mnl(V, AV, CHOICE, biodata, model_name)
print_results(results)

#### 3.2 Latent class models:

In [None]:
# Create a second Biogeme database for the latent class models, again with only
# the columns that are not of dtype 'object'
non_object_mask = df.dtypes != 'object'
cols = df.columns[non_object_mask]
biodata_panel = db.Database('data_panel', df.loc[:, cols])

# Declare the data as panel data, with "RID" identifying the individuals
biodata_panel.panel("RID")

# Create the Biogeme variables: every database column becomes a global name
globals().update({c: biodata_panel.variables[c] for c in cols})

In [None]:
# Give a name to the model (it is passed on to estimate_LC)
model_name = 'LC with 2 classes'

# Define the model parameters, using the function "Beta()", in which you must define:
# the name, the starting value, the lower bound, the upper bound, and a status flag
# (0 = the parameter is estimated, 1 = it is kept fixed at its starting value)

# Class 0
B_hhc_0 = Beta('B_hhc_0', -0.05, None, None, 0)
B_tti_0 = Beta('B_tti_0', -0.05, None, None, 0)
B_max_min_dist_0 = Beta('B_max_min_dist_0', 0, None, None, 0)
B_min_max_dist_0 = Beta('B_min_max_dist_0', 0, None, None, 0)
B_max_max_dist_0 = Beta('B_max_max_dist_0', 0, None, None, 0)

# Class 1 (the starting values differ from those of class 0)
B_hhc_1 = Beta('B_hhc_1', -0.10, None, None, 0)
B_tti_1 = Beta('B_tti_1', -0.10, None, None, 0)
B_max_min_dist_1 = Beta('B_max_min_dist_1', 0, None, None, 0)
B_min_max_dist_1 = Beta('B_min_max_dist_1', 0, None, None, 0)
B_max_max_dist_1 = Beta('B_max_max_dist_1', 0, None, None, 0)

# Define the membership model parameters
delta_0 = Beta('delta_0',  0   , None, None, 1) # status 1: fixed to 0
delta_1 = Beta('delta_1',  0.10, None, None, 0)
gamma_gender_1 = Beta('gamma_gender_1', 0, None, None, 0)

# Define utility functions for each class
V1_0 = B_hhc_0 * (C1 / 225) + B_tti_0 * (TT1 / 15) + B_max_min_dist_0 * MAX_MIN_DIST1 + B_min_max_dist_0 * MIN_MAX_DIST1 + B_max_max_dist_0 * MAX_MAX_DIST1
V2_0 = B_hhc_0 * (C2 / 225) + B_tti_0 * (TT2 / 15) + B_max_min_dist_0 * MAX_MIN_DIST2 + B_min_max_dist_0 * MIN_MAX_DIST2 + B_max_max_dist_0 * MAX_MAX_DIST2

V1_1 = B_hhc_1 * (C1 / 225) + B_tti_1 * (TT1 / 15) + B_max_min_dist_1 * MAX_MIN_DIST1 + B_min_max_dist_1 * MIN_MAX_DIST1 + B_max_max_dist_1 * MAX_MAX_DIST1
V2_1 = B_hhc_1 * (C2 / 225) + B_tti_1 * (TT2 / 15) + B_max_min_dist_1 * MAX_MIN_DIST2 + B_min_max_dist_1 * MIN_MAX_DIST2 + B_max_max_dist_1 * MAX_MAX_DIST2

# Create a dictionary associating utility functions with the numbering of alternatives in the "choice" column
V_0 = {1: V1_0, 2: V2_0}
V_1 = {1: V1_1, 2: V2_1}

# Put the dictionaries of utility functions in a list
V = [V_0, V_1]

# Create a dictionary to describe the availability conditions of each alternative, where 1 indicates that the alternative is available, and 0 indicates that the alternative is not available.
AV = {1: 1, 2: 1}

# Membership value-functions
# The term "PanelLikelihoodTrajectory(GENDER)**(1/(PanelLikelihoodTrajectory(T)))" looks a bit strange, but it is the "trick" to include the covariate in the membership function.
# It first multiplies the covariate over all rows in the database for each individual: PanelLikelihoodTrajectory(GENDER). Then, it undoes the multiplication by taking it to the power of (1/(PanelLikelihoodTrajectory(T))).
# Unlike in-class assignment 2, in these data the number of choice tasks per individual is NOT the same for all individuals. Therefore, a variable called "T" was added to the data. When T is taken to the power of the number of rows per individual, it generates the number of choice tasks per individual.
# For example, suppose an individual has 3 choice tasks. Then, the value of PanelLikelihoodTrajectory(T) must equal 3. For T**3 to equal 3, T must be 3**(1/3) = 1.442250.
# NOTE(review): this example previously quoted T = 1.732051, but 1.732051**3 is about 5.196, not 3 - confirm the values stored in the T column.
# The result is the covariate for each individual, available at the panel level (as opposed to the observation level).
nu_0 = delta_0 # Note: one class needs to be fixed to 0. delta_0 is fixed to 0
nu_1 = delta_1 + gamma_gender_1 * PanelLikelihoodTrajectory(GENDER)**(1/(PanelLikelihoodTrajectory(T)))

# Put membership functions in a list
nu = [nu_0, nu_1]

# Estimate the LC model
results_LC = estimate_LC(V, AV, nu, CHOICE, biodata_panel, model_name)

# Print the results
print_results(results_LC)