# Cars: Getting Started

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import pyblp
sns.set_theme()
import matplotlib.pyplot as plt

pyblp.options.digits = 2
pyblp.options.verbose = False

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [3]:
cars = pd.read_csv('cars.csv') # this reads the *balanced* dataset (i.e. J = 40 products per market always)
# cars = pd.read_excel('cars.xlsx') # this reads the *unbalanced* dataset (i.e. J varies over time)

### No data for France pre 1990. Average growth in adult fraction from other countries applied each year before

In [4]:
AdultFrac = pd.read_excel("FracOver20.xlsx", index_col = 0)
cars['adults'] = None
for idx in cars.index:
    cars['adults'][idx] = AdultFrac[cars['ma'][idx]][cars['ye'][idx]]
cars

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  cars['adults'][idx] = AdultFrac[cars['ma'][idx]][cars['ye'][idx]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,tax,pop,ngdp,rgdp,engdp,ergdp,engdpc,ergdpc,inc,adults
0,70,1,15,14,2,audi 100/200,audi,100/200,2,4,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
1,70,1,26,35,4,citroen 2 CV 6 - 2 CV 4,citroen,2CV6,1,3,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
2,70,1,36,36,4,citroen dyane,citroen,dyane,1,3,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
3,70,1,64,67,7,fiat 128,fiat,128,3,5,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
4,70,1,71,80,8,ford escort,ford,escort,2,4,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,99,5,914,161,16,mercedes A,mercedes,A,2,4,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551
5996,99,5,936,80,8,ford focus,ford,focus,7,4,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551
5997,99,5,951,208,20,peugeot 206,peugeot,206,1,3,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551
5998,99,5,953,282,29,toyota avensis,toyota,avensis,4,12,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551


We estimate that 77% of the adult population have a driving license for a full-car. Hence, the share of population aged 20+ with a driver becomes:

In [7]:
license_share = 0.77
cars["ad_w_li"] = cars["adults"] * license_share 

In [14]:
lbl_vars = pd.read_csv('labels_variables.csv', index_col=0)
lbl_vals = pd.read_stata('cars.dta', iterator=True).value_labels() # the values that variables take (not relevant for all )

## Overview of the dataset

In [15]:
pd.set_option('display.max_colwidth', None)
tab = cars.mean(numeric_only=True).apply(lambda x: f'{x:.2f}').to_frame('Mean').join(lbl_vars)
tab

Unnamed: 0,Mean,label
ye,84.5,year (=first dimension of panel)
ma,3.0,market (=second dimension of panel)
co,207.5,model code (=third dimension of panel)
zcode,177.76,alternative model code (predecessors and successors get same number)
brd,16.79,brand code
org,2.72,"origin code (demand side, country with which consumers associate model)"
loc,5.17,"location code (production side, country where producer produce model)"
cla,2.3,class or segment code
home,0.32,domestic car dummy (appropriate interaction of org and ma)
frm,14.5,firm code


# Set up for analysis

## Price variables 

Can be either price (`pr`), price-to-income (`princ`), or log price (`logp`, created below).

In [16]:
price_var = 'eurpr'

In [17]:
cars['logp'] = np.log(cars[price_var])

## Market share

**Todo:** Decide how to measure the market size and thereby the market share. *Note:* Below is just an example that sets the market size = population / 3. 

In [18]:
# total quantity of cars sold in market-year (ma, ye)
cars['qu_tot'] = cars.groupby(['ma', 'ye'])['qu'].transform('sum')
cars['market_size'] = cars['pop'] * cars['ad_w_li']
cars['s'] = cars['qu'] / cars['market_size']

In [19]:
# compute the share of the outside good (will be useful for the demand inversion)
cars['s0'] = 1.0 - cars.groupby(['ma', 'ye'])['s'].transform('sum')
print(f'Outside share is from {cars.s0.min():.1%} to {cars.s0.max():.1%}')

Outside share is from 92.7% to 97.1%


In [20]:
cars.groupby(['ma'])['s'].describe().rename(index=lbl_vals['market']).style.format('{:.3f}')

Unnamed: 0_level_0,count,unique,top,freq
ma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Belgium,1200.0,1194.0,0.003,2.0
France,1200.0,1199.0,0.001,2.0
Germany,1200.0,1199.0,0.0,2.0
Italy,1200.0,1195.0,0.0,2.0
UK,1200.0,1199.0,0.0,2.0


## 1. Using canned software

In [21]:
from linearmodels.iv import IV2SLS

In [22]:
cars['delta'] = cars['s'] / cars['s0']
cars['delta'] = np.log(cars['delta'].values.astype(float)) ## Den stoppede med at ville gøre det i et skridt uden at definere type

In [23]:
cars["brand"].replace('alfa romeo', 'alfa_romeo', inplace=True)
cars["brand"] = cars["brand"].str.replace('/', '', regex=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars["brand"].replace('alfa romeo', 'alfa_romeo', inplace=True)


In [24]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars,dummies], axis=1)

In [46]:
# Calculate the average price of all other cars in a given year in a given market:
# Step 1: Calculate the sum and count of prices for each year and market
cars['sum_eurpr_ye_ma'] = cars.groupby(['ye', 'ma'])['eurpr'].transform('sum')
cars['count_ye_ma'] = cars.groupby(['ye', 'ma'])['eurpr'].transform('count')

# Step 2: Calculate the average price excluding the current observation
cars['avg_eurpr_excl'] = (cars['sum_eurpr_ye_ma'] - cars['eurpr']) / (cars['count_ye_ma'] - 1)

# Drop the intermediate columns if they are no longer needed
cars.drop(columns=['sum_eurpr_ye_ma', 'count_ye_ma'], inplace=True)


cars['avg_eurpr_excl'] = np.log(cars['avg_eurpr_excl'])

In [47]:
# choose your preferred variables 
x_vars = ['logp', 'avg_eurpr_excl', 'home', 'cy', 'hp', 'we', 'li', 'sp'] + x_vars_dummies # <--- !!! CHOOSE HERE 
print(x_vars)

['logp', 'avg_eurpr_excl', 'home', 'cy', 'hp', 'we', 'li', 'sp', 'MCC', 'VW', 'alfa_romeo', 'audi', 'citroen', 'daewoo', 'daf', 'fiat', 'ford', 'honda', 'hyundai', 'innocenti', 'lancia', 'mazda', 'mercedes', 'mitsubishi', 'nissan', 'opel', 'peugeot', 'renault', 'rover', 'saab', 'seat', 'skoda', 'suzuki', 'talbot', 'talhillman', 'talmatra', 'talsimca', 'talsunb', 'toyota', 'volvo']


In [48]:
formula = 'delta ~ 1'
for x_ in x_vars:
    formula += ' + ' + x_
print(formula)
model = IV2SLS.from_formula(formula, cars).fit()

delta ~ 1 + logp + avg_eurpr_excl + home + cy + hp + we + li + sp + MCC + VW + alfa_romeo + audi + citroen + daewoo + daf + fiat + ford + honda + hyundai + innocenti + lancia + mazda + mercedes + mitsubishi + nissan + opel + peugeot + renault + rover + saab + seat + skoda + suzuki + talbot + talhillman + talmatra + talsimca + talsunb + toyota + volvo


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(


In [49]:
model.summary

0,1,2,3
Dep. Variable:,delta,R-squared:,0.4055
Estimator:,OLS,Adj. R-squared:,0.4015
No. Observations:,5998,F-statistic:,1.503e+05
Date:,"Fri, Oct 18 2024",P-value (F-stat),0.0000
Time:,12:56:59,Distribution:,chi2(40)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,-9.2178,0.2581,-35.720,0.0000,-9.7236,-8.7121
logp,-0.5163,0.0884,-5.8391,0.0000,-0.6896,-0.3430
avg_eurpr_excl,0.5437,0.0804,6.7642,0.0000,0.3862,0.7013
home,1.0383,0.0233,44.575,0.0000,0.9927,1.0840
cy,-0.0001,7.758e-05,-1.8570,0.0633,-0.0003,7.984e-06
hp,-0.0289,0.0022,-13.059,0.0000,-0.0333,-0.0246
we,0.0008,0.0001,5.2809,0.0000,0.0005,0.0011
li,-0.0128,0.0114,-1.1249,0.2606,-0.0351,0.0095
sp,0.0185,0.0017,10.692,0.0000,0.0151,0.0218


In [19]:
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K)).astype(float)
cars['outcome'] = cars['s'] / cars['s0']
y = cars['delta'].values.reshape((N,J))
# standardize x
x = ((x - x.mean(0).mean(0))/(x.std(0).std(0)))

# Towards non-linear estimation

In order to work with the logit model, you have to be able to compute the utility indices, which typically take the form of some inner product of an $x$-vector and a $\theta$ vector. This is illustrated for you below. Since `x` is `(N,J,K)` (i.e. `x[i,j,:]` gives the $K$-vector of regressors for the car `j` in market-period `i`), we just have to form the matrix product `x @ theta`, and Python will do the sum over the 3rd dimension of `x`. 

In [None]:
theta0 = np.zeros((K,))
v = x @ theta0 # how to multiply a trial value with the matrix of regressors 
np.exp(v) / np.sum(np.exp(v), 1, keepdims=True) # choice probabilities 