# Cars: Getting Started

In [13]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import pyblp
sns.set_theme()

pyblp.options.digits = 2
pyblp.options.verbose = False

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [2]:
cars = pd.read_csv('cars.csv') # this reads the *balanced* dataset (i.e. J = 40 products per market always)
# cars = pd.read_excel('cars.xlsx') # this reads the *unbalanced* dataset (i.e. J varies over time)

### No data for France pre 1990. Average growth in adult fraction from other countries applied each year before

In [3]:
AdultFrac = pd.read_excel("FracOver20.xlsx", index_col = 0)
cars['adults'] = None
for idx in cars.index:
    cars['adults'][idx] = AdultFrac[cars['ma'][idx]][cars['ye'][idx]]
cars

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars['adults'][idx] = AdultFrac[cars['ma'][idx]][cars['ye'][idx]]


Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,tax,pop,ngdp,rgdp,engdp,ergdp,engdpc,ergdpc,inc,adults
0,70,1,15,14,2,audi 100/200,audi,100/200,2,4,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
1,70,1,26,35,4,citroen 2 CV 6 - 2 CV 4,citroen,2CV6,1,3,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
2,70,1,36,36,4,citroen dyane,citroen,dyane,1,3,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
3,70,1,64,67,7,fiat 128,fiat,128,3,5,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
4,70,1,71,80,8,ford escort,ford,escort,2,4,...,0.250,9660000,1280999948288,3.940725e+12,25795239936,7.935358e+10,2670.314697,8214.65625,20363.7252,0.688912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,99,5,914,161,16,mercedes A,mercedes,A,2,4,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551
5996,99,5,936,80,8,ford focus,ford,focus,7,4,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551
5997,99,5,951,208,20,peugeot 206,peugeot,206,1,3,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551
5998,99,5,953,282,29,toyota avensis,toyota,avensis,4,12,...,0.175,58200000,889870024704,,1052244508672,,18079.802734,,32434.8905,0.745551


In [4]:
lbl_vars = pd.read_csv('labels_variables.csv', index_col=0)
lbl_vals = pd.read_stata('cars.dta', iterator=True).value_labels() # the values that variables take (not relevant for all )

## Overview of the dataset

In [5]:
pd.set_option('display.max_colwidth', None)
tab = cars.mean(numeric_only=True).apply(lambda x: f'{x:.2f}').to_frame('Mean').join(lbl_vars)
tab

Unnamed: 0,Mean,label
ye,84.5,year (=first dimension of panel)
ma,3.0,market (=second dimension of panel)
co,207.5,model code (=third dimension of panel)
zcode,177.76,alternative model code (predecessors and successors get same number)
brd,16.79,brand code
org,2.72,"origin code (demand side, country with which consumers associate model)"
loc,5.17,"location code (production side, country where producer produce model)"
cla,2.3,class or segment code
home,0.32,domestic car dummy (appropriate interaction of org and ma)
frm,14.5,firm code


# Set up for analysis

## Price variables 

Can be either price (`pr`), price-to-income (`princ`), or log price (`logp`, created below).

In [6]:
price_var = 'eurpr'

In [7]:
cars['logp'] = np.log(cars[price_var])

## Market share

**Todo:** Decide how to measure the market size and thereby the market share. *Note:* Below is just an example that sets the market size = population / 3. 

In [8]:
# total quantity of cars sold in market-year (ma, ye)
cars['qu_tot'] = cars.groupby(['ma', 'ye'])['qu'].transform('sum')
cars['market_size'] = cars['pop'] * cars['adults']
cars['s'] = cars['qu'] / cars['market_size']

In [9]:
# compute the share of the outside good (will be useful for the demand inversion)
cars['s0'] = 1.0 - cars.groupby(['ma', 'ye'])['s'].transform('sum')
print(f'Outside share is from {cars.s0.min():.1%} to {cars.s0.max():.1%}')

Outside share is from 94.4% to 97.8%


In [10]:
cars.groupby(['ma'])['s'].describe().rename(index=lbl_vals['market']).style.format('{:.3f}')

Unnamed: 0_level_0,count,unique,top,freq
ma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Belgium,1200.0,1194.0,0.002,2.0
France,1200.0,1199.0,0.001,2.0
Germany,1200.0,1199.0,0.0,2.0
Italy,1200.0,1195.0,0.0,2.0
UK,1200.0,1199.0,0.0,2.0


## 1. Using canned software

In [11]:
from linearmodels.iv import IV2SLS

In [15]:
cars.head(5)

Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,ergdp,engdpc,ergdpc,inc,adults,logp,qu_tot,market_size,s,s0
0,70,1,15,14,2,audi 100/200,audi,100/200,2,4,...,79353580000.0,2670.314697,8214.65625,20363.7252,0.688912,8.012351,239013,6654892.907504,0.000406,0.964085
1,70,1,26,35,4,citroen 2 CV 6 - 2 CV 4,citroen,2CV6,1,3,...,79353580000.0,2670.314697,8214.65625,20363.7252,0.688912,7.095851,239013,6654892.907504,0.000526,0.964085
2,70,1,36,36,4,citroen dyane,citroen,dyane,1,3,...,79353580000.0,2670.314697,8214.65625,20363.7252,0.688912,7.175974,239013,6654892.907504,0.001007,0.964085
3,70,1,64,67,7,fiat 128,fiat,128,3,5,...,79353580000.0,2670.314697,8214.65625,20363.7252,0.688912,7.472758,239013,6654892.907504,0.001172,0.964085
4,70,1,71,80,8,ford escort,ford,escort,2,4,...,79353580000.0,2670.314697,8214.65625,20363.7252,0.688912,7.39854,239013,6654892.907504,0.001428,0.964085


In [17]:
cars['delta'] = np.log(cars['s'] / cars['s0'])

TypeError: loop of ufunc does not support argument 0 of type numpy.float64 which has no callable log method

In [12]:
cars["brand"].replace('alfa romeo', 'alfa_romeo', inplace=True)
cars["brand"] = cars["brand"].str.replace('/', '', regex=False)

In [13]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars,dummies], axis=1)

In [None]:
# choose your preferred variables 
x_vars = ['logp', 'home', 'cy', 'hp', 'we', 'li', 'sp'] + x_vars_dummies # <--- !!! CHOOSE HERE 
print(x_vars)

In [None]:
formula = 'delta ~ 1'
for x_ in x_vars:
    formula += ' + ' + x_
print(formula)
model = IV2SLS.from_formula(formula, cars).fit()

In [None]:
model.summary

## 2. Using numpy

***WARNING:*** The code below works *only* with a *balanced* dataset (i.e. with the same number of products, $J$ for each market (`(ma,ye)` pair.))

### Dummy variables

It can be very important to control for some fixed effects. To do this with matrices, you will have to create dummy variables with one column for each possible value (except one for the reference category). 


### `x_vars`: List of regressors to be used 

In [17]:
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K)).astype(float)
cars['outcome'] = cars['s'] / cars['s0']
y = np.log(cars['outcome'].values.reshape((N,J)))

# standardize x
x = ((x - x.mean(0).mean(0))/(x.std(0).std(0)))

# OLS Example

Let's compute the OLS estimator just to test that we can do algebra with the arrays. 

***Note:*** This particular choice of $y$ and $x$ variables might not make sense, it is just to help you get started doing algebra on these arrays. 

In [18]:
Y = y.reshape(N*J,) # Make Y 1-dimensional 
X = np.hstack([x.reshape(N*J,K), np.ones((N*J,1))]) # append a constant term 

In [None]:
# compute the OLS estimator 
bet = np.linalg.inv(X.T @ X) @ X.T @ Y

# print
varnames = x_vars + ['const'] # we added the constant as the K+1'th column 
pd.DataFrame({'Estimate':bet}, index=varnames)

# Towards non-linear estimation

In order to work with the logit model, you have to be able to compute the utility indices, which typically take the form of some inner product of an $x$-vector and a $\theta$ vector. This is illustrated for you below. Since `x` is `(N,J,K)` (i.e. `x[i,j,:]` gives the $K$-vector of regressors for the car `j` in market-period `i`), we just have to form the matrix product `x @ theta`, and Python will do the sum over the 3rd dimension of `x`. 

In [None]:
theta0 = np.zeros((K,))
v = x @ theta0 # how to multiply a trial value with the matrix of regressors 
np.exp(v) / np.sum(np.exp(v), 1, keepdims=True) # choice probabilities 