# Part 1 - Reading in the data

## Part 1.1

In [132]:
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Define the path to the data files
DATA_PATH = './data'
fn = f'{DATA_PATH}/SCE*.csv'

dfs=[]

# Loop through files and read them into a list of DataFrames
for i in glob.glob(fn):
    d = pd.read_csv(i, parse_dates=['date'],sep=';')
    dfs.append(d)

# Concatenate all DataFrames into a single DataFrame, and convert 'date' column to datetime
data = pd.concat(dfs)
data

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q3,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct
0,70000238,201309,2013-09-05,0.8,0.0,3.0,74.0,0.0,0.0,,...,,,,,,,,,,
1,70000239,201309,2013-09-03,0.4,1.0,4.0,67.0,0.0,0.0,,...,,,,,,,,,,
2,70000312,201309,2013-09-04,0.4,0.0,4.0,50.0,0.0,0.0,,...,,,,,,,,,,
3,70000327,201309,2013-09-03,1.2,0.0,3.0,58.0,0.0,0.0,,...,,,,,,,,,,
4,70000337,201309,2013-09-04,5.1,1.0,2.0,40.0,0.0,0.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,75008816,202209,2022-09-01,0.6,1.0,4.0,33.0,0.0,0.0,1.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
1267,75008817,202209,2022-09-12,1.2,0.0,3.0,71.0,0.0,0.0,0.0,...,100.0,0.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
1268,75008829,202209,2022-09-02,0.9,0.0,3.0,35.0,0.0,0.0,1.0,...,10.0,1.0,10.0,0.0,2.0,0.0,3.0,1.0,2.0,1.0
1269,75008832,202209,2022-09-10,0.5,1.0,4.0,31.0,0.0,0.0,1.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0


## Part 1.2

In [123]:
# Finding number of unique users
unique_ids = data['userid'].nunique()
print(f'Number of unique users: {unique_ids}')

# Finding number of data rows
data_rows = data.shape[0]
print(f'Number of data rows: {data_rows}')

# Finding number of unique waves
unique_waves = data['wid'].nunique()
print(f'Number of unique waves: {unique_waves}')

# Finding date range
first_date = data['date'].min()
last_date = data['date'].max()

# Printing first and last dates observed
print(f'Data ranges from {first_date.date()} to {last_date.date()}')

Number of unique users: 23369
Number of data rows: 176101
Number of unique waves: 139
Data ranges from 2013-06-01 to 2024-12-31


# Part 2

## Part 2.1

In [128]:
# Sort by 'userid' and 'date'
data = data.sort_values(["userid", "date"], ignore_index=True)

#Could have looped through columns but this is clearer
num_cols = [c for c in data.columns if "num_lit_" in c and "_correct" in c]

# Fill missing values with the first observed value per user
first_values_per_user = data.groupby("userid")[num_cols].transform("first")
data[num_cols] = data[num_cols].fillna(first_values_per_user)
data

Unnamed: 0,userid,wid,date,weight,female,educ,age,hispanic,black,couple,...,num_lit_q3,num_lit_q3_correct,num_lit_q5,num_lit_q5_correct,num_lit_q6,num_lit_q6_correct,num_lit_q8,num_lit_q8_correct,num_lit_q9,num_lit_q9_correct
0,70057317,201504,2015-04-10,0.5,0.0,4.0,70.0,0.0,0.0,1.0,...,10.0,1.0,10.0,0.0,5.0,1.0,3.0,1.0,2.0,1.0
1,70057321,201504,2015-04-05,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
2,70057321,201505,2015-05-11,0.5,1.0,4.0,72.0,0.0,0.0,0.0,...,,1.0,,1.0,,1.0,,1.0,,1.0
3,70057321,201506,2015-06-20,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,,1.0,,1.0,,1.0,,1.0,,1.0
4,70057321,201507,2015-07-22,0.4,1.0,4.0,72.0,0.0,0.0,0.0,...,,1.0,,1.0,,1.0,,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137571,75025299,202412,2024-12-19,0.6,1.0,3.0,33.0,0.0,0.0,1.0,...,10.0,1.0,100.0,1.0,5.0,1.0,2.0,0.0,2.0,1.0
137572,75025320,202412,2024-12-05,0.8,1.0,4.0,56.0,1.0,0.0,0.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
137573,75025337,202412,2024-12-21,1.0,1.0,3.0,68.0,0.0,0.0,1.0,...,10.0,1.0,100.0,1.0,5.0,1.0,3.0,1.0,2.0,1.0
137574,75025373,202412,2024-12-09,2.4,1.0,2.0,58.0,0.0,0.0,0.0,...,10.0,1.0,100.0,1.0,1.0,0.0,3.0,1.0,2.0,1.0


## Part 2.2

In [133]:
# Dropping rows with missing values in specified subsets of columns
demo = ['age', 'female', 'educ']
expectation = ['inflation', 'house_price_change', 'prob_stocks_up']

subsets = [
    ("demo", demo),
    ("expectation", expectation),
    ("numeracy", num_cols)
]

for name, ss in subsets:
    before = data.shape[0]
    data = data.dropna(subset=ss)
    after = data.shape[0]
    print(f"Rows before: {before}, after dropping NAs in {name}: {after} (dropped {before - after})")


Rows before: 176101, after dropping NAs in demo: 175233 (dropped 868)
Rows before: 175233, after dropping NAs in expectation: 173550 (dropped 1683)
Rows before: 173550, after dropping NAs in numeracy: 17594 (dropped 155956)
