# Wage Data Generation
## Instructor: Dr. Smith | Econometrics 301

*This notebook generates and saves synthetic wage data with:*
- *Strong signals for true predictors*
- *Multiple irrelevant variables*
- *Clear overfitting potential*

In [12]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1. Data Generation

In [13]:
# Seed NumPy's legacy global RNG. Every draw below consumes from this single
# stream, so the order of the sampling statements determines the dataset.
np.random.seed(42)
n_samples = 1000

# Coefficients of the log-wage equation used to build the target further down.
true_coeffs = dict(
    const=2.0,
    educ=0.15,
    exper=0.06,
    exper_sq=-0.001,
    ability=0.25,
    female=-0.3,
    union=0.35,
)

# --- True predictors: these enter the log-wage equation ---
educ = np.random.normal(loc=14, scale=2, size=n_samples).astype(np.float32)
# Absolute value keeps experience non-negative.
exper = np.absolute(
    np.random.normal(loc=10, scale=5, size=n_samples)
).astype(np.float32)
ability = np.random.normal(loc=0, scale=1, size=n_samples).astype(np.float32)
female = np.random.binomial(n=1, p=0.45, size=n_samples).astype(np.float32)
union = np.random.binomial(n=1, p=0.2, size=n_samples).astype(np.float32)

# --- Irrelevant variables: none of them appears in the log-wage equation ---
height = np.random.normal(loc=170, scale=10, size=n_samples).astype(np.float32)
# Rounded to one decimal; unlike the other columns this one is not cast to float32.
commute_km = np.random.exponential(scale=10, size=n_samples).round(1)
# randint's upper bound is exclusive: months are 1..12, favorite numbers 1..9.
birth_month = np.random.randint(low=1, high=13, size=n_samples).astype(np.float32)
favorite_number = np.random.randint(low=1, high=10, size=n_samples).astype(np.float32)
color_labels = ['black', 'blue', 'gray', 'white', 'other']
color_probs = [0.3, 0.25, 0.2, 0.15, 0.1]
clothing_colors = np.random.choice(color_labels, size=n_samples, p=color_probs)

# Assemble the feature table; the insertion order of the dict fixes the column order.
feature_columns = {
    'educ': educ,
    'exper': exper,
    'exper_sq': (exper ** 2).astype(np.float32),
    'ability': ability,
    'female': female,
    'union': union,
    'height': height,
    'commute_km': commute_km,
    'birth_month': birth_month,
    'favorite_number': favorite_number,
    'clothing_color': clothing_colors,
}
X = pd.DataFrame(feature_columns)

# Systematic part of log wage: the constant plus coefficient * column for every
# other entry of true_coeffs, accumulated left to right in dictionary order.
systematic = sum(
    (beta * X[name] for name, beta in true_coeffs.items() if name != 'const'),
    true_coeffs['const'],
)

# Add the Gaussian error term (standard deviation 0.5) to obtain the target.
log_wage = systematic + np.random.normal(loc=0, scale=0.5, size=n_samples).astype(np.float32)

## 2. Data Saving

In [14]:
# Combine the feature table and the target into one frame and save it as CSV.
# `data` is a new frame, so X itself is left unchanged.
output_path = 'DATA/wage_data.csv'

# Create the output directory when it is missing; without this, to_csv raises
# OSError on a fresh checkout and the notebook fails under Restart & Run All.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

data = X.assign(log_wage=log_wage)
data.to_csv(output_path, index=False)  # index=False: no row-index column in the file

print("Data successfully saved to 'wage_data.csv'")
print(f"File location: {os.path.abspath(output_path)}")
print("\nData preview:")
display(data.head())

Data successfully saved to 'wage_data.csv'
File location: /Users/qingfengliu/Library/CloudStorage/Dropbox/Hosei_University/All_Lecture/Python/ML_Toolkit/DATA/wage_data.csv

Data preview:


Unnamed: 0,educ,exper,exper_sq,ability,female,union,height,commute_km,birth_month,favorite_number,clothing_color,log_wage
0,14.993428,16.996777,288.890411,-0.675178,1.0,1.0,178.089188,1.7,1.0,8.0,blue,4.978068
1,13.723472,14.623168,213.837036,-0.144519,1.0,0.0,164.155441,2.0,4.0,4.0,gray,4.968439
2,15.295377,10.298152,106.051933,-0.79242,0.0,1.0,162.506424,4.5,10.0,8.0,white,5.270698
3,17.046061,6.765316,45.769501,-0.307962,1.0,0.0,167.784637,5.7,7.0,9.0,blue,4.455431
4,13.531693,13.491117,182.010223,-1.893615,0.0,0.0,158.230377,9.8,2.0,7.0,gray,4.859989
