In [78]:
import subprocess
import sys
import os

# Define a function to install packages
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# List of packages to install
packages = [
    "pandas",
    "numpy",
    "scikit-learn",
    "pickle-mixin",
    "pyarrow"
]

# Install each package
for package in packages:
    try:
        __import__(package)
    except ImportError:
        install(package)

#assign correct directory
name = sys.platform

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVC
import random as rm
from sklearn.metrics import classification_report, f1_score
import pickle as pk
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Lasso
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import mahalanobis



In [79]:
import pandas as pd

# 1. Load the Stata dataset
df = pd.read_stata("CMF/CMF_1880_general_schedule.dta")

In [80]:
for col in df.columns:
    if "workers" in col:
        print(col)
    if "hour" in col:
        print(col)
    if "wage" in col:
        print(col)
    if "material" in col:
        print(col)
    if "product" in col: 
        print(col)
    if "capital" in col:
        print(col)
    if "file" in col:
        print(col)
    if "firm" in col:
        print(col)

file_name
firm_number
firm_name
capital
workers_max
workers_adult_male
workers_adult_female
workers_children
hours_may_nov
hours_nov_may
wage_daily_skilled
wage_daily_unskilled
total_wages
material_total_value
product_total_value


In [81]:
for col in df.columns:
    if col.startswith("workers") or col == "material_total_value" or col == "product_total_value" or col == "capital" or col == "total_wages":
        df[col] = pd.to_numeric(df[col], errors='coerce')

# rename prodcut_total_value to output
df.rename(columns={'product_total_value': 'output'}, inplace=True)
# rename material_total_value to materials_value
df.rename(columns={'material_total_value': 'materials_value'}, inplace=True)

# First build a list of columns to keep:
base_keep = [
    'file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_female', 'workers_adult_male', 'workers_children', 
    'total_wages', 'materials_value', 'output'
]

# Add any columns starting with "ind_"
ind_cols = [col for col in df.columns if col.startswith("ind_")]

# Retain only these columns (if they exist in the dataframe)
cols_to_keep = base_keep + ind_cols
df = df[cols_to_keep]

In [82]:
front_cols = ['file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_male', 'workers_adult_female', 'workers_children', 
              'total_wages', 'materials_value', 'output']
remaining_cols = [col for col in df.columns if col not in front_cols]
df = df[front_cols + remaining_cols]

output_file = r"CMF_to_predict/1880_general_schedule_to_predict.dta"
df.to_stata(output_file, write_index=False)

print("Data processing complete. Output saved to:", output_file)

Data processing complete. Output saved to: CMF_to_predict/1880_general_schedule_to_predict.dta


In [83]:
# Read data
input_1880_general_schedule = pd.read_stata("CMF_to_predict/1880_general_schedule_to_predict.dta")
# get the columns
input_1880_general_schedule.columns

Index(['file_name', 'firm_number', 'firm_name', 'capital',
       'workers_adult_male', 'workers_adult_female', 'workers_children',
       'total_wages', 'materials_value', 'output', 'ind_detailed',
       'ind_broadest', 'ind_leontief', 'ind_granular'],
      dtype='object')

In [84]:
def ratio(df, var1, var2):
    df[f'{var1}&{var2}'] = df[var1]/df[var2]

def ratio_tot_wages(df, var1, var2, var3, var4):
    df[f'{var1}&{var2}&{var3}&{var4}'] = (df[var1] + df[var2]+ df[var3])/df[var4]

In [85]:
ratio(input_1880_general_schedule, 'materials_value', 'output')
ratio(input_1880_general_schedule, 'materials_value', 'capital')
ratio(input_1880_general_schedule, 'capital', 'output')
ratio(input_1880_general_schedule, 'capital', 'materials_value')
ratio(input_1880_general_schedule, 'output', 'materials_value')
ratio(input_1880_general_schedule, 'output', 'capital')
ratio_tot_wages(input_1880_general_schedule, 'workers_adult_male', 'workers_adult_female', 'workers_children', 'total_wages')
ratio(input_1880_general_schedule, 'total_wages', 'output')

var_list = ["capital", "materials_value", "workers_adult_female", "workers_adult_male", "total_wages", "output", 'materials_value&output', 'materials_value&capital', 'capital&output', 'capital&materials_value', 'output&materials_value', 'output&capital','workers_adult_male&workers_adult_female&workers_children&total_wages', 'total_wages&output']

input_1880_general_schedule.replace({np.inf: 0, -np.inf: 0, np.nan: 0}, inplace=True)

In [86]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis

industry_list = input_1880_general_schedule["ind_broadest"].unique().tolist()
final_df = []

for industry in industry_list:
    input_df = input_1880_general_schedule[input_1880_general_schedule["ind_broadest"] == industry].copy()  # Ensure a copy

    if input_df.empty:
        continue  # Skip if there's no data for this industry

    # Compute mean and covariance matrix
    mean_vec = np.mean(input_df[var_list], axis=0)
    cov_matrix = np.cov(input_df[var_list], rowvar=False)

    # Check if covariance matrix is singular
    if np.linalg.det(cov_matrix) == 0:
        print(f"Skipping {industry}: Covariance matrix is singular")
        continue

    inv_cov_matrix = np.linalg.inv(cov_matrix)

    # Compute Mahalanobis distance for each firm
    distances = np.array([mahalanobis(row, mean_vec, inv_cov_matrix) for row in input_df[var_list].to_numpy()])
    
    # Compute contributions
    contributions = np.array([(row - mean_vec) * np.dot(inv_cov_matrix, (row - mean_vec)) for row in input_df[var_list].to_numpy()])
    contributions_df = pd.DataFrame(contributions, columns=[col + "_con" for col in var_list])

    # Identify the most anomalous variable for each row
    input_df['most_anomalous_var'] = contributions_df.idxmax(axis=1)
    input_df['mahalanobis'] = distances

    # Combine input_df with contributions_df (ensuring correct alignment)
    input_df = pd.concat([input_df.reset_index(drop=True), contributions_df.reset_index(drop=True)], axis=1)

    final_df.append(input_df)  # Append a COPY

# Combine all industry DataFrames into one
final_df = pd.concat(final_df, ignore_index=True)

In [87]:
final_df['dist_squared'] = final_df['mahalanobis'] ** 2
for var in var_list:
    final_df[var + "_divided"] = final_df[var+"_con"]/final_df['dist_squared']
final_df['total'] = 0
for var in var_list:
    final_df['total'] = final_df['total'] + final_df[var + "_divided"]
final_df

Unnamed: 0,file_name,firm_number,firm_name,capital,workers_adult_male,workers_adult_female,workers_children,total_wages,materials_value,output,...,output_divided,materials_value&output_divided,materials_value&capital_divided,capital&output_divided,capital&materials_value_divided,output&materials_value_divided,output&capital_divided,workers_adult_male&workers_adult_female&workers_children&total_wages_divided,total_wages&output_divided,total
0,NY_8_dutchess_00298_L.jpg,28,Stafford Stephen,400.0,1.0,0.0,0.0,450.0,700.0,2500.0,...,0.751112,0.030938,0.059964,-0.030097,0.105261,-0.000576,0.034465,0.004330,-0.003541,1.0
1,NY_8_orange_00051_L.jpg,7,Outwin Edward,200.0,0.0,0.0,0.0,0.0,200.0,600.0,...,0.577786,0.010137,0.106247,-0.019988,0.057687,0.000312,0.033475,0.004308,0.015988,1.0
2,OH_8_hamilton_00335_L.jpg,19,Hubbell & Fisher,1000.0,0.0,35.0,0.0,6500.0,11500.0,32500.0,...,-0.001121,0.000931,0.023645,-0.001080,0.004445,-0.000048,0.499937,0.000208,-0.000204,1.0
3,IL_8_adams_00022_L.jpg,4,Her. Schroeder,6000.0,2.0,0.0,0.0,900.0,1500.0,3500.0,...,0.213309,0.002576,0.149351,0.000553,0.071591,0.004361,0.322713,0.003902,0.000962,1.0
4,VA_8_bedford_00202_r_L.jpg,12,Curtis & Hatcher,2000.0,2.0,0.0,2.0,250.0,10000.0,15000.0,...,0.355088,0.004033,0.095908,-0.032855,0.139307,0.002289,0.108244,0.004292,0.031800,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171278,PA_8_blair_00763_L.jpg,3,Wm Stokes,35000.0,33.0,0.0,0.0,9500.0,22000.0,50000.0,...,0.094861,0.000213,-0.015615,0.029907,-0.009256,0.016661,0.017834,0.000723,-0.026976,1.0
171279,NY_8_kings_00361_L.jpg,7,O.F. Hawley,150000.0,75.0,0.0,25.0,50000.0,250000.0,350000.0,...,0.261606,0.000835,0.000973,0.007115,0.000686,-0.000890,-0.001896,-0.000150,0.003051,1.0
171280,WA_8_mason_00029_L.jpg,2,Miller Joel,3315.0,15.0,0.0,0.0,5265.0,4540.0,13750.0,...,0.041826,-0.012552,0.074786,0.386856,-0.021672,0.016137,0.023970,0.003964,0.060911,1.0
171281,TN_8_cheatham_00166_r_L.jpg,7,Tyson Bell & Son,600.0,3.0,0.0,0.0,50.0,780.0,1170.0,...,0.149099,0.018359,-0.074516,0.198735,-0.084511,0.161476,0.123216,0.004336,0.240260,1.0


In [89]:
for var in var_list:
    batch = []
    for ind in industry_list:
        ind_specific = final_df[final_df["ind_broadest"] == ind].copy()  # Ensure a copy
        #ORIGINAL CUTOFF : .0001
        quantile_low = ind_specific[var + '_divided'].quantile(0.0001)
        quantile_high = ind_specific[var + '_divided'].quantile(1 - 0.0001)

        # Filter the lowest and highest 0.01% values
        lowest_values = ind_specific[ind_specific[var + '_divided'] <= quantile_low]
        highest_values = ind_specific[ind_specific[var + '_divided'] >= quantile_high]

        # Combine results
        result = pd.concat([lowest_values, highest_values])

        batch.append(result)

    batch = pd.concat(batch, ignore_index=True)

    if "materials_value&output" in var:
        output_var_list = ['file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_male', 'workers_adult_female', 'workers_children', 'total_wages', 'materials_value', 'output']
        variable_check = batch[output_var_list]
        variable_check['transcription_error'] =  None
        variable_check['correct_materials_value'] = None
        variable_check['correct_output'] = None
        variable_check.to_csv(f"check/1880/1880_general_schedule/{var}_check_1880_general_schedule.csv", index=False)
    elif "output&materials_value" in var:
        output_var_list = ['file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_male', 'workers_adult_female', 'workers_children', 'total_wages', 'materials_value', 'output']
        variable_check = batch[output_var_list]
        variable_check['transcription_error'] =  None
        variable_check['correct_output'] = None
        variable_check['correct_materials_value'] = None
        variable_check.to_csv(f"check/1880/1880_general_schedule/{var}_check_1880_general_schedule.csv", index=False)

    elif "materials_value" in var:
        output_var_list = ['file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_male', 'workers_adult_female', 'workers_children', 'total_wages', 'materials_value', 'output']
        variable_check = batch[output_var_list]
        variable_check['transcription_error'] =  None
        parts = var.split("&")
        if len(parts) == 2:
            left, right=parts
            if "materials_value" in left:
                variable_check['correct_materials_value'] = None
                variable_check[f'correct_{right}'] = None
            else:
                variable_check[f'correct_{left}'] = None
                variable_check['correct_materials_value'] = None
        else:
            variable_check['correct_materials_value'] = None
        variable_check.to_csv(f"check/1880/1880_general_schedule/{var}_check_1880_general_schedule.csv", index=False)

    elif "output" in var:
        output_var_list = ['file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_male', 'workers_adult_female', 'workers_children', 'total_wages', 'materials_value', 'output']
        variable_check = batch[output_var_list]
        variable_check['transcription_error'] =  None
        parts = var.split("&")
        if len(parts) == 2:
            left, right=parts
            if "output" in left:
                variable_check['correct_output_value'] = None
                variable_check[f'correct_{right}'] = None
            else:
                variable_check[f'correct_{left}'] = None
                variable_check['correct_output_value'] = None
        else:
            variable_check['correct_output_value'] = None
        variable_check.to_csv(f"check/1880/1880_general_schedule/{var}_check_1880_general_schedule.csv", index=False)

    else:
        parts = var.split("&")
        output_var_list = ['file_name', 'firm_number', 'firm_name', 'capital', 'workers_adult_male', 'workers_adult_female', 'workers_children', 'total_wages', 'materials_value', 'output']
        variable_check = batch[output_var_list]
        if len(parts) == 2:
            left, right=parts
            variable_check['transcription_error'] =  None
            variable_check[f'correct_{left}'] = None
            variable_check[f'correct_{right}'] = None
        else:
            variable_check['transcription_error'] =  None
            variable_check[f'correct_{var}'] = None
        variable_check.to_csv(f"check/1880/1880_general_schedule/{var}_check_1880_general_schedule.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variable_check['transcription_error'] =  None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variable_check[f'correct_{var}'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variable_check['transcription_error'] =  None
A value is trying to be set on a copy of a slice from a DataFrame.
Try us