In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.linear_model import LinearRegression
# Lift pandas' display limits so every column and row of a result is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


In [3]:
# Load the raw dataset and keep only the columns whose name does not
# start with "Unnamed".
df = (
    pd.read_csv('../data/dataproject2025.csv')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)
df.head()

Unnamed: 0,issue_d,loan duration,annual_inc,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,dti,emp_length,emp_title,fico_range_high,funded_amnt,grade,home_ownership,inq_last_6mths,int_rate,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,num_actv_bc_tl,num_bc_tl,num_il_tl,num_rev_accts,open_acc,pub_rec,pub_rec_bankruptcies,purpose,revol_bal,revol_util,sub_grade,target,tax_liens,zip_code,Pct_afro_american,Predictions,Predicted probabilities
0,2013,0,39600.0,1379.0,21564.0,16.1,0.0,2.49,2 years,other,759.0,4800.0,B,MORTGAGE,2.0,10.99,220.0,25.0,25.0,0.0,25.0,2.0,4.0,1.0,7.0,3.0,0.0,0.0,home_improvement,4136.0,16.1,B2,0,0.0,782,7.388592,0,0.053051
1,2013,0,55000.0,9570.0,16473.0,53.9,0.0,22.87,10+ years,other,734.0,27050.0,B,OWN,0.0,10.99,326.0,16.0,6.0,4.0,16.0,2.0,8.0,8.0,15.0,14.0,0.0,0.0,debt_consolidation,36638.0,61.2,B2,0,0.0,481,9.745456,0,0.084507
2,2013,0,325000.0,53306.0,13901.0,67.1,0.0,18.55,5 years,sales manager,749.0,28000.0,A,MORTGAGE,1.0,7.62,229.0,5.0,2.0,5.0,5.0,4.0,8.0,11.0,15.0,15.0,0.0,0.0,debt_consolidation,29581.0,54.6,A3,0,0.0,945,7.542862,0,0.037206
3,2013,0,130000.0,36362.0,3567.0,93.0,0.0,13.03,10+ years,other,719.0,12000.0,B,MORTGAGE,1.0,11.99,193.0,4.0,4.0,3.0,85.0,3.0,4.0,8.0,8.0,9.0,0.0,0.0,debt_consolidation,10805.0,67.0,B3,0,0.0,809,6.598132,0,0.061371
4,2013,1,73000.0,24161.0,4853.0,74.7,1.0,23.13,6 years,other,669.0,27600.0,D,MORTGAGE,1.0,19.97,294.0,4.0,4.0,4.0,4.0,5.0,11.0,4.0,16.0,10.0,0.0,0.0,debt_consolidation,27003.0,82.8,D5,1,0.0,802,7.0589,1,0.345896


In [38]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086236 entries, 0 to 1086235
Data columns (total 38 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   issue_d                  1086236 non-null  int64  
 1   loan duration            1086236 non-null  int64  
 2   annual_inc               1086236 non-null  float64
 3   avg_cur_bal              1086236 non-null  float64
 4   bc_open_to_buy           1086236 non-null  float64
 5   bc_util                  1086236 non-null  float64
 6   delinq_2yrs              1086236 non-null  float64
 7   dti                      1086236 non-null  float64
 8   emp_length               1086236 non-null  object 
 9   emp_title                1086236 non-null  object 
 10  fico_range_high          1086236 non-null  float64
 11  funded_amnt              1086236 non-null  float64
 12  grade                    1086236 non-null  object 
 13  home_ownership           1086236 non-null 

In [45]:
# Count the missing values in each column.
df.isna().sum()

issue_d                    0
loan duration              0
annual_inc                 0
avg_cur_bal                0
bc_open_to_buy             0
bc_util                    0
delinq_2yrs                0
dti                        0
emp_length                 0
emp_title                  0
fico_range_high            0
funded_amnt                0
grade                      0
home_ownership             0
inq_last_6mths             0
int_rate                   0
mo_sin_old_rev_tl_op       0
mo_sin_rcnt_rev_tl_op      0
mo_sin_rcnt_tl             0
mort_acc                   0
mths_since_recent_bc       0
num_actv_bc_tl             0
num_bc_tl                  0
num_il_tl                  0
num_rev_accts              0
open_acc                   0
pub_rec                    0
pub_rec_bankruptcies       0
purpose                    0
revol_bal                  0
revol_util                 0
sub_grade                  0
target                     0
tax_liens                  0
zip_code      

In [48]:
# Target and features.
# The target is the `Predicted probabilities` column; the features are every
# remaining column except `Predictions` and `target`.
y = df['Predicted probabilities']
X = df.drop(['Predicted probabilities', 'Predictions', 'target'], axis=1)
print(X.shape, y.shape, sep='\n')

(1086236, 35)
(1086236,)


In [49]:

# Identify column types: object-dtype columns are treated as categorical,
# int64/float64 columns as numeric.
# NOTE(review): zip_code is integer-typed, so it ends up in numeric_cols and is
# scaled like a quantity although it looks like a nominal code — confirm this
# is intended.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# ColumnTransformer drops every column it is not given (remainder='drop' is the
# default), so a column with any other dtype would silently vanish from the
# preprocessed output. Fail loudly instead.
unassigned_cols = [
    col for col in X.columns
    if col not in numeric_cols and col not in categorical_cols
]
if unassigned_cols:
    raise ValueError(
        f"Columns with a dtype that is neither numeric nor object: {unassigned_cols}"
    )

print(numeric_cols)
print(categorical_cols)



['issue_d', 'loan duration', 'annual_inc', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'delinq_2yrs', 'dti', 'fico_range_high', 'funded_amnt', 'inq_last_6mths', 'int_rate', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'num_actv_bc_tl', 'num_bc_tl', 'num_il_tl', 'num_rev_accts', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies', 'revol_bal', 'revol_util', 'tax_liens', 'zip_code', 'Pct_afro_american']
['emp_length', 'emp_title', 'grade', 'home_ownership', 'purpose', 'sub_grade']


In [50]:
# Preprocessing pipelines:
#   numeric columns     -> StandardScaler
#   categorical columns -> OneHotEncoder (handle_unknown='ignore')
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    [('encoder', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group through its own pipeline; the 'num' / 'cat' labels
# become the prefixes of the generated feature names.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [51]:
# Fit the preprocessor on the whole feature matrix and transform it.
X_transformed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

# The transformed result is densified through its `toarray` method when it
# has one; otherwise it is used as returned. Rows keep X's index.
X_transformed_df = pd.DataFrame(
    getattr(X_transformed, "toarray", lambda: X_transformed)(),
    index=X.index,
    columns=feature_names,
)
X_transformed_df.head()

Unnamed: 0,num__issue_d,num__loan duration,num__annual_inc,num__avg_cur_bal,num__bc_open_to_buy,num__bc_util,num__delinq_2yrs,num__dti,num__fico_range_high,num__funded_amnt,num__inq_last_6mths,num__int_rate,num__mo_sin_old_rev_tl_op,num__mo_sin_rcnt_rev_tl_op,num__mo_sin_rcnt_tl,num__mort_acc,num__mths_since_recent_bc,num__num_actv_bc_tl,num__num_bc_tl,num__num_il_tl,num__num_rev_accts,num__open_acc,num__pub_rec,num__pub_rec_bankruptcies,num__revol_bal,num__revol_util,num__tax_liens,num__zip_code,num__Pct_afro_american,cat__emp_length_1 year,cat__emp_length_10+ years,cat__emp_length_2 years,cat__emp_length_3 years,cat__emp_length_4 years,cat__emp_length_5 years,cat__emp_length_6 years,cat__emp_length_7 years,cat__emp_length_8 years,cat__emp_length_9 years,cat__emp_length_< 1 year,cat__emp_title_account executive,cat__emp_title_account manager,cat__emp_title_accountant,cat__emp_title_administrative assistant,cat__emp_title_administrator,cat__emp_title_analyst,cat__emp_title_assistant manager,cat__emp_title_associate,cat__emp_title_attorney,cat__emp_title_branch manager,cat__emp_title_business analyst,cat__emp_title_chief executive officer,cat__emp_title_clerk,cat__emp_title_consultant,cat__emp_title_controller,cat__emp_title_customer service,cat__emp_title_customer service representative,cat__emp_title_director,cat__emp_title_driver,cat__emp_title_electrician,cat__emp_title_engineer,cat__emp_title_executive assistant,cat__emp_title_financial analyst,cat__emp_title_foreman,cat__emp_title_general manager,cat__emp_title_machine operator,cat__emp_title_maintenance,cat__emp_title_manager,cat__emp_title_mechanic,cat__emp_title_nurse,cat__emp_title_office manager,cat__emp_title_operations manager,cat__emp_title_operator,cat__emp_title_other,cat__emp_title_owner,cat__emp_title_paralegal,cat__emp_title_police officer,cat__emp_title_president,cat__emp_title_program manager,cat__emp_title_project manager,cat__emp_title_sales,cat__emp_title_sales 
manager,cat__emp_title_server,cat__emp_title_software engineer,cat__emp_title_store manager,cat__emp_title_superintendent,cat__emp_title_supervisor,cat__emp_title_teacher,cat__emp_title_technician,cat__emp_title_truck driver,cat__emp_title_vice president,cat__grade_A,cat__grade_B,cat__grade_C,cat__grade_D,cat__grade_E,cat__grade_F,cat__grade_G,cat__home_ownership_MORTGAGE,cat__home_ownership_OWN,cat__home_ownership_RENT,cat__purpose_car,cat__purpose_credit_card,cat__purpose_debt_consolidation,cat__purpose_home_improvement,cat__purpose_house,cat__purpose_major_purchase,cat__purpose_medical,cat__purpose_moving,cat__purpose_other,cat__purpose_renewable_energy,cat__purpose_small_business,cat__purpose_vacation,cat__purpose_wedding,cat__sub_grade_A1,cat__sub_grade_A2,cat__sub_grade_A3,cat__sub_grade_A4,cat__sub_grade_A5,cat__sub_grade_B1,cat__sub_grade_B2,cat__sub_grade_B3,cat__sub_grade_B4,cat__sub_grade_B5,cat__sub_grade_C1,cat__sub_grade_C2,cat__sub_grade_C3,cat__sub_grade_C4,cat__sub_grade_C5,cat__sub_grade_D1,cat__sub_grade_D2,cat__sub_grade_D3,cat__sub_grade_D4,cat__sub_grade_D5,cat__sub_grade_E1,cat__sub_grade_E2,cat__sub_grade_E3,cat__sub_grade_E4,cat__sub_grade_E5,cat__sub_grade_F1,cat__sub_grade_F2,cat__sub_grade_F3,cat__sub_grade_F4,cat__sub_grade_F5,cat__sub_grade_G1,cat__sub_grade_G2,cat__sub_grade_G3,cat__sub_grade_G4,cat__sub_grade_G5
0,-1.687635,-0.637761,-0.740512,-0.745082,0.749479,-1.586997,-0.370384,-1.880644,2.10613,-1.180039,1.439967,-0.588881,0.485989,0.741333,2.006288,-0.818502,0.062627,-0.76491,-0.835455,-1.049239,-0.905264,-1.5779,-0.384455,-0.359252,-0.591643,-1.505575,-0.156755,0.860517,-0.4572,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.687635,-0.637761,-0.470965,-0.250508,0.416937,-0.242629,-0.370384,0.530276,1.258044,1.180387,-0.69472,-0.588881,1.643008,0.185651,-0.199909,1.221235,-0.245818,-0.76491,0.007319,-0.094443,0.081095,0.381078,-0.384455,-0.359252,0.842038,0.342899,-0.156755,-0.103767,-0.261925,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.687635,-0.637761,4.254853,2.390279,0.248935,0.226833,-0.370384,0.019227,1.766896,1.281169,0.372623,-1.282265,0.584226,-0.493516,-0.664371,1.73117,-0.622806,0.104352,0.007319,0.314755,0.081095,0.559167,-0.384455,-0.359252,0.53075,0.072391,-0.156755,1.382703,-0.444418,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.687635,-0.637761,0.841762,1.367198,-0.426078,1.147974,-0.370384,-0.633779,0.749193,-0.416216,0.372623,-0.383129,0.191276,-0.555258,-0.43214,0.711301,2.118925,-0.330279,-0.835455,-0.094443,-0.78197,-0.509366,-0.384455,-0.359252,-0.29747,0.580619,-0.156755,0.947014,-0.522693,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.687635,1.567986,-0.15591,0.630499,-0.342077,0.497129,0.729341,0.561034,-0.946979,1.238734,0.372623,1.25877,1.293719,-0.555258,-0.43214,1.221235,-0.657077,0.538983,0.639399,-0.640041,0.20439,-0.331277,-0.384455,-0.359252,0.417033,1.2282,-0.156755,0.924589,-0.484517,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Re-attach the target column to the transformed features.
# X_transformed_df carries X's index, and X and y are taken from the same
# frame, so y is concatenated with its own index: concat(axis=1) aligns rows by
# label. Resetting only y's index would mis-pair rows (or introduce NaNs)
# whenever the source frame's index is not 0..n-1.
if not X_transformed_df.index.equals(y.index):
    raise ValueError("Feature and target indices differ; rows cannot be aligned.")
df_preprocessed = pd.concat([X_transformed_df, y], axis=1)
df_preprocessed.head()

Unnamed: 0,num__issue_d,num__loan duration,num__annual_inc,num__avg_cur_bal,num__bc_open_to_buy,num__bc_util,num__delinq_2yrs,num__dti,num__fico_range_high,num__funded_amnt,num__inq_last_6mths,num__int_rate,num__mo_sin_old_rev_tl_op,num__mo_sin_rcnt_rev_tl_op,num__mo_sin_rcnt_tl,num__mort_acc,num__mths_since_recent_bc,num__num_actv_bc_tl,num__num_bc_tl,num__num_il_tl,num__num_rev_accts,num__open_acc,num__pub_rec,num__pub_rec_bankruptcies,num__revol_bal,num__revol_util,num__tax_liens,num__zip_code,num__Pct_afro_american,cat__emp_length_1 year,cat__emp_length_10+ years,cat__emp_length_2 years,cat__emp_length_3 years,cat__emp_length_4 years,cat__emp_length_5 years,cat__emp_length_6 years,cat__emp_length_7 years,cat__emp_length_8 years,cat__emp_length_9 years,cat__emp_length_< 1 year,cat__emp_title_account executive,cat__emp_title_account manager,cat__emp_title_accountant,cat__emp_title_administrative assistant,cat__emp_title_administrator,cat__emp_title_analyst,cat__emp_title_assistant manager,cat__emp_title_associate,cat__emp_title_attorney,cat__emp_title_branch manager,cat__emp_title_business analyst,cat__emp_title_chief executive officer,cat__emp_title_clerk,cat__emp_title_consultant,cat__emp_title_controller,cat__emp_title_customer service,cat__emp_title_customer service representative,cat__emp_title_director,cat__emp_title_driver,cat__emp_title_electrician,cat__emp_title_engineer,cat__emp_title_executive assistant,cat__emp_title_financial analyst,cat__emp_title_foreman,cat__emp_title_general manager,cat__emp_title_machine operator,cat__emp_title_maintenance,cat__emp_title_manager,cat__emp_title_mechanic,cat__emp_title_nurse,cat__emp_title_office manager,cat__emp_title_operations manager,cat__emp_title_operator,cat__emp_title_other,cat__emp_title_owner,cat__emp_title_paralegal,cat__emp_title_police officer,cat__emp_title_president,cat__emp_title_program manager,cat__emp_title_project manager,cat__emp_title_sales,cat__emp_title_sales 
manager,cat__emp_title_server,cat__emp_title_software engineer,cat__emp_title_store manager,cat__emp_title_superintendent,cat__emp_title_supervisor,cat__emp_title_teacher,cat__emp_title_technician,cat__emp_title_truck driver,cat__emp_title_vice president,cat__grade_A,cat__grade_B,cat__grade_C,cat__grade_D,cat__grade_E,cat__grade_F,cat__grade_G,cat__home_ownership_MORTGAGE,cat__home_ownership_OWN,cat__home_ownership_RENT,cat__purpose_car,cat__purpose_credit_card,cat__purpose_debt_consolidation,cat__purpose_home_improvement,cat__purpose_house,cat__purpose_major_purchase,cat__purpose_medical,cat__purpose_moving,cat__purpose_other,cat__purpose_renewable_energy,cat__purpose_small_business,cat__purpose_vacation,cat__purpose_wedding,cat__sub_grade_A1,cat__sub_grade_A2,cat__sub_grade_A3,cat__sub_grade_A4,cat__sub_grade_A5,cat__sub_grade_B1,cat__sub_grade_B2,cat__sub_grade_B3,cat__sub_grade_B4,cat__sub_grade_B5,cat__sub_grade_C1,cat__sub_grade_C2,cat__sub_grade_C3,cat__sub_grade_C4,cat__sub_grade_C5,cat__sub_grade_D1,cat__sub_grade_D2,cat__sub_grade_D3,cat__sub_grade_D4,cat__sub_grade_D5,cat__sub_grade_E1,cat__sub_grade_E2,cat__sub_grade_E3,cat__sub_grade_E4,cat__sub_grade_E5,cat__sub_grade_F1,cat__sub_grade_F2,cat__sub_grade_F3,cat__sub_grade_F4,cat__sub_grade_F5,cat__sub_grade_G1,cat__sub_grade_G2,cat__sub_grade_G3,cat__sub_grade_G4,cat__sub_grade_G5,Predicted probabilities
0,-1.687635,-0.637761,-0.740512,-0.745082,0.749479,-1.586997,-0.370384,-1.880644,2.10613,-1.180039,1.439967,-0.588881,0.485989,0.741333,2.006288,-0.818502,0.062627,-0.76491,-0.835455,-1.049239,-0.905264,-1.5779,-0.384455,-0.359252,-0.591643,-1.505575,-0.156755,0.860517,-0.4572,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053051
1,-1.687635,-0.637761,-0.470965,-0.250508,0.416937,-0.242629,-0.370384,0.530276,1.258044,1.180387,-0.69472,-0.588881,1.643008,0.185651,-0.199909,1.221235,-0.245818,-0.76491,0.007319,-0.094443,0.081095,0.381078,-0.384455,-0.359252,0.842038,0.342899,-0.156755,-0.103767,-0.261925,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084507
2,-1.687635,-0.637761,4.254853,2.390279,0.248935,0.226833,-0.370384,0.019227,1.766896,1.281169,0.372623,-1.282265,0.584226,-0.493516,-0.664371,1.73117,-0.622806,0.104352,0.007319,0.314755,0.081095,0.559167,-0.384455,-0.359252,0.53075,0.072391,-0.156755,1.382703,-0.444418,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037206
3,-1.687635,-0.637761,0.841762,1.367198,-0.426078,1.147974,-0.370384,-0.633779,0.749193,-0.416216,0.372623,-0.383129,0.191276,-0.555258,-0.43214,0.711301,2.118925,-0.330279,-0.835455,-0.094443,-0.78197,-0.509366,-0.384455,-0.359252,-0.29747,0.580619,-0.156755,0.947014,-0.522693,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061371
4,-1.687635,1.567986,-0.15591,0.630499,-0.342077,0.497129,0.729341,0.561034,-0.946979,1.238734,0.372623,1.25877,1.293719,-0.555258,-0.43214,1.221235,-0.657077,0.538983,0.639399,-0.640041,0.20439,-0.331277,-0.384455,-0.359252,0.417033,1.2282,-0.156755,0.924589,-0.484517,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.345896


In [54]:
# Sanity check: (rows, columns) of the preprocessed frame, target column included.
df_preprocessed.shape

(1086236, 150)

In [None]:
# Write the preprocessed frame (features plus target column) to CSV; the index is not written.
df_preprocessed.to_csv('../data/dataproject2025_preprocessed.csv', index=False)