In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import  LogisticRegression

In [2]:
# Read the labelled training set and the unlabelled test set from disk
TRAIN_CSV = 'final_proj_data.csv'
TEST_CSV = 'final_proj_test.csv'
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [3]:
# Sample submission file: shows the expected output layout (an `index` column and a `y` column)
sub = pd.read_csv('final_proj_sample_submission.csv')
sub

Unnamed: 0,index,y
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
2495,2495,0
2496,2496,0
2497,2497,0
2498,2498,0


In [4]:
# Preview the training frame (feature columns Var1..Var230 plus the target column `y`)
train_data

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,y
0,,,,,,812.0,14.0,,,,...,catzS2D,jySVZNlOJy,,xG3x,Aoh3,ZI9m,ib5G6X1eUxUn6,mj86,,0
1,,,,,,2688.0,7.0,,,,...,i06ocsg,LM8l689qOp,,kG3k,WqMG,RAYp,55YFVY9,mj86,,0
2,,,,,,1015.0,14.0,,,,...,P6pu4Vl,LM8l689qOp,,kG3k,Aoh3,ZI9m,R4y5gQQWY8OodqDV,am7c,,0
3,,,,,,168.0,0.0,,,,...,BNrD3Yd,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,,0
4,,,,,,14.0,0.0,,,,...,3B1QowC,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,,,,,714.0,7.0,,,,...,CE7uk3u,jySVZNlOJy,,,FSa2,RAYp,F2FyR07IdsN7I,,,0
9996,0.0,,,,,,,,24.0,,...,UpJuOS_,LM8l689qOp,,,TNEC,6fzt,F2FcTt7IdMT_v,,,0
9997,,,,,,1526.0,7.0,,,,...,Ff09Jxo,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,,0
9998,,,,,,1267.0,14.0,,,,...,catzS2D,LM8l689qOp,,kG3k,453m,ZI9m,TCU50_Yjmm6GIBZ0lL_,am7c,,0


In [5]:
# Summary statistics for the numeric columns of the training set
train_data.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var184,Var185,Var186,Var187,Var188,Var189,Var190,Var209,Var230,y
count,133.0,266.0,266.0,280.0,241.0,8980.0,8995.0,0.0,133.0,241.0,...,266.0,0.0,133.0,133.0,266.0,4206.0,43.0,0.0,0.0,10000.0
mean,14.977444,0.0,341.052632,0.096429,233810.1,1340.916258,6.8607,,61.383459,367294.3,...,6.180451,,2.977444,20.601504,159.107368,272.455064,25725.112326,,,0.1305
std,66.456008,0.0,2810.606975,0.928243,553230.5,2380.516758,6.300994,,266.124849,823421.5,...,12.177204,,10.329764,93.736247,115.766972,86.752531,37487.484852,,,0.33687
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.0,,0.0,0.0,0.0,6.0,0.0,,,0.0
25%,0.0,0.0,0.0,0.0,0.0,523.25,0.0,,2.0,0.0,...,0.0,,0.0,0.0,18.84,204.0,1312.875,,,0.0
50%,0.0,0.0,0.0,0.0,0.0,861.0,7.0,,18.0,0.0,...,0.0,,0.0,4.0,194.67,270.0,10853.82,,,0.0
75%,16.0,0.0,0.0,0.0,117235.0,1428.0,7.0,,40.0,243936.0,...,7.0,,0.0,12.0,247.08,330.0,37491.525,,,0.0
max,680.0,0.0,42588.0,9.0,3024000.0,76195.0,35.0,,2300.0,6394806.0,...,64.0,,102.0,878.0,452.76,642.0,191167.2,,,1.0


In [6]:
# Row/column counts, dtype breakdown and memory usage of the training set
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 231 entries, Var1 to y
dtypes: float64(191), int64(2), object(38)
memory usage: 17.6+ MB


In [7]:
# Number of missing values in every training column (before any column filtering)
train_data.isna().sum()

Var1       9867
Var2       9734
Var3       9734
Var4       9720
Var5       9759
          ...  
Var227        0
Var228        0
Var229     5561
Var230    10000
y             0
Length: 231, dtype: int64

In [8]:
# Keep only the training columns whose share of missing values is below 35%
train_missing_share = train_data.isna().mean()
train_data = train_data.loc[:, train_missing_share < 0.35]

In [9]:
# Missing-value counts for the training columns that remain after the filter
train_data.isna().sum()

Var6      1020
Var7      1005
Var13     1005
Var21     1020
Var22      920
          ... 
Var223    1013
Var226       0
Var227       0
Var228       0
y            0
Length: 68, dtype: int64

In [10]:
# Preview the test frame (same Var1..Var230 feature columns, no target column)
test_data

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,819.0,7.0,,,,...,zCkv,catzS2D,LM8l689qOp,,ELof,rgKb,ZI9m,ib5G6X1eUxUn6,am7c,
1,,,,,,3192.0,28.0,,,,...,oslk,QkgQQMs,LM8l689qOp,,,453m,RAYp,F2FyR07IdsN7I,am7c,
2,,,,,,756.0,0.0,,,,...,oslk,bxCQb98,jySVZNlOJy,,,Qu4f,RAYp,F2FyR07IdsN7I,,
3,,,,,,3892.0,21.0,,,,...,oslk,0JeRt72,M_8D,,ELof,Aoh3,RAYp,F2FyR07IdsN7I,mj86,
4,,,,,,672.0,7.0,,,,...,oslk,0p8OTZB,LM8l689qOp,,ELof,xb3V,RAYp,F2FyR07IdsN7I,mj86,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,,,,,,700.0,7.0,,,,...,oslk,Pbo79_c,,,,7aLG,RAYp,F2FyR07IdsN7I,,
2496,,,,,,1421.0,7.0,,,,...,oslk,853QZmh,LM8l689qOp,,ELof,fKCe,RAYp,F2FyR07IdsN7I,,
2497,,,,,,1372.0,7.0,,,,...,oslk,0p8OTZB,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,
2498,,,,,,861.0,7.0,,,,...,oslk,QuuQ5_S,M_8D,,,WqMG,RAYp,F2FyR07IdsN7I,,


In [11]:
# Summary statistics for the numeric columns of the test set
test_data.describe()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var183,Var184,Var185,Var186,Var187,Var188,Var189,Var190,Var209,Var230
count,38.0,78.0,78.0,72.0,61.0,2229.0,2228.0,0.0,38.0,61.0,...,78.0,78.0,0.0,38.0,38.0,78.0,1077.0,11.0,0.0,0.0
mean,9.263158,0.0,820.461538,0.25,287686.3,1309.423957,6.871185,,46.263158,286057.9,...,98765.92,5.74359,,2.368421,19.263158,177.723846,270.239554,23779.930909,,
std,11.544706,0.0,3470.654157,1.489399,844897.0,2655.285857,6.328531,,66.534561,783374.4,...,224301.3,9.894818,,6.322081,33.619239,116.309592,88.919402,33518.7306,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,36.0,456.03,,
25%,0.0,0.0,0.0,0.0,0.0,490.0,0.0,,8.5,0.0,...,0.0,0.0,,0.0,2.0,19.92,204.0,2079.63,,
50%,8.0,0.0,0.0,0.0,0.0,854.0,7.0,,27.0,0.0,...,0.0,0.0,,0.0,9.0,205.77,270.0,3529.35,,
75%,16.0,0.0,0.0,0.0,133810.0,1435.0,7.0,,55.5,5004.0,...,77281.0,8.0,,0.0,21.5,272.325,330.0,30532.05,,
max,40.0,0.0,22461.0,9.0,4397645.0,91126.0,35.0,,358.0,4665600.0,...,1335952.0,48.0,,36.0,154.0,362.52,642.0,101984.4,,


In [12]:
# Row/column counts, dtype breakdown and memory usage of the test set
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Columns: 230 entries, Var1 to Var230
dtypes: float64(191), int64(1), object(38)
memory usage: 4.4+ MB


In [13]:
# Number of missing values in every test column (before any column filtering)
test_data.isna().sum()

Var1      2462
Var2      2422
Var3      2422
Var4      2428
Var5      2439
          ... 
Var226       0
Var227       0
Var228       0
Var229    1423
Var230    2500
Length: 230, dtype: int64

In [14]:
# Keep exactly the feature columns that survived the missing-value filter on the
# training set. Choosing columns from the test set's own missing rates can produce
# a different column set, and the later cells index the test frame with column
# names taken from the training frame (which would raise KeyError on a mismatch).
kept_features = [col for col in train_data.columns if col != 'y']
test_data = test_data[kept_features]

In [15]:
# Missing-value counts for the test columns that remain after the filter
test_data.isna().sum()

Var6      271
Var7      272
Var13     272
Var21     271
Var22     249
         ... 
Var222      0
Var223    251
Var226      0
Var227      0
Var228      0
Length: 67, dtype: int64

In [16]:
# Separate the target column from the training features; the test frame is used as-is
target_column = 'y'
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])
X_test = test_data

In [17]:
# Column names grouped by dtype: numeric columns vs. object (string-like) columns
num_features = X_train.select_dtypes(include='number').columns
cat_features = X_train.select_dtypes(include=['object']).columns

In [18]:
# Fill gaps in the numeric columns with per-column means learned from the training set only
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(X_train[num_features])
X_train_num_imputed = num_imputer.transform(X_train[num_features])
X_test_num_imputed = num_imputer.transform(X_test[num_features])

In [19]:
# Standardize the imputed numeric columns using statistics fitted on the training set
scaler = StandardScaler().fit(X_train_num_imputed)
X_train_num_scaled = scaler.transform(X_train_num_imputed)
X_test_num_scaled = scaler.transform(X_test_num_imputed)

In [20]:
# Replace missing categorical values with each column's most frequent training value.
# NOTE(review): the encoding cell that follows in this notebook is fed
# X_train[cat_features] / X_test[cat_features] directly, so the two imputed arrays
# produced here are not consumed by it — confirm whether this step is still wanted.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[cat_features])
X_train_cat_imputed = cat_imputer.transform(X_train[cat_features])
X_test_cat_imputed = cat_imputer.transform(X_test[cat_features])

In [21]:
# Target-encode the categorical columns.
# fit_transform encodes the training rows with the encoder's internal cross-fitting,
# which shuffles the folds. Fixing random_state makes the encoded training matrix
# (and every model fitted on it) reproducible across kernel restarts, matching the
# seed used for the models in this notebook.
# The raw columns are passed in: TargetEncoder treats missing values as a category
# of their own.
encoder = TargetEncoder(random_state=42)
X_train_cat_encoded = encoder.fit_transform(X_train[cat_features], y_train)
X_test_cat_encoded = encoder.transform(X_test[cat_features])

In [22]:
# Assemble the final matrices: scaled numeric block first, encoded categorical block second
X_train_processed = np.concatenate([X_train_num_scaled, X_train_cat_encoded], axis=1)
X_test_processed = np.concatenate([X_test_num_scaled, X_test_cat_encoded], axis=1)

GridSearchCV was used to search for the best hyperparameters. <br>
Results: <br>
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'max_iter': 200, 'min_samples_leaf': 100} <br>
Best Balanced Accuracy: 0.83 

In [23]:
# Hyperparameter search kept for reference only: every line in this cell is
# commented out, so running it performs no work.
# from sklearn.model_selection import GridSearchCV

# # Define a parameter grid to search
# param_grid = {
#     'max_iter': [100, 200, 300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 5, 7],
#     'min_samples_leaf': [20, 50, 100]
# }

# # Initialize the model
# base_model = HistGradientBoostingClassifier(random_state=42)

# # Set up the grid search
# grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='balanced_accuracy', cv=5)
# grid_search.fit(X_train_processed, y_train)

# # Print the best parameters and the best score
# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"Best Balanced Accuracy: {grid_search.best_score_:.2f}")


# # Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'max_iter': 200, 'min_samples_leaf': 100}
# # Best Balanced Accuracy: 0.83


In [24]:
# Build a two-model stack (random forest + histogram gradient boosting) combined by
# a logistic-regression final estimator, fit it on the processed training matrix
# and predict labels for the processed test matrix.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
hist_boosting = HistGradientBoostingClassifier(
    max_iter=200,
    learning_rate=0.2,
    max_depth=3,
    min_samples_leaf=100,
    random_state=42,
)

estimators = [('rf', random_forest), ('gb', hist_boosting)]

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)
stacking_model.fit(X_train_processed, y_train)
y_test_pred = stacking_model.predict(X_test_processed)



Mean Balanced Accuracy preview

In [28]:
# Cross-validated balanced-accuracy estimate for the stacked model
# (5 stratified, shuffled folds). Every line is commented out, so this cell
# does nothing when run.
# NOTE(review): the score shown under this cell presumably comes from an earlier
# run in which these lines were active — confirm before relying on it.
# # Mean Balanced Accuracy preview
# from sklearn.model_selection import StratifiedKFold, cross_val_score

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scores = cross_val_score(stacking_model, X_train_processed, y_train, cv=skf, scoring='balanced_accuracy')
# print(f"Mean Balanced Accuracy: {scores.mean():.4f}")

Mean Balanced Accuracy: 0.8079


In [26]:
# Assemble the submission table: ids from the sample submission, labels from the model
test_predictions = sub[['index']].assign(y=y_test_pred)

In [27]:
# Write the submission file, leaving out the DataFrame's row index
SUBMISSION_CSV = 'test_predictions.csv'
test_predictions.to_csv(SUBMISSION_CSV, index=False)