In [94]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

# Load the training data
train_df = pd.read_csv('train.csv')

# Columns that CatBoost is told to treat as categorical
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Impute missing BMI with the training mean. The value is kept so that the
# exact same imputation can be applied to test.csv further down.
bmi_fill_value = train_df['bmi'].mean()
train_df['bmi'] = train_df['bmi'].fillna(bmi_fill_value)

# Feature engineering: age x BMI interaction term
train_df['age_bmi'] = train_df['age'] * train_df['bmi']

# Hold out 20% of train.csv for a local AUC estimate
X = train_df.drop(['id', 'stroke'], axis=1)
y = train_df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CatBoost receives the categorical columns as positional indices into X_train
categorical_features_indices = [X_train.columns.get_loc(col) for col in categorical_features]

# Base estimator that the search clones for every candidate
model = CatBoostClassifier(random_seed=42, logging_level='Silent')

# Candidate hyperparameter values sampled by the randomized search
param_dist = {
    'iterations': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
}

# 10 random candidates, 5-fold CV, scored by ROC AUC
random_search = RandomizedSearchCV(model, param_dist, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=5, random_state=42)
random_search.fit(X_train, y_train, cat_features=categorical_features_indices)

# RandomizedSearchCV (refit=True by default) has already refitted the best
# candidate on all of X_train with the same seed and cat_features, so reuse
# that model instead of paying for an identical second training run.
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate AUC-ROC on the held-out split of train.csv
y_pred_proba_test = best_model.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba_test)

print("AUC-ROC:", auc_roc)

# Load the competition test data
test_df = pd.read_csv('test.csv')

# Apply the imputation value learned from train.csv, not a statistic of the
# test set, so both frames go through the same preprocessing.
test_df['bmi'] = test_df['bmi'].fillna(bmi_fill_value)

# Same engineered feature as for training
test_df['age_bmi'] = test_df['age'] * test_df['bmi']

# Select the features by name in training order: cat_features were given as
# positional indices, so the column order must match X_train exactly. This
# also fails loudly (KeyError) if test.csv lacks a training column.
X_submission = test_df[X_train.columns]
y_pred_proba_submission = best_model.predict_proba(X_submission)[:, 1]

# Backward-compatible alias: later cells read the test.csv predictions under
# this name, even though they are not predictions for train.csv.
y_pred_proba_train = y_pred_proba_submission

print("Predicted Probabilities:", y_pred_proba_train)


AUC-ROC: 0.8881531637928772
Predicted Probabilities: [0.04227554 0.24051535 0.0006311  ... 0.00056158 0.00245121 0.00053415]


In [95]:
# Read the submission template that the predictions will be written into
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [96]:
# The probabilities are attached by position, so first confirm that the
# submission template lists the same ids, in the same order, as test.csv.
assert np.array_equal(sub['id'].to_numpy(), test_df['id'].to_numpy()), \
    "sample_submission.csv ids do not match the row order of test.csv"
sub['stroke'] = y_pred_proba_train

In [97]:
# Preview the submission frame (id and predicted stroke probability) before it is written to disk
sub

Unnamed: 0,id,stroke
0,15304,0.042276
1,15305,0.240515
2,15306,0.000631
3,15307,0.056509
4,15308,0.003903
...,...,...
10199,25503,0.000717
10200,25504,0.016830
10201,25505,0.000562
10202,25506,0.002451


In [98]:
# Write the submission to disk; index=False keeps the DataFrame index out of the file
sub.to_csv(path_or_buf='sub_working_auc_grid.csv', index=False)

In [101]:
# Round-trip check: read the written submission back and display its stroke column
check_loaded = pd.read_csv(filepath_or_buffer='sub_working_auc_grid.csv')
check_loaded['stroke']

0        0.042276
1        0.240515
2        0.000631
3        0.056509
4        0.003903
           ...   
10199    0.000717
10200    0.016830
10201    0.000562
10202    0.002451
10203    0.000534
Name: stroke, Length: 10204, dtype: float64

In [102]:
# Raw NumPy view of the first 1000 reloaded probabilities
check_loaded['stroke'].to_numpy()[:1000]

array([4.22755354e-02, 2.40515354e-01, 6.31104005e-04, 5.65087148e-02,
       3.90304629e-03, 1.63353253e-02, 4.92288062e-03, 5.61662028e-02,
       5.33448102e-04, 2.80044868e-02, 1.61549335e-02, 1.49675368e-01,
       6.79774047e-04, 6.23580217e-03, 2.24765810e-02, 6.86016024e-04,
       6.99764520e-04, 2.24598142e-03, 2.63804411e-02, 3.29305991e-02,
       3.43342100e-03, 5.87738094e-04, 5.13170492e-04, 1.14114788e-02,
       1.55469179e-02, 1.78404363e-02, 1.56576056e-03, 4.09588429e-02,
       6.14585813e-04, 1.88824431e-02, 1.73970445e-01, 4.03266689e-03,
       7.04285325e-03, 1.25426783e-01, 5.45514020e-02, 5.42285139e-04,
       1.03792489e-01, 1.85219753e-02, 1.94244311e-03, 2.14673718e-03,
       7.70720515e-04, 1.87976331e-03, 5.20458875e-04, 4.96214839e-02,
       3.80272010e-02, 5.06537149e-04, 3.44450848e-02, 6.91137138e-04,
       2.15165243e-01, 1.73551500e-03, 5.65192411e-04, 6.68497807e-04,
       6.14748246e-04, 3.12513179e-03, 2.78648122e-01, 1.04049388e-03,
      

In [93]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

# NOTE(review): this cell repeats the training pipeline of the first cell in
# this notebook and carries an earlier execution count (93 vs 94). It looks
# like a stale copy; consider deleting it so Restart & Run All trains only once.

# Load the training data
train_df = pd.read_csv('train.csv')

# Columns that CatBoost is told to treat as categorical
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Impute missing BMI with the training mean
train_df['bmi'] = train_df['bmi'].fillna(train_df['bmi'].mean())

# Feature engineering: age x BMI interaction term
train_df['age_bmi'] = train_df['age'] * train_df['bmi']

# Hold out 20% of train.csv for a local AUC estimate
X = train_df.drop(['id', 'stroke'], axis=1)
y = train_df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CatBoost receives the categorical columns as positional indices into X_train
categorical_features_indices = [X_train.columns.get_loc(col) for col in categorical_features]

# Base estimator that the search clones for every candidate
model = CatBoostClassifier(random_seed=42, logging_level='Silent')

# Candidate hyperparameter values sampled by the randomized search
param_dist = {
    'iterations': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
}

# 10 random candidates, 5-fold CV, scored by ROC AUC
random_search = RandomizedSearchCV(model, param_dist, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=5, random_state=42)
random_search.fit(X_train, y_train, cat_features=categorical_features_indices)

# RandomizedSearchCV (refit=True by default) has already refitted the best
# candidate on all of X_train with the same seed and cat_features, so reuse
# that model instead of paying for an identical second training run.
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate AUC-ROC on the held-out split of train.csv
y_pred_proba_test = best_model.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba_test)

print("AUC-ROC:", auc_roc)

AUC-ROC: 0.8881531637928772
