In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline   
from sklearn.ensemble import RandomForestClassifier, StackingClassifier,  BaggingClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training data, the test data and the submission template.
# index_col=0 makes the first CSV column the row index of each frame.
df, test, sub = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "csv/train.csv",
        "csv/test.csv",
        "csv/sample_submission.csv",
    )
)

## Adding and removing features

In [3]:
def ft(data):
    """Add the engineered feature columns to ``data`` in place.

    Columns written (existing columns of the same name are overwritten):

    * ``hemoglobin_height``   -- hemoglobin * height(cm)
    * ``hemoglobin / Gtp``    -- hemoglobin divided by Gtp
    * ``BMI``                 -- weight(kg) / (height(cm) / 100) ** 2
    * ``hemoglobin_status``   -- 1 when hemoglobin lies inside the age-specific
      interval used below (bounds inclusive), otherwise 0
    * ``ALT_binned_quantile`` -- ALT cut into 4 bins labelled 1..4
    * ``BMI_category``        -- BMI cut at 18.5 / 25 / 30, labelled 1..4
    * ``age_height``          -- age * height(cm)

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``hemoglobin``, ``height(cm)``, ``Gtp``,
        ``weight(kg)``, ``age`` and ``ALT``. The frame is modified in place.

    Returns
    -------
    None
    """
    age = data["age"]
    height_cm = data["height(cm)"]
    hemoglobin = data["hemoglobin"]

    data["hemoglobin_height"] = hemoglobin * height_cm
    # NOTE(review): a Gtp of 0 would produce inf/NaN here -- confirm the data
    # never contains zeros.
    data["hemoglobin / Gtp"] = hemoglobin / data["Gtp"]
    data["BMI"] = data["weight(kg)"] / ((height_cm / 100) ** 2)

    # Vectorised version of the per-row check: each age band has its own
    # inclusive hemoglobin interval. Rows with missing age or hemoglobin match
    # no band and therefore get 0.
    hemoglobin_in_range = (
        ((age < 18) & hemoglobin.between(11.5, 15.5))
        | (age.between(18, 65) & hemoglobin.between(12.1, 17.2))
        | ((age > 65) & hemoglobin.between(11.7, 14.9))
    )
    data["hemoglobin_status"] = hemoglobin_in_range.astype("int64")

    # NOTE(review): with an integer ``bins`` pd.cut builds equal-width bins
    # from this frame's own ALT min/max, so (despite the column name) these are
    # not quantile bins, and two frames processed separately can end up with
    # different bin edges -- confirm this is intended.
    data["ALT_binned_quantile"] = pd.cut(data["ALT"], bins=4, labels=[1, 2, 3, 4])
    # Fixed edges: (0, 18.5], (18.5, 25], (25, 30], (30, inf).
    data["BMI_category"] = pd.cut(
        data["BMI"],
        bins=[0, 18.5, 25, 30, np.inf],
        labels=[1, 2, 3, 4],
    )
    data["age_height"] = age * height_cm
            
# Engineer features on both frames; ft() writes the new columns in place.
ft(df)
ft(test)

# Raw columns left out of the model inputs. The training frame additionally
# loses the target column, which becomes y.
excluded_columns = ["age", "relaxation", "fasting blood sugar", "Cholesterol"]

test = test.drop(columns=excluded_columns)
X = df.drop(columns=["smoking", *excluded_columns])
y = df["smoking"]

## Modelling

In [None]:
# Base learner 1: shallower forest whose trees are grown on bootstrap samples.
model1 = RandomForestClassifier(
    criterion="entropy",
    n_estimators=191,
    max_depth=16,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features="log2",
    bootstrap=True,
    random_state=42,
)

# Base learner 2: deeper, larger forest; bootstrap=False means every tree is
# fit on the full sample it receives.
model2 = RandomForestClassifier(
    criterion="entropy",
    n_estimators=499,
    max_depth=30,
    min_samples_split=21,
    min_samples_leaf=7,
    max_features="log2",
    bootstrap=False,
    random_state=42,
)

# Each forest is wrapped in a bagging ensemble. The first one also resamples
# the feature columns with replacement (bootstrap_features=True).
bagging_model1 = BaggingClassifier(
    estimator=model1,
    n_estimators=50,
    bootstrap_features=True,
    random_state=42,
    n_jobs=-1,
)

bagging_model2 = BaggingClassifier(
    estimator=model2,
    n_estimators=15,
    random_state=42,
    n_jobs=-1,
)

# Meta-learner: degree-2 polynomial expansion of the base-model outputs,
# standardisation, then an elastic-net logistic regression.
base_model = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("logic", LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        C=67.71250104715932,
        l1_ratio=0.2318363725602379,
        # saga shuffles the data while optimising; seed it like every other
        # estimator in this cell so repeated runs give the same model.
        random_state=42,
        # NOTE(review): max_iter is left at its default and warnings are
        # silenced at the top of the notebook, so a non-converged fit would go
        # unnoticed -- consider checking convergence.
    )),
])

# Stack the two bagged forests; cv=5 sets the cross-validation used to produce
# the base-model predictions the meta-learner is trained on.
stacking = StackingClassifier(
    estimators=[
        ("bagging1", bagging_model1),
        ("bagging2", bagging_model2),
    ],
    final_estimator=base_model,
    cv=5,
    n_jobs=-1,
)

stacking.fit(X, y)

In [16]:
# predict_proba returns one column per class; column 1 is the second entry of
# stacking.classes_ (presumably smoking == 1 -- verify against the labels).
class_probabilities = stacking.predict_proba(test)
sub["smoking"] = class_probabilities[:, 1]

# Write the submission file, keeping the frame's index as the first column.
sub.to_csv("return5.csv")