In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, roc_auc_score

# Tasodifiy urug' (SEED) aniqlash
SEED = 1

# Ma'lumotlar yuklash
data = pd.read_csv('train.csv').dropna(subset=['id'])

# Feature Engineering: yangi ustunlar yaratish
data['BMI'] = data['weight(kg)'] / (data['height(cm)'] / 100) ** 2  # BMI
data['Height_to_Waist_Ratio'] = data['height(cm)'] / data['waist(cm)']  # Bo'y va bel nisbat
data['Weight_to_Waist_Ratio'] = data['weight(kg)'] / data['waist(cm)']  # Vazn va bel nisbat
data['Eyesight_Difference'] = abs(data['eyesight(left)'] - data['eyesight(right)'])  # Ko'rish farqi
data['Hearing_Difference'] = abs(data['hearing(left)'] - data['hearing(right)'])  # Eshitish farqi
data['Blood_Pressure_Difference'] = abs(data['systolic'] - data['waist(cm)'])  # Qon bosimi farqi
data['Total_Cholesterol'] = data['HDL'] + data['LDL']  # Umumiy xolesterin
data['Hemoglobin_Serum_Ratio'] = data['hemoglobin'] / data['serum creatinine']  # Gemoglobin va zardob nisbati
data['ALT_AST_Ratio'] = data['ALT'] / data['AST']  # Jigar fermentlari nisbati
data['GTP_ALT_Ratio'] = data['Gtp'] / data['ALT']  # Gtp va ALT fermentlari nisbati
data['log_weight'] = np.log1p(data['weight(kg)'])  # Vaznning logaritmasi
data['interaction_1'] = data['height(cm)'] * data['BMI']  # O'zaro ta'sir xususiyati


# Target (label) va features (xususiyatlar) ni ajratamiz
X = data.drop(columns=['id', 'smoking'])  # 'smoking' - target, 'id'ni olib tashlaymiz
y = data['smoking']

# Ma'lumotlarni o'qitish va test uchun ajratamiz
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)

# Decision Tree modelini BaggingClassifier bilan yaratamiz
dt = DecisionTreeClassifier(criterion='gini',
                            splitter='best',
                            min_impurity_decrease=0.005,
                            max_depth=7,
                            min_samples_leaf=0.16,
                            random_state=SEED)
bc = BaggingClassifier(estimator=dt,
                        bootstrap=True,
                        n_estimators=379,
                        n_jobs=-1,
                        random_state=SEED)

# Modelni o'qitish
bc.fit(X_train, y_train)

# Bashorat qilish
y_pred = bc.predict(X_test)
y_pred_proba = bc.predict_proba(X_test)[:, 1]

# Model baholash
print(f"BaggingClassifier Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"BaggingClassifier ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

# RandomForestClassifier modeli yaratamiz
rf = RandomForestClassifier(n_estimators=450, random_state=SEED, n_jobs=-1)

# RFECV yordamida xususiyatlarni tanlaymiz
rfecv = RFECV(estimator=rf, step=2, cv=5, scoring='roc_auc', n_jobs=-1)

# RFECV modelini o‘qitish
rfecv.fit(X_train, y_train)

# Tanlangan eng yaxshi xususiyatlar soni
print(f"Selected features: {rfecv.n_features_}")

# Tanlangan xususiyatlarga asoslangan yangi o‘quv va test ma'lumotlarini olish
X_train_rfe = rfecv.transform(X_train)
X_test_rfe = rfecv.transform(X_test)

# Yangi RandomForest modelini o‘qitish (eng yaxshi xususiyatlar bilan)
rf.fit(X_train_rfe, y_train)

# Test to‘plamida bashorat qilish
y_pred_rf = rf.predict(X_test_rfe)
y_pred_rf_proba = rf.predict_proba(X_test_rfe)[:, 1]

# Baholash (RandomForest)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"Random Forest ROC AUC: {roc_auc_score(y_test, y_pred_rf_proba):.4f}")

# Test ma'lumotlarini yuklash
df_test = pd.read_csv('test.csv').dropna(subset=['id'])


# Test to'plamda ham Feature Engineering qismida yangi ustunlarni yaratish kerak
df_test['BMI'] = df_test['weight(kg)'] / (df_test['height(cm)'] / 100) ** 2  # BMI
df_test['Height_to_Waist_Ratio'] = df_test['height(cm)'] / df_test['waist(cm)']  # Bo'y va bel nisbat
df_test['Weight_to_Waist_Ratio'] = df_test['weight(kg)'] / df_test['waist(cm)']  # Vazn va bel nisbat
df_test['Eyesight_Difference'] = abs(df_test['eyesight(left)'] - df_test['eyesight(right)'])  # Ko'rish farqi
df_test['Hearing_Difference'] = abs(df_test['hearing(left)'] - df_test['hearing(right)'])  # Eshitish farqi
df_test['Blood_Pressure_Difference'] = abs(df_test['systolic'] - df_test['waist(cm)'])  # Qon bosimi farqi
df_test['Total_Cholesterol'] = df_test['HDL'] + df_test['LDL']  # Umumiy xolesterin
df_test['Hemoglobin_Serum_Ratio'] = df_test['hemoglobin'] / df_test['serum creatinine']  # Gemoglobin va zardob nisbati
df_test['ALT_AST_Ratio'] = df_test['ALT'] / df_test['AST']  # Jigar fermentlari nisbati
df_test['GTP_ALT_Ratio'] = df_test['Gtp'] / df_test['ALT']  # Gtp va ALT fermentlari nisbati
df_test['log_weight'] = np.log1p(df_test['weight(kg)'])  # Vaznning logaritmasi
df_test['interaction_1'] = df_test['height(cm)'] * df_test['BMI']  # O'zaro ta'sir xususiyati


# Test ma'lumotlaridan eng yaxshi xususiyatlarni olish
X_test_final_rfe = rfecv.transform(df_test.drop(columns=['id']))

# Test to‘plamida ehtimollarni bashorat qilish (RandomForest bilan)
y_test_prob_rf = rf.predict_proba(X_test_final_rfe)[:, 1]

# Bashoratlarni saqlash
subm = pd.read_csv("sample_submission (2).csv")
subm['smoking'] = y_test_prob_rf
subm.to_csv("my_submission_tuning.csv", index=False)


BaggingClassifier Accuracy: 0.7567
BaggingClassifier ROC AUC: 0.8589
Selected features: 30
Random Forest Accuracy: 0.8090
Random Forest ROC AUC: 0.8904
