In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.metrics import roc_auc_score

# 1. Upload data
# 1. Ma'lumotlarni yuklash
df = pd.read_csv('train[1].csv')

# 2. Removing blank values ​​of surname
# 2. Surname bo'sh qiymatlarini olib tashlash
df = df.dropna(subset=['Surname'])

# 3. Coding categorical columns (Gender and Geography)
# 3. Kategorik ustunlarni kodlash (Gender va Geography)
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Geography ustunini One-Hot Encoding qilish
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)

# 4. Yangi xususiyatlar yaratish (feature engineering)
df['Age_to_NumOfProducts'] = df['Age'] / df['NumOfProducts']
df['Balance_Tenure'] = df['Balance'] * df['Tenure']
df['Age_NumOfProducts'] = df['Age'] * df['NumOfProducts']
df['Balance_to_CreditScore'] = df['Balance'] / df['CreditScore']
df['Balance_to_NumOfProducts'] = df['Balance'] / df['NumOfProducts']
df['Balance_to_Salary'] = df['Balance'] / df['EstimatedSalary']
df['Tenure_to_Age'] = df['Tenure'] / df['Age']
df['CreditScore_to_Age'] = df['CreditScore'] / df['Age']
df['Balance_NumOfProducts'] = df['Balance'] * df['NumOfProducts']
df['CreditScore_IsActive'] = df['CreditScore'] * df['IsActiveMember']

# 5. Polynomial features yaratish
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly_features = poly.fit_transform(df[['CreditScore', 'Age', 'Balance']])

# Polynomial featuresni dataframega qo'shish
poly_df = pd.DataFrame(poly_features, columns=[f'poly_{i}' for i in range(poly_features.shape[1])])
df = pd.concat([df, poly_df], axis=1)

# 6. Maqsadli ustun va xususiyatlarni ajratish
X = df.drop(columns=['Exited', 'Surname', 'CustomerId', 'id'])  # Maqsadli ustun va keraksiz ustunlarni olib tashlaymiz
y = df['Exited']

# 7. Ma'lumotlarni masshtablash
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 8. Trening va test ma'lumotlarini bo'lish
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# 9. StackingClassifier yaratish (Logistic Regression va Ridge Classifier bilan)
estimators = [
    ('logistic', LogisticRegression(max_iter=500, solver='liblinear')),
    ('ridge', RidgeClassifier(alpha=0.1, solver='cholesky', tol=0.001))
]

stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(max_iter=500))

# 10. Modelni o'qitish
stacking_model.fit(X_train, y_train)

# 11. ROC AUC hisoblash
y_prob = stacking_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC: {roc_auc}")

# 12. Test ma'lumotlarini yuklash va tayyorlash
df_test = pd.read_csv('test.csv').dropna(subset=['Surname'])

# Kategorik xususiyatlarni kodlash
df_test['Gender'] = label_encoder.transform(df_test['Gender'])
df_test = pd.get_dummies(df_test, columns=['Geography'], drop_first=True)

# Yangi xususiyatlar yaratish (test ma'lumotlarida)
df_test['Age_to_NumOfProducts'] = df_test['Age'] / df_test['NumOfProducts']
df_test['Balance_Tenure'] = df_test['Balance'] * df_test['Tenure']
df_test['Age_NumOfProducts'] = df_test['Age'] * df_test['NumOfProducts']
df_test['Balance_to_CreditScore'] = df_test['Balance'] / df_test['CreditScore']
df_test['Balance_to_NumOfProducts'] = df_test['Balance'] / df_test['NumOfProducts']
df_test['Balance_to_Salary'] = df_test['Balance'] / df_test['EstimatedSalary']
df_test['Tenure_to_Age'] = df_test['Tenure'] / df_test['Age']
df_test['CreditScore_to_Age'] = df_test['CreditScore'] / df_test['Age']
df_test['Balance_NumOfProducts'] = df_test['Balance'] * df_test['NumOfProducts']
df_test['CreditScore_IsActive'] = df_test['CreditScore'] * df_test['IsActiveMember']

# Polynomial features test set uchun yaratish
poly_test_features = poly.transform(df_test[['CreditScore', 'Age', 'Balance']])
poly_test_df = pd.DataFrame(poly_test_features, columns=[f'poly_{i}' for i in range(poly_test_features.shape[1])])
df_test = pd.concat([df_test, poly_test_df], axis=1)

# Test ma'lumotlarini masshtablash
# Ustunlar nomlarini tekshiramiz
columns_to_drop = []
# Faqat mavjud ustunlarni olib tashlaymiz
columns_to_drop = [col for col in columns_to_drop if col in df_test.columns]

# Test ma'lumotlarini masshtablash
X_test_scaled = scaler.transform(df_test.reindex(columns=X.columns, fill_value=0).drop(columns=columns_to_drop))

# Test to'plamida ehtimollarni bashorat qilish
y_test_prob = stacking_model.predict_proba(X_test_scaled)[:, 1]

# 13. Bashoratlarni saqlash
subm = pd.read_csv("sample_submission.csv")
subm['Exited'] = y_test_prob
subm.to_csv("submission5.csv", index=False)


ROC-AUC: 0.8584534481849179
