In [41]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import seaborn as sns

In [42]:
# Load the training and test splits from the working directory
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Display the training frame as the cell output
df_train


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [43]:
# Drop the identifier columns that are not used as model features
identifier_columns = ['CustomerId', 'Surname', 'id']
df_train_cleaned = df_train.drop(columns=identifier_columns)
df_test_cleaned = df_test.drop(columns=identifier_columns)
df_train_cleaned

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [44]:
# Pairwise correlations between the numeric columns (non-numeric columns are skipped)
numeric_correlations = df_train_cleaned.corr(numeric_only=True)
numeric_correlations

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
CreditScore,1.0,-0.008918,0.000942,0.006973,0.011361,-0.002828,0.01479,-0.00182,-0.027383
Age,-0.008918,1.0,-0.01083,0.064318,-0.102195,-0.012111,0.00332,-0.005399,0.340768
Tenure,0.000942,-0.01083,1.0,-0.009481,0.007335,0.005327,-0.005532,0.000971,-0.019565
Balance,0.006973,0.064318,-0.009481,1.0,-0.361033,-0.018584,-0.015073,0.008586,0.129743
NumOfProducts,0.011361,-0.102195,0.007335,-0.361033,1.0,0.005482,0.039736,-0.004285,-0.214554
HasCrCard,-0.002828,-0.012111,0.005327,-0.018584,0.005482,1.0,-0.021034,0.004438,-0.022141
IsActiveMember,0.01479,0.00332,-0.005532,-0.015073,0.039736,-0.021034,1.0,-0.00808,-0.210237
EstimatedSalary,-0.00182,-0.005399,0.000971,0.008586,-0.004285,0.004438,-0.00808,1.0,0.018827
Exited,-0.027383,0.340768,-0.019565,0.129743,-0.214554,-0.022141,-0.210237,0.018827,1.0


In [45]:
# Bucket the age into categories.
# pd.cut is called with right=False, so every interval is closed on the left
# and open on the right:
#   'Youth'       -> [0, 18)
#   'Young Adult' -> [18, 35)
#   'Adult'       -> [35, 60)
#   'Senior'      -> [60, inf)
# An 18-year-old is therefore a 'Young Adult', a 35-year-old an 'Adult', and a
# 60-year-old a 'Senior'.
bins = [0, 18, 35, 60, np.inf]
labels = ['Youth', 'Young Adult', 'Adult', 'Senior']


def add_age_group(frame, age_bins=bins, age_labels=labels):
    """Return `frame` with a categorical 'AgeGroup' column derived from 'Age'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an 'Age' column.
    age_bins : list
        Bin edges passed to ``pd.cut`` (left-closed, right-open intervals).
    age_labels : list
        One label per interval, in the same order as the intervals.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified. Ages that are missing or
        fall outside the bin edges get a missing 'AgeGroup'.
    """
    age_group = pd.cut(frame['Age'], bins=age_bins, labels=age_labels, right=False)
    return frame.assign(AgeGroup=age_group)


# The same bucketing is applied to both splits so their categories match
df_train_cleaned = add_age_group(df_train_cleaned)
df_test_cleaned = add_age_group(df_test_cleaned)


In [46]:
# One-hot encode the categorical columns. drop='first' removes the first
# category of every column so the dummy columns are not perfectly collinear.
categorical_columns = ['Geography', 'Gender', 'AgeGroup']
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit the encoder on the training split only, then encode the training split.
encoded_train = one_hot_encoder.fit_transform(df_train_cleaned[categorical_columns])
encoded_train_df = pd.DataFrame(
    encoded_train,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
    # pd.concat(axis=1) aligns rows on index labels, so the encoded frame must
    # carry the index of the frame it was computed from; a fresh RangeIndex
    # would misalign rows whenever the source index is not 0..n-1.
    index=df_train_cleaned.index,
)

df_train_cleaned = pd.concat(
    [df_train_cleaned.drop(columns=categorical_columns), encoded_train_df],
    axis=1,
)

# Encode the test split with the encoder fitted on the training split
# (transform only, never re-fit on test data).
encoded_test = one_hot_encoder.transform(df_test_cleaned[categorical_columns])
encoded_test_df = pd.DataFrame(
    encoded_test,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
    index=df_test_cleaned.index,  # same row-alignment reason as for the train split
)

df_test_cleaned = pd.concat(
    [df_test_cleaned.drop(columns=categorical_columns), encoded_test_df],
    axis=1,
)


In [47]:
# Remove every row that still contains a missing value, then add an engineered
# feature: the account balance relative to the estimated salary.
# NOTE(review): a zero EstimatedSalary would yield inf (or NaN when Balance is
# also zero) in the ratio — confirm the data never contains one.
# NOTE(review): rows dropped from the test split receive no prediction.
def _balance_to_salary(frame):
    """Return Balance divided by EstimatedSalary, row by row."""
    return frame['Balance'] / frame['EstimatedSalary']


df_train_cleaned = df_train_cleaned.dropna().assign(BalanceSalaryRatio=_balance_to_salary)
df_test_cleaned = df_test_cleaned.dropna().assign(BalanceSalaryRatio=_balance_to_salary)


In [48]:
# Separate the target from the features in the training split
X_train = df_train_cleaned.drop(columns='Exited')
y_train = df_train_cleaned['Exited']

# The test features are the fully preprocessed test frame (its identifier
# columns were already dropped earlier in the notebook)
X_test = df_test_cleaned

# Keep the ids of exactly those test rows that are still present in X_test.
# Rows removed by dropna() must not contribute an id, otherwise the id column
# and the prediction array differ in length and no longer line up.
# When no test row was dropped this is identical to df_test['id'].
test_ids = df_test.loc[X_test.index, 'id']


In [49]:
# Model pipeline: standardise -> degree-2 polynomial expansion -> logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # zero mean / unit variance per feature
    ('poly', PolynomialFeatures(degree=2)),  # squares and pairwise interactions
    ('log_reg', LogisticRegression(max_iter=1000, C=0.1)),  # C is the inverse regularisation strength
])

# Fit the model on the whole training split
pipeline.fit(X_train, y_train)

# Predicted probability of the positive class for the training rows
y_train_pred_proba = pipeline.predict_proba(X_train)[:, 1]
# Predicted probability of the positive class for the test rows
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# ROC AUC on the data the model was fitted on. This is an optimistic number:
# it measures fit, not generalisation.
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)
print(f"Train ROC AUC Score: {train_roc_auc}")

# Out-of-fold ROC AUC from stratified 5-fold cross-validation gives an honest
# estimate of performance on unseen data. cross_val_score clones the pipeline
# for every fold, so the model fitted above is left untouched.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_roc_auc_scores = cross_val_score(pipeline, X_train, y_train, cv=cv_splitter, scoring='roc_auc')
print(f"CV ROC AUC Score: {cv_roc_auc_scores.mean()} (std {cv_roc_auc_scores.std()})")



Train ROC AUC Score: 0.8861977247734141


In [50]:
# Write the predictions to a CSV file: one row per test id together with its
# predicted probability of the positive class
prediction_columns = {'id': test_ids, 'Exited_Probability': y_pred_proba}
output = pd.DataFrame(prediction_columns)
output.to_csv('test_predictions_pipeline.csv', index=False)
