In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn import preprocessing
import joblib
from sklearn.preprocessing import StandardScaler


# Load data
df = pd.read_csv('/Users/ekaterinasharifullina/Documents/my_env/programming/churn/data/WA_Fn-UseC_-Telco-Customer-Churn.csv')  # You can adjust this path if needed



def preprocess_data(df):
    #all binary categories 
    dfcopy = df.copy(deep=True)
    le= preprocessing.LabelEncoder()
    
    binary_cols = []
    for col in dfcopy.columns:
        if dfcopy[col].dtype == 'object' and dfcopy[col].nunique() == 2:
            binary_cols.append(col)
    for col in binary_cols:
        dfcopy[col] = le.fit_transform(dfcopy[col])
    
    
    multiple_cols = []
    for col in dfcopy.columns:
        if dfcopy[col].dtype == 'object' and dfcopy[col].nunique() > 2:
            multiple_cols.append(col)

    
    dfcopy = pd.get_dummies(dfcopy, columns=multiple_cols, dtype=int)

    return dfcopy


df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df[df['TotalCharges'].notna()]
df = df.reset_index(drop=True)

# 2. Extract y AFTER cleaning
y = df['Churn'].map({'Yes': 1, 'No': 0})

# 3. Drop Churn column
df = df.drop('Churn', axis=1)

# 4. Preprocess
df_clean = preprocess_data(df)

scaler = StandardScaler()

df_clean[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(
    df_clean[['tenure', 'MonthlyCharges', 'TotalCharges']]
)
X = df_clean

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify= y)

# Model training
model = LogisticRegression(
    C=0.5,              # stronger regularization
    solver='liblinear', # stable for smaller datasets
    penalty='l1'        # encourages sparse coefficients
)
model.fit(X_train, y_train)

# Evaluation
y_prob = model.predict_proba(X_test)[:, 1]
y_pred_threshold = (y_prob > 0.4).astype(int)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred))

joblib.dump(model,"titanic_model.pkl")

loaded_model = joblib.load("titanic_model.pkl")

loaded_model.predict(X_test[0:1])


Accuracy: 0.8024164889836531
Precision: 0.6445783132530121
Recall: 0.5721925133689839
F1 Score: 0.6062322946175638
ROC AUC: 0.7289810582333788


array([0])

Accuracy: 0.8038379530916845
Precision: 0.6484848484848484
Recall: 0.5721925133689839
F1 Score: 0.6079545454545454
ROC AUC: 0.7299491124444146