In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

# -----------------
# Preprocessing function
# -----------------
def preprocess_data(df):
    dfcopy = df.copy(deep=True)
    le = preprocessing.LabelEncoder()
    
    # Encode binary columns
    binary_cols = [col for col in dfcopy.columns if dfcopy[col].dtype == 'object' and dfcopy[col].nunique() == 2]
    for col in binary_cols:
        dfcopy[col] = le.fit_transform(dfcopy[col])
    
    # One-hot encode multi-class columns
    multiple_cols = [col for col in dfcopy.columns if dfcopy[col].dtype == 'object' and dfcopy[col].nunique() > 2]
    dfcopy = pd.get_dummies(dfcopy, columns=multiple_cols, dtype=int)
    
    return dfcopy

# -----------------
# Load & Clean Data
# -----------------
df = pd.read_csv('/Users/ekaterinasharifullina/Documents/my_env/programming/churn/data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Fix TotalCharges and drop missing
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df[df['TotalCharges'].notna()].reset_index(drop=True)

# Extract target
y = df['Churn'].map({'Yes': 1, 'No': 0})

# Drop Churn from features
df = df.drop('Churn', axis=1)

# Preprocess features
df_clean = preprocess_data(df)

# Scale numerical columns
scaler = StandardScaler()
df_clean[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(
    df_clean[['tenure', 'MonthlyCharges', 'TotalCharges']]
)
X = df_clean

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------
# Train Final Logistic Regression
# -----------------
model = LogisticRegression(class_weight='balanced', max_iter=1000, solver='liblinear')
model.fit(X_train, y_train)

# -----------------
# Evaluate
# -----------------
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# -----------------
# Save Model and Scaler
# -----------------
joblib.dump(model, "churn_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(list(X.columns), "feature_columns.pkl")

print("Model and scaler saved successfully!")



Accuracy: 0.744136460554371
Precision: 0.5127272727272727
Recall: 0.7540106951871658
F1 Score: 0.6103896103896104
ROC AUC: 0.8349635814899752
Model and scaler saved successfully!


Accuracy: 0.8038379530916845
Precision: 0.6484848484848484
Recall: 0.5721925133689839
F1 Score: 0.6079545454545454
ROC AUC: 0.7299491124444146