In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import streamlit as st

# ---------------------------------------------------------------------------
# Training pipeline: load -> clean -> encode -> engineer features -> split ->
# scale -> fit candidate models -> persist the most accurate model and scaler.
#
# NOTE(review): these are module-level statements, so when this file is run
# through Streamlit the whole pipeline re-executes on every script rerun.
# Consider moving training to its own script or caching it.
# ---------------------------------------------------------------------------

# Load the dataset
df = pd.read_csv("/mnt/data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nColumn Names:", df.columns.tolist())

# TotalCharges must be numeric before anything else touches it. If it was
# parsed as text (presumably because some rows hold blank strings -- confirm
# against the CSV), label-encoding it would turn the amounts into arbitrary
# category indices and make TotalChargesPerTenure meaningless.
# errors='coerce' maps unparseable entries to NaN so they show up below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check for missing values (run after the coercion so real gaps are visible)
print("\nMissing Values:")
print(df.isnull().sum())

# Fill the gaps with 0 -- assumes a missing total means nothing has been
# billed yet (e.g. tenure == 0); TODO confirm with the data owner.
df['TotalCharges'] = df['TotalCharges'].fillna(0.0)

# Drop the row identifier if present (assumed column name for this dataset):
# it is unique per customer, carries no predictive signal, and would
# otherwise become a model feature and a UI input field.
df = df.drop(columns=['customerID'], errors='ignore')

# Encode the target with its own encoder so `le` can still decode predictions
# (le.inverse_transform) after the feature columns have been encoded.
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])

# Encode every remaining text column with a dedicated LabelEncoder. Keeping
# one fitted encoder per column preserves each category -> integer mapping;
# refitting a single shared instance would only remember the last column.
categorical_cols = df.select_dtypes(include=['object']).columns
encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_cols}
df[categorical_cols] = df[categorical_cols].apply(
    lambda column: encoders[column.name].transform(column)
)

# Feature Engineering
# The +1 keeps the denominator non-zero for customers with tenure == 0.
df['MonthlyChargesPerTenure'] = df['MonthlyCharges'] / (df['tenure'] + 1)
df['TotalChargesPerTenure'] = df['TotalCharges'] / (df['tenure'] + 1)

# Define features and target variable
X = df.drop(columns=['Churn'])
y = df['Churn']

# Split data into training and testing sets.
# stratify=y keeps the churn / non-churn ratio the same in both partitions,
# so the reported accuracy is not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numerical features (fit on the training partition only, so no
# statistics from the test partition leak into the scaler)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train multiple models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "Logistic Regression": LogisticRegression()
}

# Store model results (model name -> accuracy on the held-out test set)
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"\n{name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

# Select best model (highest test accuracy)
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with Accuracy: {results[best_model_name]}")

# Save the best model together with the scaler it was trained with;
# predictions must apply the same scaling before calling the model.
joblib.dump(best_model, "customer_churn_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# Deployment using Streamlit
# Page heading followed by a one-line instruction for the user.
_APP_TITLE = "Customer Churn Prediction App"
_APP_INSTRUCTIONS = "Enter customer details to predict churn."

st.title(_APP_TITLE)
st.write(_APP_INSTRUCTIONS)

def predict_churn(features):
    """Classify a single customer as churn / not churn with the saved model.

    Loads ``customer_churn_model.pkl`` and ``scaler.pkl`` from the current
    working directory on every call, scales the feature vector with the
    loaded scaler and runs the loaded model on it.

    Args:
        features: Sequence of numeric feature values for one customer, in
            the same column order the scaler was fitted with.

    Returns:
        ``"Churn"`` if the model predicts class ``1``, else ``"Not Churn"``.

    Raises:
        Whatever ``joblib.load`` raises if an artifact cannot be read, and
        whatever the scaler/model raise for a wrong number of features.
    """
    model = joblib.load("customer_churn_model.pkl")
    scaler = joblib.load("scaler.pkl")

    # A scaler fitted on a DataFrame records its column names. Passing it a
    # bare list makes scikit-learn warn about missing feature names on every
    # prediction, so wrap the values in a one-row DataFrame with those names.
    # Older scikit-learn versions, or a scaler fitted on an array, have no
    # such attribute; then fall back to the plain nested list.
    values = list(features)
    column_names = getattr(scaler, "feature_names_in_", None)
    if column_names is not None and len(column_names) == len(values):
        row = pd.DataFrame([values], columns=list(column_names))
    else:
        # On a length mismatch the scaler itself raises the error below.
        row = [values]

    features_scaled = scaler.transform(row)
    prediction = model.predict(features_scaled)
    return "Churn" if prediction[0] == 1 else "Not Churn"

# Input fields in Streamlit
# One numeric widget per feature column, collected in the column order of X
# so the resulting list lines up with what the scaler and model expect.
features_input = [
    st.number_input(f"{col}", value=0.0)
    for col in X.columns
]

# Run the prediction only when the user presses the button.
if st.button("Predict Churn"):
    st.write("Prediction:", predict_churn(features_input))


: 