<a href="https://colab.research.google.com/github/Winindu/ML_CW/blob/master/Copy_of_telco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import itertools
import os, random

import numpy as np
import pandas as pd

# Seed every RNG this cell has access to so later sampling/splitting is repeatable.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Location of the raw export on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Telco Customers.csv"

# Load and drop exact duplicate rows; this produces a new frame instead of mutating in place.
df = pd.read_csv(DATA_PATH).drop_duplicates()

# TotalCharges is parsed as numeric; errors="coerce" turns unparseable entries
# (such as the blank " " strings) into NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill the resulting gaps with MonthlyCharges * tenure.
# Assign the result back rather than calling `.fillna(..., inplace=True)` on the
# column selection: that form is chained assignment, which raises a FutureWarning
# on pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
df["TotalCharges"] = df["TotalCharges"].fillna(df["MonthlyCharges"] * df["tenure"])

# Remove the customerID column when it is present; errors="ignore" keeps this a no-op otherwise.
df = df.drop(columns=["customerID"], errors="ignore")

print(df.head())
print(df.shape)


In [None]:
# Structural overview of `df`: each entry pairs a section heading with the
# callable that renders that section, and they are emitted in order.
overview_sections = (
    ("\n=== Dataset Shape ===", lambda: print(df.shape)),
    ("\n=== First 5 Rows ===", lambda: display(df.head())),
    ("\n=== Data Types ===", lambda: print(df.dtypes)),
    ("\n=== Summary Statistics (Numerical) ===", lambda: display(df.describe())),
    ("\n=== Summary Statistics (Categorical) ===", lambda: display(df.describe(include=['object']))),
)

for heading, render in overview_sections:
    print(heading)
    render()


In [None]:
# NOTE(review): `plt` and `sns` are not imported in any visible cell — confirm an
# import cell runs before this one on a fresh kernel.
# Heatmap of the boolean null mask returned by df.isnull().
fig, ax = plt.subplots(figsize=(10, 4))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()


In [None]:
# Count of rows per value of the Churn column.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(data=df, x="Churn", palette="coolwarm", ax=ax)
ax.set_title("Churn Distribution")
plt.show()


In [None]:
# Select the numeric columns once; `num_cols` is reused by the following cells.
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns

# One histogram per numeric column, drawn on a shared figure.
numeric_frame.hist(figsize=(12, 8), bins=30)
plt.suptitle("Numerical Feature Distributions", y=1.02)
plt.show()


In [None]:
# Pairwise correlation between the numeric columns (relies on `num_cols` from the previous cell).
corr_matrix = df[num_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# One boxplot per numeric column, split by the Churn value; each gets its own figure.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x="Churn", y=col, ax=ax)
    ax.set_title(f"{col} vs Churn")
    plt.show()


In [None]:
# NOTE(review): OneHotEncoder, ColumnTransformer, Pipeline, SimpleImputer, StandardScaler
# and train_test_split are not imported in any visible cell — confirm an import cell
# runs before this one on a fresh kernel.

# Binary target: "No" -> 0, "Yes" -> 1. Any other label would map to NaN, so fail
# loudly here instead of passing NaN targets into the split and the models.
y = df["Churn"].map({"No": 0, "Yes": 1})
if y.isna().any():
    raise ValueError("Churn contains values other than 'No'/'Yes'; cannot encode target.")
X = df.drop(columns=["Churn"])

# Column groups are derived from dtypes of the feature frame (target excluded).
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# The dense-output keyword is tried as `sparse_output` first, falling back to `sparse`.
# Only TypeError (unexpected keyword) triggers the fallback; a bare `except:` would also
# hide unrelated failures and swallow KeyboardInterrupt.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe)
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

# Stratified 80/20 split so both partitions keep the class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Fit the preprocessing on the training partition only, then apply it to both partitions.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)


In [None]:
# Oversample the preprocessed training partition with SMOTE; the test partition is untouched.
sm = SMOTE(random_state=SEED)
resampled = sm.fit_resample(X_train_prep, y_train)
X_train_bal, y_train_bal = resampled

# Per-class sample counts after resampling.
class_counts = np.bincount(y_train_bal)
print("Balanced classes:", class_counts)


In [None]:
# Minimum number of samples required in every leaf of the tree.
MIN_SAMPLES_LEAF = 20

dt = DecisionTreeClassifier(
    random_state=SEED,
    max_depth=4,
    min_samples_leaf=MIN_SAMPLES_LEAF
)

# A split must leave at least MIN_SAMPLES_LEAF samples on each side, so a node needs
# at least 2 * MIN_SAMPLES_LEAF samples before it can be split at all. Candidate values
# below that threshold (the previous 10 / 20 / 30) all yield the same tree, which made
# the search a no-op. The grid therefore starts at the threshold and grows from there.
param_grid = {
    "min_samples_split": [
        2 * MIN_SAMPLES_LEAF,
        4 * MIN_SAMPLES_LEAF,
        8 * MIN_SAMPLES_LEAF,
    ]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# NOTE(review): the folds are drawn from the SMOTE-resampled data, so synthetic samples
# can end up in the validation folds and the CV F1 is likely optimistic — consider
# resampling inside each fold instead.
grid_dt = GridSearchCV(dt, param_grid, cv=cv, scoring="f1")
grid_dt.fit(X_train_bal, y_train_bal)
best_dt = grid_dt.best_estimator_


In [None]:
# Hard predictions and positive-class probabilities (column 1) on the test partition.
dt_pred = best_dt.predict(X_test_prep)
dt_class_probs = best_dt.predict_proba(X_test_prep)
dt_proba = dt_class_probs[:, 1]

print("\n=== DECISION TREE REPORT ===")
dt_report = classification_report(y_test, dt_pred)
print(dt_report)

dt_auc = roc_auc_score(y_test, dt_proba)
print("AUC:", dt_auc)


In [None]:
# Confusion matrix of the decision tree on the test partition, drawn with integer annotations.
cm_dt = confusion_matrix(y_test, dt_pred)
cm_ax = sns.heatmap(cm_dt, annot=True, fmt="d", cmap="Blues")
cm_ax.set_title("Decision Tree - Confusion Matrix")
plt.show()


In [None]:
def build_nn(n1, n2, lr, d1, d2):
    """Build and compile a two-hidden-layer binary classifier.

    Args:
        n1: Number of units in the first hidden Dense layer.
        n2: Number of units in the second hidden Dense layer.
        lr: Value passed as the first argument to the Adam optimizer.
        d1: Dropout rate applied after the first hidden layer.
        d2: Dropout rate applied after the second hidden layer.

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and accuracy as the tracked metric.
    """
    # NOTE(review): `input_dim` is read from the notebook's global scope and is not
    # defined in any visible cell — confirm it is set before this function is called.
    stack = [
        layers.Input(shape=(input_dim,)),
        layers.Dense(n1, activation="relu"),
        layers.Dropout(d1),
        layers.Dense(n2, activation="relu"),
        layers.Dropout(d2),
        layers.Dense(1, activation="sigmoid"),
    ]
    model = keras.Sequential(stack)

    optimizer = keras.optimizers.Adam(lr)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


In [None]:

# NOTE(review): `param_grid_nn`, `es`, `class_weight` and `best_f1` are not defined in
# any visible cell — confirm they are created before this cell on a fresh kernel.

# Carve a validation set out of the training partition and use it for both early-stopping
# monitoring and model selection. Picking hyperparameters by their F1 on the test
# partition leaks test data into the selection and inflates the final test score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_prep, y_train, test_size=0.2, stratify=y_train, random_state=SEED
)

# Cartesian product of every hyperparameter list; iteration order matches the equivalent
# nested loops (n1 outermost, dropout2 innermost).
search_space = itertools.product(
    param_grid_nn["n1"],
    param_grid_nn["n2"],
    param_grid_nn["lr"],
    param_grid_nn["batch_size"],
    param_grid_nn["epochs"],
    param_grid_nn["dropout1"],
    param_grid_nn["dropout2"],
)

for n1, n2, lr, batch, ep, d1, d2 in search_space:
    model = build_nn(n1, n2, lr, d1, d2)
    model.fit(X_fit, y_fit,
              validation_data=(X_val, y_val),
              epochs=ep,
              batch_size=batch,
              callbacks=[es],
              verbose=0,
              class_weight=class_weight)

    # Threshold the sigmoid output at 0.5; verbose=0 keeps the per-candidate
    # progress bars out of the cell output.
    pred = (model.predict(X_val, verbose=0).ravel() >= 0.5).astype(int)
    f1 = f1_score(y_val, pred)  # validation F1, not test F1

    print(f"F1={f1:.4f} | n1={n1} n2={n2} lr={lr}")

    # Keep the candidate with the highest validation F1 seen so far.
    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_params = (n1, n2, lr, batch, ep, d1, d2)

print("\n=== BEST ANN PARAMETERS ===", best_params)
print("Best F1:", best_f1)