<a href="https://colab.research.google.com/github/Sirrat21/AI-ML-internship-adv-tasks/blob/main/Adv_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Load the raw Telco customer-churn CSV from the Colab working directory.
df = pd.read_csv(
    filepath_or_buffer="/content/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

In [None]:
# Clean the loaded frame:
#   1. remove rows whose TotalCharges is blank (empty or whitespace-only),
#   2. convert TotalCharges to float,
#   3. drop the customerID column when it is present.
#
# Written as one chain that returns a new frame. This avoids assigning a
# column on a boolean-filtered slice (which raises SettingWithCopyWarning)
# and avoids inplace mutation, so the cell can be re-run without side effects.
# Casting to str first lets the blank check also work when the column is
# already numeric (e.g. on a second run of this cell).
blank_total_charges = df["TotalCharges"].astype(str).str.strip().eq("")
df = (
    df.loc[~blank_total_charges]
    .astype({"TotalCharges": float})
    .drop(columns="customerID", errors="ignore")
)

In [None]:
# Split the frame into the binary target (Yes -> 1, No -> 0) and the predictors.
churn_labels = {"Yes": 1, "No": 0}
y = df["Churn"].map(churn_labels)
X = df.drop(columns=["Churn"])

In [None]:
# Column names for each preprocessing branch, selected by dtype.
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Numeric branch: mean-impute missing values, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

In [None]:
# End-to-end pipeline: shared preprocessing followed by a seeded random forest.
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [None]:
# Hyperparameter grid for the "classifier" step of model_pipeline.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
)

In [None]:
# Candidate classifiers to compare, keyed by the name used in reports and
# exported file names.
random_forest = RandomForestClassifier(random_state=42)
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)

models = {
    "RandomForest": random_forest,
    "LogisticRegression": logistic_regression,
}

In [None]:
# Per-model hyperparameter grids; keys match the entries of `models`, and the
# "classifier__" prefix targets the pipeline's classifier step.
random_forest_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
}
logistic_regression_grid = {
    "classifier__C": [0.01, 0.1, 1.0, 10],
}

param_grids = {
    "RandomForest": random_forest_grid,
    "LogisticRegression": logistic_regression_grid,
}

In [None]:
# For every candidate: grid-search it inside the shared preprocessing
# pipeline (3-fold CV, accuracy), report test-set metrics, and export the
# best pipeline found to "<name>_churn_pipeline.joblib".
for name in models:
    model = models[name]
    print(f"\nTraining and evaluating: {name}")

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", model),
        ]
    )
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Evaluate the tuned pipeline on the held-out test rows.
    y_pred = grid.predict(X_test)
    report = classification_report(y_test, y_pred)

    print(f"Best Parameters: {grid.best_params_}")
    print("Classification Report:")
    print(report)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    joblib.dump(grid.best_estimator_, f"{name}_churn_pipeline.joblib")
    print(f"Exported model: {name}_churn_pipeline.joblib")


Training and evaluating: RandomForest
Best Parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1033
           1       0.66      0.50      0.57       374

    accuracy                           0.80      1407
   macro avg       0.75      0.70      0.72      1407
weighted avg       0.79      0.80      0.79      1407

Accuracy: 0.7989
Exported model: RandomForest_churn_pipeline.joblib

Training and evaluating: LogisticRegression
Best Parameters: {'classifier__C': 10}
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.62      0.52      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

Accuracy: 0.7882
Exported mod

In [None]:
# 3-fold cross-validated grid search over the random-forest pipeline,
# selecting hyperparameters by accuracy and using all available cores.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Score the tuned pipeline on the held-out test set.
y_pred = grid_search.predict(X_test)
test_report = classification_report(y_test, y_pred)

print("Classification Report:\n")
print(test_report)

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1033
           1       0.66      0.50      0.57       374

    accuracy                           0.80      1407
   macro avg       0.75      0.70      0.72      1407
weighted avg       0.79      0.80      0.79      1407



In [None]:
# Persist the best pipeline from the grid search (preprocessing + classifier).
joblib.dump(
    value=grid_search.best_estimator_,
    filename="telco_churn_pipeline.joblib",
)
print(" Model exported as telco_churn_pipeline.joblib")

 Model exported as telco_churn_pipeline.joblib


In [None]:
# Reload the exported pipeline from disk.
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source; this one is assumed to be the file written by this notebook.
model = joblib.load(filename="telco_churn_pipeline.joblib")

In [None]:
# Smoke-test the reloaded pipeline on the first test row, kept as a
# one-row DataFrame so the column names reach the preprocessor.
sample = X_test.head(1)
prediction = model.predict(sample)
print("Prediction:", prediction)

Prediction: [0]


# **Conclusion**
In this task, we developed a machine learning pipeline to predict customer churn using the Telco Customer Churn dataset.

After preprocessing the data (imputation, scaling, and one-hot encoding) and tuning hyperparameters with GridSearchCV, we trained and evaluated two classifiers: a random forest and a logistic regression.

The final model was saved using joblib for future deployment.

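The classification report shows that the tuned random forest reaches about 80% accuracy on the test set, but its recall for the churn class is only 0.50, so roughly half of the customers who churn are not flagged. Even so, identifying customers likely to churn can help businesses proactively address customer retention.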

This predictive system can be further improved by incorporating real-time data and additional behavioral metrics to enhance accuracy and decision-making support.
