In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [2]:
# Read the Telco churn CSV into a pandas DataFrame.
# NOTE: in file order, `telco_data` is rebound by the later `spark.read.csv` cell
# before anything reads this pandas frame.
telco_data = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [6]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook via the builder's getOrCreate()
spark = (
    SparkSession.builder
    .appName("TelcoChurnPreprocessing")
    .getOrCreate()
)

In [8]:
# Read the churn CSV with Spark: header row enabled and schema inference turned on
file_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
telco_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

In [9]:
# Row/column clean-up, expressed as one chain:
#   1. cast TotalCharges to float and drop the rows where it is null afterwards
#   2. bucket tenure into 12-month bands (anything not matched by an explicit
#      band, including values outside 1-60, is labelled "61-72")
#   3. remove customerID and the raw tenure column
#   4. encode the target: 1 when Churn == "Yes", otherwise 0
telco_data = (
    telco_data
    .withColumn("TotalCharges", F.col("TotalCharges").cast(FloatType()))
    .dropna(subset=["TotalCharges"])
    .withColumn(
        "tenure_group",
        F.when(F.col("tenure").between(1, 12), "1-12")
        .when(F.col("tenure").between(13, 24), "13-24")
        .when(F.col("tenure").between(25, 36), "25-36")
        .when(F.col("tenure").between(37, 48), "37-48")
        .when(F.col("tenure").between(49, 60), "49-60")
        .otherwise("61-72"),
    )
    .drop("customerID", "tenure")
    .withColumn("Churn", F.when(F.col("Churn") == "Yes", 1).otherwise(0))
)

# Encode every remaining string column with a StringIndexer. Each one writes a
# numeric "<name>_index" column (label indexing, not one-hot dummy columns).
categorical_cols = [
    name for name, type_name in telco_data.dtypes if type_name == 'string'
]
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in categorical_cols
]
pipeline = Pipeline(stages=indexers)

# Fit and apply all indexers on the same frame, then drop the original string columns
telco_data = (
    pipeline.fit(telco_data)
    .transform(telco_data)
    .drop(*categorical_cols)
)

In [12]:
# Convert the preprocessed Spark DataFrame to pandas for scikit-learn
telco_data_pd = telco_data.toPandas()

# 'Churn' is the target; every other column is used as a feature
y = telco_data_pd['Churn']
X = telco_data_pd.drop(columns=['Churn'])

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [36]:
# Decision-tree hyperparameter sets, keyed "model_1" .. "model_6".
# Each table row is (max_depth, criterion, min_samples_split, min_samples_leaf, max_features).
models_configs = {
    f"model_{position}": {
        'max_depth': depth,
        'criterion': criterion,
        'min_samples_split': split_size,
        'min_samples_leaf': leaf_size,
        'max_features': feature_rule,
    }
    for position, (depth, criterion, split_size, leaf_size, feature_rule) in enumerate(
        [
            (3, 'gini', 2, 2, 'log2'),
            (5, 'gini', 2, 1, 'sqrt'),
            (7, 'entropy', 2, 1, 'log2'),
            (None, 'entropy', 2, 1, None),
            (3, 'gini', 4, 2, 'sqrt'),
            (None, 'gini', 2, 1, None),
        ],
        start=1,
    )
}

In [37]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [38]:
# Define function to train and log models
def train_and_log_model(config_name, model_configs, random_state=42):
    """Fit one decision-tree configuration and record it in the active MLflow run.

    The function reads the notebook-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test`` variables, so the train/test split cell must have run first.
    It logs to whichever MLflow run is active when it is called.

    Parameters
    ----------
    config_name : str
        Key into ``model_configs`` selecting the hyperparameter set to train.
    model_configs : dict
        Maps a config name to a dict holding ``max_depth``, ``criterion``,
        ``min_samples_split``, ``min_samples_leaf`` and ``max_features``.
    random_state : int or None, default 42
        Seed handed to ``DecisionTreeClassifier``. Without a seed, the feature
        subsampling used by ``max_features='sqrt'``/``'log2'`` (and the
        tie-breaking between equally good splits) differs between runs, so the
        logged metrics could not be reproduced. Pass ``None`` to get the
        previous unseeded behaviour.

    Raises
    ------
    KeyError
        If ``config_name`` is not in ``model_configs`` or the selected config
        lacks one of the five hyperparameter keys.
    """
    config = model_configs[config_name]
    param_names = (
        "max_depth",
        "criterion",
        "min_samples_split",
        "min_samples_leaf",
        "max_features",
    )
    # Only the five known hyperparameters are forwarded; extra keys in the
    # config are ignored, exactly as before.
    params = {name: config[name] for name in param_names}
    params["random_state"] = random_state

    # Train model
    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)

    # Make predictions on both splits so train and test scores can be compared
    splits = {
        "train": (y_train, model.predict(X_train)),
        "test": (y_test, model.predict(X_test)),
    }
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1_score": f1_score,
    }

    # Log parameters (including the seed, so a run can be reproduced from its record)
    mlflow.log_params(params)

    # Log metrics under the names "<split>_<metric>", e.g. "train_accuracy", "test_f1_score"
    mlflow.log_metrics({
        f"{split}_{metric}": scorer(y_true, y_pred)
        for metric, scorer in scorers.items()
        for split, (y_true, y_pred) in splits.items()
    })

    # Log model
    mlflow.sklearn.log_model(model, "model")

In [39]:
# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment(experiment_name="Telco Churn Decision Tree Experiment")

<Experiment: artifact_location='file:///c:/Users/Salem%20Aslam/Documents/3.%20Academics/%23Sem8/Lab/Project-19/mlruns/201476583766961328', creation_time=1716082675097, experiment_id='201476583766961328', last_update_time=1716082675097, lifecycle_stage='active', name='Telco Churn Decision Tree Experiment', tags={}>

In [40]:
# One parent run groups the experiment; each configuration is trained and
# logged inside its own nested child run named after the config key.
with mlflow.start_run(run_name="Main run-1"):
    for config_name in models_configs.keys():
        with mlflow.start_run(run_name=config_name, nested=True):
            train_and_log_model(config_name, models_configs)