In [None]:
import pandas as pd
# Load the Telco customer-churn CSV; this frame is the `df` argument handed to
# the testAllAtributes* calls in later cells.
df2 = pd.read_csv('Telco_Customer_Churn.csv')
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTEENN

In [None]:
def transformSplit(df):
  """Split the churn table 60/20/20 into train, test and validation sets.

  Before splitting, ``TotalCharges`` is coerced to numeric (unparseable
  entries become NaN), the interaction feature ``Tenure_MonthlyCharges``
  (``tenure * MonthlyCharges``) is appended and ``customerID`` is dropped.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw customer table containing at least ``customerID``, ``TotalCharges``,
      ``tenure``, ``MonthlyCharges`` and ``Churn``. The frame is not modified.

  Returns
  -------
  tuple
      ``(X_train, X_test, X_validation, y_train, y_test, y_validation)``.
      The ``X_*`` items are feature DataFrames without ``Churn``; the ``y_*``
      items are single-column DataFrames holding ``Churn``.
  """
  from sklearn.model_selection import train_test_split

  # Use the frame supplied by the caller instead of re-reading the CSV from
  # disk; assign/drop return new frames, so the caller's data stays untouched.
  prepared = df.assign(
      TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'),
      Tenure_MonthlyCharges=df['tenure'] * df['MonthlyCharges'],
  ).drop(columns='customerID')

  # 60% train, then the remaining 40% is halved into test and validation.
  train, holdout = train_test_split(prepared, test_size=0.4, random_state=42)
  test, validation = train_test_split(holdout, test_size=0.5, random_state=42)

  # Separate the target variable 'Churn' from the features without mutating
  # the split frames in place (avoids SettingWithCopyWarning).
  y_train = train[['Churn']]
  X_train = train.drop(columns='Churn')
  y_test = test[['Churn']]
  X_test = test.drop(columns='Churn')
  y_validation = validation[['Churn']]
  X_validation = validation.drop(columns='Churn')

  return X_train, X_test, X_validation, y_train, y_test, y_validation

In [None]:
def transformX(df):
    """Impute, scale and encode a feature frame, returning a dense DataFrame.

    Column handling:
      * ``int64``/``float64`` columns (except ``SeniorCitizen``): median
        imputation followed by standard scaling.
      * ``object``/``category`` columns with exactly two distinct non-missing
        values: most-frequent imputation followed by ordinal encoding.
      * remaining ``object``/``category`` columns: most-frequent imputation
        followed by one-hot encoding with the first level dropped.
      * ``SeniorCitizen`` (when present): passed through unchanged.

    A new pipeline is fitted on every call, so imputation values, scaling
    statistics and category levels are all derived from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only (no target).

    Returns
    -------
    pandas.DataFrame
        Transformed features with a fresh default index. Columns are ordered:
        numeric, two-level categorical, one-hot encoded, passthrough.
    """
    # Identify categorical and numeric columns
    categorical_columns_initial = df.select_dtypes(include=['object', 'category']).columns
    categorical_columns_two_options = [col for col in categorical_columns_initial if df[col].nunique() == 2]
    categorical_columns = [col for col in categorical_columns_initial if col not in categorical_columns_two_options]

    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # 'SeniorCitizen' is already a 0/1 flag: keep it out of the scaler and pass
    # it through, but only when the column actually exists in this frame.
    if 'SeniorCitizen' in numeric_columns:
        numeric_columns.remove('SeniorCitizen')
    passthrough_columns = [col for col in ['SeniorCitizen'] if col in df.columns]

    # Numeric pipeline (Imputation + Scaling)
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Two-level categoricals: a single ordinal-encoded column is enough
    categorical_two_opt_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder())
    ])

    # Multi-level categoricals: one-hot encode, dropping the first level
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first'))
    ])

    # sparse_threshold=0 forces a dense result; a sparse matrix cannot be
    # wrapped by pd.DataFrame(..., columns=...) below.
    full_pipeline = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_columns),
            ('cat_two_opt', categorical_two_opt_transformer, categorical_columns_two_options),
            ('cat', categorical_transformer, categorical_columns),
            ('passthrough', 'passthrough', passthrough_columns)
        ],
        sparse_threshold=0
    )

    # Apply the full pipeline to the data
    df_prepared = full_pipeline.fit_transform(df)

    # Get new column names after one-hot encoding. The encoder is only fitted
    # when there is at least one multi-level categorical column.
    if categorical_columns:
        onehot_encoder = full_pipeline.named_transformers_['cat'].named_steps['onehot']
        encoded_categorical_columns = list(onehot_encoder.get_feature_names_out(categorical_columns))
    else:
        encoded_categorical_columns = []

    # Column names in the same order the ColumnTransformer stacks its outputs
    all_columns = (
        numeric_columns
        + categorical_columns_two_options
        + encoded_categorical_columns
        + passthrough_columns
    )

    # Convert the transformed NumPy array back to a DataFrame
    return pd.DataFrame(df_prepared, columns=all_columns)

In [None]:
def transform_target(target_variable):
  """Ordinal-encode the target column(s).

  A fresh ``OrdinalEncoder`` is fitted on ``target_variable`` on every call
  and the array produced by ``fit_transform`` is returned.
  """
  return OrdinalEncoder().fit_transform(target_variable)

In [None]:
def transformALL(df):
  """Split, preprocess and rebalance the churn data.

  Steps: ``transformSplit`` -> ``transformX`` on each feature split ->
  ``transform_target`` on each target split -> SMOTEENN resampling of the
  training split only.

  NOTE(review): ``transformX`` and ``transform_target`` are called once per
  split, so each split is preprocessed independently of the others — confirm
  this is intended.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw customer table, forwarded to ``transformSplit``.

  Returns
  -------
  tuple
      ``(X_train, X_test, X_validation, y_train, y_test, y_validation)`` where
      the training pair has been resampled and the other splits are untouched.
  """
  X_train, X_test, X_validation, y_train, y_test, y_validation = transformSplit(df)

  X_train, X_test, X_validation = (
      transformX(features) for features in (X_train, X_test, X_validation)
  )
  y_train, y_test, y_validation = (
      transform_target(target) for target in (y_train, y_test, y_validation)
  )

  # Seed the resampler so repeated runs produce the same training set, in line
  # with the fixed random_state used for the splits and the models.
  sm = SMOTEENN(random_state=42)
  X_train, y_train = sm.fit_resample(X_train, y_train)
  return X_train, X_test, X_validation, y_train, y_test, y_validation

In [None]:
def returnBestAtt(df, n_att):
  """Prepare the data and keep the ``n_att`` best features by ANOVA F-score.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw customer table, forwarded to ``transformALL``.
  n_att : int or 'all'
      Number of features to keep. Values larger than the number of
      transformed features are capped to that number.

  Returns
  -------
  tuple
      ``(X_train_new, X_test_new, X_validation_new, y_train, y_test,
      y_validation, selected_feature_names)``. The ``X_*_new`` items are the
      outputs of ``SelectKBest``; ``selected_feature_names`` is taken from
      the training frame's columns.
  """
  from sklearn.feature_selection import SelectKBest, f_classif

  X_train, X_test, X_validation, y_train, y_test, y_validation = transformALL(df)

  # Callers derive n_att from the raw column count, which need not match the
  # number of transformed features, so never ask for more than are available.
  k = n_att if n_att == 'all' else min(n_att, X_train.shape[1])

  # The selector is fitted on the training split only
  selector = SelectKBest(score_func=f_classif, k=k)
  X_train_new = selector.fit_transform(X_train, y_train)

  # Names of the retained features
  selected_feature_names = X_train.columns[selector.get_support(indices=True)]

  # Apply the same column selection to the held-out splits
  X_test_new = selector.transform(X_test)
  X_validation_new = selector.transform(X_validation)

  return X_train_new, X_test_new, X_validation_new, y_train, y_test, y_validation, selected_feature_names

In [None]:
def useModel(model, param_grid, X_train, y_train, X_test, y_test):
  """Grid-search ``model`` with 5-fold CV and report metrics on ``X_test``.

  Prints the best parameters and CV score, then precision, recall, accuracy
  and the row-normalised confusion matrix of the refitted best estimator on
  ``(X_test, y_test)``.

  Returns
  -------
  The best estimator found by the grid search.
  """
  from sklearn.model_selection import GridSearchCV
  from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

  search = GridSearchCV(model, param_grid, cv=5)
  search.fit(X_train, y_train)
  print(f"Best Params: {search.best_params_}")
  print(f"Best Score: {search.best_score_}")

  winner = search.best_estimator_
  predictions = winner.predict(X_test)

  precision = precision_score(y_test, predictions)
  recall = recall_score(y_test, predictions)
  print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")
  print(f"Accuracy: {accuracy_score(y_test, predictions)}")
  print(f"Confusion Matrix: {confusion_matrix(y_test, predictions, normalize = 'true')}")

  return winner

In [None]:
def useModelBestThreshold(model, X_train, y_train, X_test, y_test):
  """Find the probability cut-off that maximises F1 on ``X_test`` and report metrics.

  Parameters
  ----------
  model : fitted classifier exposing ``predict_proba``.
  X_train, y_train : unused; kept for interface compatibility.
  X_test : array-like or DataFrame
      Features to score.
  y_test : array-like
      True labels for ``X_test``, expected to be encoded as 0/1 so they are
      comparable with the thresholded predictions. Flattened to 1-D.

  Returns
  -------
  None. The threshold, best F1, precision, recall, accuracy and the
  row-normalised confusion matrix are printed; column names are printed
  as well when ``X_test`` has them.
  """
  import numpy as np
  from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, precision_score, recall_score

  y_true = np.ravel(y_test)
  y_probs = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class
  precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)

  # precision_recall_curve returns one more precision/recall value than
  # thresholds (a closing point with no threshold). Drop it so every F1 score
  # maps to a real threshold and the argmax can never index out of range.
  precisions, recalls = precisions[:-1], recalls[:-1]

  # F1 for each threshold; 0/0 is defined as 0 instead of NaN, because NaN
  # would otherwise be picked by argmax.
  denominator = precisions + recalls
  f1_scores = np.divide(
      2 * precisions * recalls,
      denominator,
      out=np.zeros_like(denominator, dtype=float),
      where=denominator > 0,
  )
  best_index = int(np.argmax(f1_scores))
  optimal_threshold = thresholds[best_index]

  print(f'Optimal Threshold: {optimal_threshold}, Best F1 Score: {f1_scores[best_index]}')

  # Predict with the new threshold
  y_pred_opt = (y_probs >= optimal_threshold).astype(int)

  accuracy = accuracy_score(y_true, y_pred_opt)
  precision = precision_score(y_true, y_pred_opt)
  recall = recall_score(y_true, y_pred_opt)

  print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")
  print(f"Accuracy with best Threshold: {accuracy}")
  print(confusion_matrix(y_true, y_pred_opt, normalize = 'true'))

  # X_test is a plain ndarray when it comes out of SelectKBest.transform, so
  # only print column names when they exist.
  feature_names = getattr(X_test, 'columns', None)
  if feature_names is not None:
    print(feature_names)

In [None]:
def testAllAtributes(df, model, param_grid):
  """Tune ``model`` and evaluate it on the test split.

  Runs a single experiment with ``df.shape[1] - 1`` selected features:
  grid search via ``useModel``, five blank separator lines, then the
  F1-optimal threshold report from ``useModelBestThreshold``.
  """
  # Only one feature count is evaluated here: the raw column count minus one
  n_selected = df.shape[1] - 1

  X_train_new, X_test_new, _, y_train, y_test, _, _ = returnBestAtt(df, n_selected)
  best_model = useModel(model, param_grid, X_train_new, y_train, X_test_new, y_test)

  # Visual gap between the two reports
  for _ in range(5):
    print("")

  useModelBestThreshold(best_model, X_train_new, y_train, X_test_new, y_test)

In [None]:
# Logistic regression with balanced class weights; the grid varies C and the
# penalty type.
model = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
}

testAllAtributes(df2, model, param_grid)

In [None]:
from sklearn.svm import SVC

# Support vector classifier; probability=True is set because the threshold
# search calls predict_proba on the fitted model.
param_gridSVM = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
}
svm = SVC(class_weight='balanced', probability=True, random_state=42)

testAllAtributes(df2, svm, param_gridSVM)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours; the grid varies the neighbourhood size and the
# neighbour weighting scheme.
param_gridKNN = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()

testAllAtributes(df2, knn, param_gridKNN)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights; the grid varies the number of
# trees and the maximum tree depth.
param_gridRF = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

testAllAtributes(df2, rf, param_gridRF)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting; the grid varies the number of boosting stages and the
# learning rate.
param_gridGB = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.5],
}
gb = GradientBoostingClassifier(random_state=42)

testAllAtributes(df2, gb, param_gridGB)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost; the grid varies the number of estimators and the learning rate.
# NOTE(review): the `algorithm` argument is deprecated in recent scikit-learn
# releases — confirm against the installed version.
param_gridAB = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.5],
}
ab = AdaBoostClassifier(algorithm='SAMME', random_state=42)

testAllAtributes(df2, ab, param_gridAB)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree; the grid varies the maximum depth and the minimum
# number of samples required to split a node.
param_gridDT = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}
dt = DecisionTreeClassifier(random_state=42)

testAllAtributes(df2, dt, param_gridDT)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes is run with an empty parameter grid, so the grid search
# evaluates only the estimator's default settings.
nb = GaussianNB()
testAllAtributes(df2, nb, {})

In [None]:
def testAllAtributesValidation(df, model, param_grid):
  """Sweep the number of selected features and score on the validation split.

  For every k from 1 to ``df.shape[1] - 1`` the data is prepared again via
  ``returnBestAtt``, ``model`` is grid-searched with ``useModel`` and the
  F1-optimal threshold is reported by ``useModelBestThreshold``; both reports
  use the validation split and are separated by five blank lines.
  """
  feature_counts = range(1, df.shape[1])
  for n_selected in feature_counts:
    prepared = returnBestAtt(df, n_selected)
    X_train_new, _, X_validation_new, y_train, _, y_validation, _ = prepared

    best_model = useModel(model, param_grid, X_train_new, y_train, X_validation_new, y_validation)

    # Visual gap between the two reports
    for _ in range(5):
      print("")

    useModelBestThreshold(best_model, X_train_new, y_train, X_validation_new, y_validation)

In [None]:
# Validation-split sweep over the number of selected features for the SVM.
testAllAtributesValidation(df2, svm, param_gridSVM)

In [None]:
# Validation-split sweep over the number of selected features for the random forest.
testAllAtributesValidation(df2, rf, param_gridRF)

In [None]:
# Validation-split sweep over the number of selected features for gradient boosting.
testAllAtributesValidation(df2, gb, param_gridGB)

In [None]:
# Validation-split sweep over the number of selected features for AdaBoost.
testAllAtributesValidation(df2, ab, param_gridAB)

In [None]:
# Validation-split sweep over the number of selected features for the decision tree.
testAllAtributesValidation(df2, dt, param_gridDT)

In [None]:
# Validation-split sweep over the number of selected features for k-nearest neighbours.
testAllAtributesValidation(df2, knn, param_gridKNN)

In [None]:
# Validation-split sweep over the number of selected features for Gaussian
# naive Bayes (empty parameter grid).
testAllAtributesValidation(df2, nb, {})

In [None]:
# Validation-split sweep over the number of selected features for the logistic
# regression; `model` and `param_grid` are the ones defined in the earlier
# logistic-regression cell.
testAllAtributesValidation(df2, model, param_grid)