In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Default hyperparameters of a decision tree (the starting point for tuning)

dt_classifier = DecisionTreeClassifier()

# get_params() returns a {name: value} dict of the estimator's settings
hyperparameters = dt_classifier.get_params()

# Emit one "name: value" line per hyperparameter
print("Hyperparameters of DecisionTreeClassifier:")
for param in hyperparameters:
    value = hyperparameters[param]
    print(param, value, sep=": ")

Hyperparameters of DecisionTreeClassifier:
ccp_alpha: 0.0
class_weight: None
criterion: gini
max_depth: None
max_features: None
max_leaf_nodes: None
min_impurity_decrease: 0.0
min_samples_leaf: 1
min_samples_split: 2
min_weight_fraction_leaf: 0.0
monotonic_cst: None
random_state: None
splitter: best


In [3]:
# Default hyperparameters of LogisticRegression.
# LogisticRegression is imported in the notebook's top import cell, so this
# cell no longer carries its own mid-notebook import.
log_reg_classifier = LogisticRegression()

# get_params() returns a {name: value} dict of the estimator's settings
log_reg_hyperparameters = log_reg_classifier.get_params()

# Leading "\n" keeps a blank line before the heading in the cell output
print("\nHyperparameters of LogisticRegression:")
for param, value in log_reg_hyperparameters.items():
    print(f"{param}: {value}")


Hyperparameters of LogisticRegression:
C: 1.0
class_weight: None
dual: False
fit_intercept: True
intercept_scaling: 1
l1_ratio: None
max_iter: 100
multi_class: deprecated
n_jobs: None
penalty: l2
random_state: None
solver: lbfgs
tol: 0.0001
verbose: 0
warm_start: False


In [4]:
# Default hyperparameters of LinearRegression.
# LinearRegression is imported in the notebook's top import cell, so this
# cell no longer carries its own mid-notebook import.
# NOTE: LinearRegression is a regressor, not a classifier. The variable name
# `linear_reg_classifier` is kept unchanged so any cell referring to it still works.
linear_reg_classifier = LinearRegression()

# get_params() returns a {name: value} dict of the estimator's settings
linear_reg_hyperparameters = linear_reg_classifier.get_params()

# Leading "\n" keeps a blank line before the heading in the cell output
print("\nHyperparameters of LinearRegression:")
for param, value in linear_reg_hyperparameters.items():
    print(f"{param}: {value}")


Hyperparameters of LinearRegression:
copy_X: True
fit_intercept: True
n_jobs: None
positive: False


In [5]:
# Load the churn data and build the train/test split used by the
# GridSearchCV hyperparameter search in the following cell.
# (GridSearchCV is imported in the notebook's top import cell.)

data = pd.read_csv('bigml_59c28831336c6604c800002a.csv')

# Features: everything except the target ('churn') and the four columns
# that are left out of this model.
X = data.drop(columns=['churn', 'voice mail plan', 'international plan', 'phone number', 'state'])
y = data['churn']

# 80/20 split with a fixed seed so the split is reproducible.
# NOTE(review): the split is not stratified on y - consider stratify=y if
# the churn classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preview goes last: a notebook cell only renders its final expression, so
# a bare data.head() in the middle of the cell was silently discarded.
data.head()



In [9]:
# Tune a decision tree with an exhaustive grid search
dt_classifier = DecisionTreeClassifier(random_state=42)

# Candidate values to try for each hyperparameter
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[2, 3, 4],
)

# 5-fold cross-validated search, scored on accuracy
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training portion only
grid_search.fit(X_train, y_train)

# Report the winning combination, one "name: value" line each
best_hyperparameters = grid_search.best_params_
print("\nBest hyperparameters for Decision Tree Classifier:")
for param in best_hyperparameters:
    value = best_hyperparameters[param]
    print(param, value, sep=": ")


Best hyperparameters for Decision Tree Classifier:
criterion: gini
max_depth: 5
min_samples_leaf: 2
min_samples_split: 5


Pipeline


In [17]:
# Pipeline: preprocessing (scaling + one-hot encoding) and a decision tree,
# tuned together with GridSearchCV so the preprocessing is re-fit per CV fold.
# (Pipeline, ColumnTransformer, StandardScaler, OneHotEncoder and GridSearchCV
# are imported in the notebook's top import cell.)

churn_data = pd.read_csv('bigml_59c28831336c6604c800002a.csv')

# Separate target and features. The target must not remain in X: previously
# 'churn' was left in the frame and stayed out of the model only if its dtype
# happened not to be int64/float64 (ColumnTransformer drops unlisted columns
# by default). Dropping it explicitly removes that latent label leak.
X = churn_data.drop(columns=['state', 'phone number', 'churn'])
y = churn_data['churn']

# Column groups routed to each transformer
categorical_features = ['international plan', 'voice mail plan']
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

numerical_transformer = StandardScaler()
# handle_unknown='ignore': a category absent from a training fold no longer
# raises an error when it appears in the corresponding validation fold.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', DecisionTreeClassifier(random_state=42))])

# Hyperparameter candidates; the 'classifier__' prefix addresses the
# 'classifier' step of the pipeline.
param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [2, 3, 4],
}

# 5-fold cross-validated search, scored on accuracy
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           scoring='accuracy', cv=5)

# The search is fit on the full dataset, so best_score_ below is a
# cross-validation estimate, not a score on a held-out test set.
grid_search.fit(X, y)

# Best hyperparameters, one "name: value" line each
best_hyperparameters = grid_search.best_params_
print("\nBest hyperparameters for Decision Tree Classifier with pipeline:")
for param, value in best_hyperparameters.items():
    print(f"{param}: {value}")

# Mean cross-validated accuracy of the best combination
best_score = grid_search.best_score_
print(f"\nBest score: {best_score}")

# Same dict as above, printed in its raw form; the name `best_params` is kept
# for any cell that refers to it.
best_params = best_hyperparameters
print(f"\nBest parameters: {best_params}")



Best hyperparameters for Decision Tree Classifier with pipeline:
classifier__criterion: gini
classifier__max_depth: 10
classifier__min_samples_leaf: 2
classifier__min_samples_split: 10

Best score: 0.9432981707344525

Best parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10}
