In [None]:
# For data manipulation
import pandas as pd
import numpy as np

# For model training and evaluation
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
import pydotplus
%matplotlib inline

# For Google Colab integration
import os
from google.colab import drive

# Mount Google Drive at /content/drive so the notebook can read files stored there.
drive.mount('/content/drive')

In [None]:
# Location of the churn dataset on the mounted Google Drive
file_path = '/content/drive/MyDrive/Infor648/Data/churn.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows of the data
df.head()

In [None]:
# Report how many missing values each column contains
missing_per_column = df.isna().sum()
display(missing_per_column)

# Discard every row that has at least one missing value
df = df.dropna(axis=0, how="any")

In [None]:
# Re-count missing values per column to confirm the drop worked
remaining_missing = df.isna().sum()
display(remaining_missing)

In [None]:
# Numeric variables: every non-object column except the target variable.
# The target is excluded with an equality test; `col not in "Customer Status"`
# would be a substring test against the string and would also drop any column
# whose name happens to be a fragment of it (e.g. "Status" or "Customer").
numeric_variables = [
    col for col in df.columns
    if df[col].dtype != "object" and col != "Customer Status"
]
numeric_variables

# Examine correlation among independent variables

In [None]:
# Numeric predictors to screen for pairwise correlation and multicollinearity
correlation_columns = [
    "Age",
    "Number of Dependents",
    "Total Charges",
    "Monthly Charge",
    "Total Refunds",
    "Tenure in Months",
    "Number of Referrals",
]
df_test = df[correlation_columns]

In [None]:
# Pairwise correlations between the selected predictors
corr_matrix = df_test.corr()

# Annotated heatmap of the correlation matrix (change the figure size here)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(corr_matrix, cmap='Blues', annot=True, ax=ax)
plt.show()

# Multicollinearity check

VIF stands for Variance Inflation Factor. It measures how much the variance of a regression coefficient is inflated due to multicollinearity with other variables in the model.

It is used to detect the presence of multicollinearity in a regression analysis. Multicollinearity occurs when two or more predictor variables (independent variables) in a regression model are highly correlated, meaning that they provide redundant information and affect the reliability of the regression coefficients.

VIF = 1: No multicollinearity. The predictor is not correlated with any other variables.

VIF between 1 and 5: Moderate multicollinearity. Generally acceptable, though closer to 5 might be a concern.

VIF > 5: High multicollinearity. The predictor is highly correlated with other predictors, which may affect the reliability of the coefficient estimates.

VIF > 10: Severe multicollinearity. The predictor is very highly correlated with other variables, and steps should be taken to reduce multicollinearity (e.g., removing one of the correlated variables).

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance Inflation Factor (VIF) for every predictor in df_test.
#
# variance_inflation_factor regresses one column on all other columns of the
# matrix it receives and does not add an intercept itself. Without a constant
# column those auxiliary regressions are forced through the origin, which
# inflates the VIF of variables whose mean is far from zero. The constant is
# therefore added to the design matrix but left out of the reported table.
vif_design = sm.add_constant(df_test)

vif_data = pd.DataFrame({
    "feature": df_test.columns,
    "VIF": [
        # look the column up by name so the added constant cannot shift indices
        variance_inflation_factor(vif_design.values, vif_design.columns.get_loc(col))
        for col in df_test.columns
    ],
})

print(vif_data)

In [None]:
# Same predictors as before, with "Total Charges" left out
df_test_2 = df[["Age", "Number of Dependents","Monthly Charge","Total Refunds", "Tenure in Months", "Number of Referrals"]]

# Variance Inflation Factor (VIF) for every predictor in df_test_2.
# variance_inflation_factor does not add an intercept to the matrix it is given,
# so a constant column is included in the design matrix (and excluded from the
# reported table); otherwise the auxiliary regressions run through the origin
# and overstate the VIF of variables whose mean is far from zero.
vif_design_2 = sm.add_constant(df_test_2)

vif_data = pd.DataFrame({
    "feature": df_test_2.columns,
    "VIF": [
        # look the column up by name so the added constant cannot shift indices
        variance_inflation_factor(vif_design_2.values, vif_design_2.columns.get_loc(col))
        for col in df_test_2.columns
    ],
})

print(vif_data)

# Select our features and our target variable

In [None]:
# Modelling subset: the retained predictors followed by the target column
model_columns = [
    "Age",
    "Number of Dependents",
    "Monthly Charge",
    "Total Refunds",
    "Tenure in Months",
    "Number of Referrals",
    "Customer Status",
]
df_sub = df[model_columns]

In [None]:
from sklearn.preprocessing import LabelEncoder

# We are only interested in why people stayed or churned, so all new
# ("Joined") customers are dropped. The explicit .copy() makes df_sub an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning or depend on whether the filter returned a view.
df_sub = df_sub[df_sub['Customer Status'] != 'Joined'].copy()

# Encode the target variable as integers. LabelEncoder numbers the classes in
# sorted order of their labels; the mapping printed below shows which status
# became which code.
target_label_encoder = LabelEncoder()
df_sub['Customer Status'] = target_label_encoder.fit_transform(df_sub['Customer Status'])


# Display the class counts after encoding, then the label -> code mapping
display(df_sub['Customer Status'].value_counts())
mapping = dict(zip(target_label_encoder.classes_, target_label_encoder.transform(target_label_encoder.classes_)))
print(mapping)

# Define features (X) and target (y)

In [None]:
from sklearn.preprocessing import StandardScaler

# Independent variables: every column except the target
X = df_sub.drop(columns=['Customer Status'])
# Dependent variable: the encoded target column
y = df_sub['Customer Status']

feature_names = list(X.columns)

# Original label of each encoded class, listed in class-code order
class_codes = np.arange(len(target_label_encoder.classes_))
class_names = target_label_encoder.inverse_transform(class_codes)

# Print the features selected for prediction and the classification target
print("features:", feature_names)
print("Classes:", class_names)

In [None]:
# Split the dataset into training and testing sets (70% training, 30% testing).
# The split happens BEFORE scaling so the scaler never sees the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardize the independent variables using statistics learned from the
# training split only; fitting the scaler on the full data would leak the
# test set's means and variances into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Later cells use X as a standardized array, so keep it on the training scale.
X = scaler.transform(X)

# Train our logit model

In [None]:
# Fit a logistic regression classifier on the training split
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Predicted class for every observation in the test split
y_pred = logistic_model.predict(X_test)

# Gather the evaluation metrics as (name, value) pairs, then tabulate them
metric_rows = [
    ("Train Accuracy", logistic_model.score(X_train, y_train)),
    ("Test Accuracy", accuracy_score(y_test, y_pred)),
    ("Recall", recall_score(y_test, y_pred)),
    ("Precision", precision_score(y_test, y_pred)),
    ("F1 Score", f1_score(y_test, y_pred)),
]
evaluation_metrics = pd.DataFrame(metric_rows, columns=["Evaluation Metric", "Value"])

# Show the metrics table
print("Logistic Regression Evaluation Metrics:")
display(evaluation_metrics)


In [None]:
# Cross-tabulate the true and predicted classes of the test split
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated seaborn heatmap (adjust figsize if needed)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('True Class')
ax.set_xlabel('Predicted Class')
ax.set_title('Confusion Matrix for Logistic Regression Model')
plt.show()

In [None]:
from yellowbrick.classifier import ClassificationReport

# Classification report visualizer wrapped around the logistic regression model
visualizer = ClassificationReport(
    logistic_model,
    classes=class_names,
    support=False,
    title="Logistic Regression Classifier Evaluation",
)

# Fit on the training split, then score on the test split
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)

# Render the plot
visualizer.show()


# Fitting the model with statsmodels provides a more detailed statistical summary, including p-values and other key metrics.

In [None]:
# Train the Logistic Regression model using statsmodels.
# sm.Logit does not add an intercept on its own, so a constant column is
# prepended explicitly. Without it the model is forced through the origin and
# its coefficients are not comparable with sklearn's LogisticRegression, which
# fits an intercept by default.
X_train_sm = sm.add_constant(X_train)
model_sm = sm.Logit(y_train, X_train_sm)
results = model_sm.fit(method="newton")

# Create a summary with the feature names ("const" labels the intercept column)
features = feature_names
summary_names = ["const"] + features
print(results.summary2(xname=summary_names))


# How to read the summary (use p-values to judge statistical significance):
# - LLR p-value: overall significance of the logit regression model.
# - p-value of each feature: significance of that feature for prediction.
#   Common thresholds are p < 0.01, p < 0.05 and p < 0.10; with p >= 0.10 there
#   is no evidence of statistical significance.
# - LL-Null: log-likelihood of the baseline reference model.
# - Pseudo R-squared: how much better the fitted model performs than that baseline.

In [None]:
# First row of the fitted sklearn model's coefficient matrix: one value per feature
coefficients = logistic_model.coef_[0]

# Pair each feature with its coefficient and its odds ratio (e^coefficient),
# ordered from the largest odds ratio to the smallest
feature_impact = (
    pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': coefficients,
        'Odds Ratio': np.exp(coefficients),
    })
    .sort_values(by='Odds Ratio', ascending=False)
)
display(feature_impact)

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# X was standardized earlier in the notebook using rows that also end up in the
# held-out folds, so a bare LogisticRegression would be evaluated with leaked
# scaling statistics. Putting the scaler inside the pipeline re-fits it on each
# fold's training rows only. Standardization is unaffected by the earlier
# rescaling of X, so this is equivalent to scaling the raw features per fold.
model_cv = make_pipeline(StandardScaler(), LogisticRegression())

# Perform 10-fold cross-validation and get the aggregated out-of-fold predictions
y_pred_cross = cross_val_predict(model_cv, X, y, cv=10)

# Calculate evaluation metrics on the out-of-fold predictions
accuracy_cv = accuracy_score(y, y_pred_cross)
recall_cv = recall_score(y, y_pred_cross)
precision_cv = precision_score(y, y_pred_cross)
f1_cv = f1_score(y, y_pred_cross)
matrix_cv = confusion_matrix(y, y_pred_cross)

# Create a DataFrame for evaluation metrics with cross-validation
evaluation_metrics_cv = pd.DataFrame({
    "Evaluation Metric": ["Accuracy", "Recall", "Precision", "F1 Score"],
    "Value": [accuracy_cv, recall_cv, precision_cv, f1_cv]
})

print("Performance Metrics With Cross-Validation:")
display(evaluation_metrics_cv)


print("\nConfusion Matrix - Cross Validation:\n", matrix_cv)



# Display the evaluation metrics without CV (single train/test split) for comparison
print("\nLogistic Regression Evaluation Metrics without CV:")
display(evaluation_metrics)
print("\nConfusion Matrix - Without CV:\n", conf_matrix)

