# R Visualization for Telco Customer Churn Analysis
## CC6058ES Big Data and Visualisation Coursework

**Student ID:** E285181  
**Date:** December 2025

This notebook demonstrates R visualization techniques for the Telco Customer Churn dataset, including decision trees, Naive Bayes classification, and clustering visualizations.

In [None]:
# Load required libraries
library(tidyverse)
library(caret)
library(rpart)
library(rpart.plot)
library(e1071)  # For Naive Bayes
library(cluster)
library(factoextra)
library(ggplot2)
library(gridExtra)

# Set working directory
# The absolute path below only exists on the original author's machine, so it
# is applied only when present. Elsewhere the notebook runs from the current
# working directory, which must then contain the data/raw folder.
project_dir <- "C:/Users/DELL/OneDrive/Desktop/BDV Coursework/New folder/Telco_Churn_Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

# Load the dataset
# Fail early with the resolved location instead of read.csv's generic
# "cannot open file" message.
data_path <- file.path("data", "raw", "Telco_Customer_Churn_Dataset.csv")
if (!file.exists(data_path)) {
  stop(
    "Dataset not found: '", data_path,
    "' (working directory: ", getwd(), ")",
    call. = FALSE
  )
}
data <- read.csv(data_path)

# Display first few rows
head(data)

# Summary statistics
summary(data)

In [None]:
# Data Preprocessing ----

# Target variable: store Churn as a factor
data[["Churn"]] <- as.factor(data[["Churn"]])

# Fill missing TotalCharges values with the median of the observed values
data$TotalCharges[is.na(data$TotalCharges)] <-
  median(data$TotalCharges, na.rm = TRUE)

# Categorical predictors that are converted to factors in one pass
categorical_cols <- c(
  "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
  "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
  "TechSupport", "StreamingTV", "StreamingMovies", "Contract",
  "PaperlessBilling", "PaymentMethod"
)
data[categorical_cols] <- lapply(data[categorical_cols], as.factor)

# SeniorCitizen is handled separately from the list above, also as a factor
data[["SeniorCitizen"]] <- as.factor(data[["SeniorCitizen"]])

# Columns kept for modelling: the predictors followed by the Churn target
features <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "tenure",
  "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity",
  "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV",
  "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod",
  "MonthlyCharges", "TotalCharges", "Churn"
)
data_model <- data[features]

# Train/test split: 70% of rows go to training, the remainder to testing.
# The seed fixes which rows are drawn.
set.seed(123)
trainIndex <- createDataPartition(
  y = data_model$Churn,
  p = 0.7,
  list = FALSE
)
trainData <- data_model[trainIndex, ]
testData <- data_model[-trainIndex, ]

# Report completion and the dimensions of both partitions
print("Data preprocessing completed")
print(dim(trainData))
print(dim(testData))

In [None]:
# Decision Tree Classification ----

# Fit a classification tree on the training partition; tree growth is limited
# by the minsplit / minbucket / maxdepth settings passed to rpart.control().
dt_model <- rpart(
  formula = Churn ~ .,
  data = trainData,
  method = "class",
  control = rpart.control(minsplit = 20, minbucket = 7, maxdepth = 5)
)

# Complexity-parameter table of the fitted tree
printcp(dt_model)

# Draw the fitted tree
rpart.plot(
  dt_model,
  type = 4,
  extra = 101,
  under = TRUE,
  fallen.leaves = TRUE,
  main = "Decision Tree for Customer Churn Prediction",
  box.palette = "RdYlGn"
)

# Predicted class labels for the held-out rows
dt_predictions <- predict(dt_model, newdata = testData, type = "class")

# Predicted vs. actual churn on the test partition
dt_cm <- confusionMatrix(data = dt_predictions, reference = testData$Churn)
print(dt_cm)

# Variable importance as reported by caret for the fitted tree
dt_importance <- varImp(object = dt_model)
print(dt_importance)

In [None]:
# Naive Bayes Classification ----

# Train Naive Bayes model on the training partition
nb_model <- naiveBayes(Churn ~ ., data = trainData)

# Print model summary
print(nb_model)

# Predicted class labels for the held-out rows
nb_predictions <- predict(nb_model, testData)

# Confusion matrix: predicted vs. actual churn on the test partition
nb_cm <- confusionMatrix(nb_predictions, testData$Churn)
print(nb_cm)

# Visualize prediction probabilities
# type = "raw" returns one probability column per class. The churn column is
# selected by its name ("Yes") rather than by position, so the plot cannot
# silently show the wrong class if the factor level order ever changes.
nb_prob <- predict(nb_model, testData, type = "raw")
prob_df <- data.frame(
  Actual = testData$Churn,
  Prob_Churn = nb_prob[, "Yes"]
)

ggplot(prob_df, aes(x = Prob_Churn, fill = Actual)) +
  geom_histogram(alpha = 0.7, bins = 30) +
  labs(
    title = "Naive Bayes Prediction Probabilities",
    x = "Probability of Churn",
    y = "Count"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("No" = "green", "Yes" = "red"))

In [None]:
# K-Means Clustering ----

# Select numerical features for clustering
numeric_features <- c("tenure", "MonthlyCharges", "TotalCharges")
cluster_data <- data[, numeric_features]

# Scale the data so each feature contributes on a comparable scale
cluster_data_scaled <- scale(cluster_data)

# Determine optimal number of clusters using elbow method.
# kmeans() uses random starting centres, so the seed is fixed here to make the
# elbow curve reproducible regardless of what ran earlier in the session.
# vapply() guarantees a numeric vector with one value per k.
k_values <- 1:10
set.seed(123)
wss <- vapply(
  k_values,
  function(k) {
    kmeans(cluster_data_scaled, centers = k, nstart = 10)$tot.withinss
  },
  numeric(1)
)

# Plot elbow curve (integer breaks, since k is a count)
elbow_df <- data.frame(k = k_values, wss = wss)
ggplot(elbow_df, aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = k_values) +
  labs(
    title = "Elbow Method for Optimal K",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()

# Perform K-means clustering with k=3
# The seed is reset so the final clustering does not depend on the elbow loop.
set.seed(123)
kmeans_result <- kmeans(cluster_data_scaled, centers = 3, nstart = 10)

# Add cluster labels to original data
data$Cluster <- as.factor(kmeans_result$cluster)

# Visualize clusters (stand = FALSE because the data is already scaled)
fviz_cluster(
  kmeans_result,
  data = cluster_data_scaled,
  geom = "point",
  stand = FALSE,
  main = "K-Means Clustering of Customers"
)

# Analyze cluster characteristics: size, average numeric features and the
# percentage of customers with Churn == "Yes" in each cluster
cluster_summary <- data %>%
  group_by(Cluster) %>%
  summarise(
    Count = n(),
    Avg_Tenure = mean(tenure),
    Avg_MonthlyCharges = mean(MonthlyCharges),
    Avg_TotalCharges = mean(TotalCharges),
    Churn_Rate = mean(Churn == "Yes") * 100
  )

print(cluster_summary)

# Visualize cluster profiles: one facet per metric, each with its own y scale
cluster_long <- cluster_summary %>%
  select(
    Cluster, Avg_Tenure, Avg_MonthlyCharges, Avg_TotalCharges, Churn_Rate
  ) %>%
  pivot_longer(cols = -Cluster, names_to = "Metric", values_to = "Value")

ggplot(cluster_long, aes(x = Cluster, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~Metric, scales = "free_y") +
  labs(
    title = "Customer Cluster Profiles",
    x = "Cluster",
    y = "Value"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Model Comparison and Evaluation

## Performance Metrics Comparison

```{r}
# Compare model accuracies
models <- c("Decision Tree", "Naive Bayes")

# dt_cm and nb_cm were built without a `positive` argument, so caret used the
# first factor level ("No") as the positive class. Their Precision / Recall /
# F1 therefore describe non-churners. Re-derive the per-class metrics from the
# stored contingency tables with "Yes" (churn) as the positive class, so the
# comparison reflects how well each model identifies churners.
dt_cm_churn <- confusionMatrix(dt_cm$table, positive = "Yes")
nb_cm_churn <- confusionMatrix(nb_cm$table, positive = "Yes")

# Accuracy does not depend on the positive class. [[ ]] extracts plain,
# unnamed numbers so the data frame gets no duplicated element names.
accuracies <- c(
  dt_cm$overall[["Accuracy"]],
  nb_cm$overall[["Accuracy"]]
)
precision <- c(
  dt_cm_churn$byClass[["Precision"]],
  nb_cm_churn$byClass[["Precision"]]
)
recall <- c(
  dt_cm_churn$byClass[["Recall"]],
  nb_cm_churn$byClass[["Recall"]]
)
f1 <- c(
  dt_cm_churn$byClass[["F1"]],
  nb_cm_churn$byClass[["F1"]]
)

comparison_df <- data.frame(
  Model = models,
  Accuracy = accuracies,
  Precision = precision,
  Recall = recall,
  F1_Score = f1
)

print(comparison_df)

# Visualize comparison: one facet per metric, one bar per model
comparison_long <- comparison_df %>%
  pivot_longer(cols = -Model, names_to = "Metric", values_to = "Value")

ggplot(comparison_long, aes(x = Model, y = Value, fill = Model)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Metric) +
  labs(
    title = "Model Performance Comparison",
    x = "Model",
    y = "Score"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

## Summary

This notebook demonstrates R's capabilities for statistical modeling and visualization:

1. **Decision Trees**: Provide interpretable rules for churn prediction with good accuracy
2. **Naive Bayes**: Fast probabilistic classification suitable for categorical features
3. **K-Means Clustering**: Unsupervised learning to segment customers into meaningful groups

R excels at statistical analysis and model interpretability, making it ideal for exploratory data analysis and model development in business contexts.