In [33]:
# Read the raw churn export from CSV. Empty fields and the literal "NA" are
# both treated as missing, and text columns are loaded as factors.
Customer_Churn <- read.csv(
  file = "/content/Churn dataset.csv",
  na.strings = c("", "NA"),
  stringsAsFactors = TRUE
)

# Print per-column summary statistics for a first look at the data
summary(Customer_Churn)

# Keep only columns with at most 500,000 missing values.
# drop = FALSE keeps the result a data frame even if only one column survives;
# without it the subset would collapse to a bare vector.
Customer_Churn <- Customer_Churn[
  ,
  colSums(is.na(Customer_Churn)) <= 500000,
  drop = FALSE
]

# Remove specific columns (term_reas_code, term_reas_desc, X).
# Names that are not present are simply ignored by %in%.
Customer_Churn <- Customer_Churn[
  ,
  !names(Customer_Churn) %in% c("term_reas_code", "term_reas_desc", "X"),
  drop = FALSE
]

# Remove rows with any missing values in the remaining columns
Clean_Customer_Churn <- na.omit(Customer_Churn)

# Bin contract_month into 12-month bands. Intervals are right-closed (cut()'s
# default), and everything above 60 months falls into the last band.
# cut() already returns a factor, so no further conversion is needed.
Clean_Customer_Churn$Contract_Month_Cat <- cut(
  x = Clean_Customer_Churn$contract_month,
  breaks = c(-Inf, seq(from = 12, to = 60, by = 12), Inf),
  labels = c(
    "0 - 12 Month", "12 - 24 Month", "24 - 36 Month",
    "36 - 48 Month", "48 - 60 Month", "60 - 120 Month"
  ),
  include.lowest = TRUE
)

# Bin tenure into 72-month bands: up to 72, above 72 up to 144, and above 144.
# Intervals are right-closed (cut()'s default); the result is already a factor.
Clean_Customer_Churn$tenure_Cat <- cut(
  x = Clean_Customer_Churn$tenure,
  breaks = c(-Inf, 72, 144, Inf),
  labels = c("0 - 72 Month", "72 - 144 Month", ">144 Month"),
  include.lowest = TRUE
)

# Drop the columns that are no longer wanted: the two raw variables that were
# just binned (contract_month, tenure) plus serv_type and bill_cycl.
Clean_Customer_Churn <- Clean_Customer_Churn[
  ,
  !(names(Clean_Customer_Churn) %in%
    c("contract_month", "tenure", "serv_type", "bill_cycl"))
]

# Save the cleaned data to a new CSV file.
# Fields are quoted (the write.csv default) so that values containing commas
# or quote characters cannot shift columns when the file is read back.
write.csv(Clean_Customer_Churn, "/content/cleandataset.csv", row.names = FALSE)

# Validation: reload the cleaned data and confirm the round trip preserved the
# number of rows and columns before replacing the in-memory copy.
# local() keeps the temporary `reloaded` object out of the global workspace.
Clean_Customer_Churn <- local({
  reloaded <- read.csv("/content/cleandataset.csv", stringsAsFactors = TRUE)
  stopifnot(identical(dim(reloaded), dim(Clean_Customer_Churn)))
  reloaded
})


     image                         newacct_no     line_stat     bill_cycl
 Min.   :201801   70068143.001.000000062:    24   AC:494338   Min.   :1  
 1st Qu.:201807   70068143.003.000072630:    24   CN:  5503   1st Qu.:1  
 Median :201901   70071840.001.000000066:    24   IS: 10284   Median :1  
 Mean   :201858   70071840.003.000060034:    24               Mean   :1  
 3rd Qu.:201907   70082185.001.000000083:    24               3rd Qu.:1  
 Max.   :201912   70101548.001.000000103:    24               Max.   :1  
                  (Other)               :509981                          
 serv_type       serv_code          tenure                  effc_strt_date  
 BBS:510125   VS100DUN: 94144   Min.   :  0.0   01.08.2018 00:00:00:  4185  
              VS100DUR: 70542   1st Qu.: 18.0   01.06.2019 00:00:00:  4159  
              PF100DUN: 36517   Median : 45.0   01.11.2019 00:00:00:  3784  
              VS100FSN: 33449   Mean   : 60.7   01.08.2019 00:00:00:  3748  
              VS100FSR: