In [None]:
# Read the wholesale customers CSV and drop the Channel and Region columns,
# then show the resulting dimensions.
customer_df <- read.csv("Wholesale_customers_data.csv")
customer_df <- customer_df[
    , setdiff(names(customer_df), c("Channel", "Region")),
    drop = FALSE
]
dim(customer_df)

# Correlation and Redundancy

I claim that there is correlation and redundancy in the `customer_df` table. What I mean by this is that some features can be predicted, at least approximately, from the other features — in the simplest case as linear combinations of them.

Let's examine redundancy by dropping a feature and seeing if the other features can predict it.

In [None]:
# Install rpart only when it is not already available, so that re-running
# the notebook does not reinstall the package every time.
if (!requireNamespace("rpart", quietly = TRUE)) {
    install.packages("rpart")
}

In [None]:
library(caret)
library(rpart)

In [None]:
#' Coefficient of determination (R^2) of predictions against actual values.
#'
#' @param actual Numeric vector of observed values.
#' @param prediction Numeric vector of predicted values.
#' @return One minus the ratio of the residual sum of squares to the total
#'   sum of squares of `actual` around its mean.
calculate_r_2 <- function(actual, prediction) {
    residual_ss <- sum((actual - prediction)^2)
    total_ss <- sum((actual - mean(actual))^2)
    1 - residual_ss / total_ss
}

#' Estimate how well one feature can be predicted from all the others.
#'
#' Randomly splits `data` into a training part (80% of the rows, rounded
#' down) and a test part (the remaining rows), fits an `rpart` model of
#' `feature` on every other column using the training rows, and scores the
#' test-set predictions with `calculate_r_2`.
#'
#' @param data Data frame containing `feature` and the predictor columns.
#' @param feature Name of the column to predict, as a single string.
#' @return The R^2 of the fitted model on the held-out rows. The value
#'   changes from call to call because the split is random.
calculate_r_2_for_feature <- function(data, feature) {
    # Fail early with a clear message: a misspelled column name would
    # otherwise only surface inside the model-fitting code, or be looked up
    # as an unrelated variable through the formula's environment.
    stopifnot(
        is.data.frame(data),
        is.character(feature),
        length(feature) == 1,
        feature %in% names(data)
    )

    n_rows <- nrow(data)

    # Round the training-set size down explicitly instead of passing a
    # fractional size to sample().
    train_rows <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
    # Select the complement by position: negative indexing with an empty
    # index vector would select no rows at all.
    test_rows <- setdiff(seq_len(n_rows), train_rows)

    train <- data[train_rows, , drop = FALSE]
    test <- data[test_rows, , drop = FALSE]

    model_formula <- as.formula(paste(feature, "~ ."))
    fit <- rpart(formula = model_formula, data = train)

    actual <- as.vector(test[[feature]])
    # Predict from the other columns only; the target is held back.
    predictors <- test[, setdiff(names(test), feature), drop = FALSE]
    predicted <- predict(fit, predictors)

    calculate_r_2(actual, predicted)
}

In [None]:
# R^2 for predicting Detergents_Paper from the remaining columns, using one
# random train/test split.
calculate_r_2_for_feature(data = customer_df, feature = "Detergents_Paper")

In [None]:
# Print the held-out R^2 of each feature, using one random split per feature.
# Each label is built from the column name itself, so it cannot drift from
# the feature actually being scored (one label used to be misspelled).
invisible(lapply(
    c("Delicatessen", "Detergents_Paper", "Fresh", "Frozen", "Grocery", "Milk"),
    function(feature_name) {
        score <- calculate_r_2_for_feature(customer_df, feature_name)
        print(paste(paste0(feature_name, ": "), score))
    }
))

But this is subject to randomness: the train/test split inside `calculate_r_2_for_feature` is drawn at random, so every run gives a different score. Let's do the whole thing many times and take the average.

In [None]:
#' Average the held-out R^2 of a feature over repeated random splits.
#'
#' @param data Data frame passed through to `calculate_r_2_for_feature`.
#' @param feature Name of the column to predict.
#' @param n_runs Number of random splits to average over. Defaults to 100,
#'   the value that was previously hard-coded.
#' @return The mean of the `n_runs` R^2 scores.
mean_r2_for_feature <- function(data, feature, n_runs = 100) {
    stopifnot(is.numeric(n_runs), length(n_runs) == 1, n_runs >= 1)

    # vapply() builds the result in one go and checks that every run yields
    # a single number, instead of growing a vector with c() on each pass.
    scores <- vapply(
        seq_len(n_runs),
        function(run) calculate_r_2_for_feature(data, feature),
        numeric(1)
    )

    mean(scores)
}

In [None]:
# Print the averaged R^2 of each feature, in the listed order.
invisible(lapply(
    c("Delicatessen", "Detergents_Paper", "Fresh", "Frozen", "Grocery", "Milk"),
    function(feature_name) {
        averaged <- mean_r2_for_feature(customer_df, feature_name)
        print(paste(paste0(feature_name, ": "), averaged))
    }
))

In [None]:
# Print the averaged R^2 of each feature again, in the same order, from a
# fresh set of random splits.
invisible(lapply(
    c("Delicatessen", "Detergents_Paper", "Fresh", "Frozen", "Grocery", "Milk"),
    function(feature_name) {
        averaged <- mean_r2_for_feature(customer_df, feature_name)
        print(paste(paste0(feature_name, ": "), averaged))
    }
))

#### Discussion

What does this tell us?

## Visualize Redundancy

Study the correlation of the data.

In [None]:
# Scatterplot matrix of every pair of columns in customer_df.
graphics::pairs(customer_df)

In [None]:
# Pairwise Pearson correlations between the columns of customer_df.
cor(customer_df, method = "pearson")

In [None]:
library(reshape2)
# Correlation matrix of the columns of customer_df, stored for later cells.
cormat <- stats::cor(customer_df)

In [None]:
# Set every entry strictly below the diagonal to NA.
is.na(cormat) <- lower.tri(cormat)

In [None]:
# Reshape the matrix with melt(), dropping the NA cells. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
melted_cormat <- melt(cormat, na.rm = TRUE)

In [None]:
library(ggplot2)
# Heatmap of the melted correlation values on a blue-white-red scale with
# its midpoint at 0 and its range fixed to [-1, 1].
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(
        low = "blue", high = "red", mid = "white",
        midpoint = 0,
        # `limits` is written out in full rather than relying on partial
        # argument matching of a shortened name.
        limits = c(-1, 1),
        space = "Lab",
        name = "Pearson\nCorrelation"
    ) +
    theme_minimal() +
    theme(
        axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)
    ) +
    coord_fixed()