<a href="https://colab.research.google.com/github/Rozieyati/Data-Science-Project/blob/main/STQD6134_GroupA_Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

STQD6134 : Business Analytics Project 1 - Group A

Scenario:
You are a Business Analyst at Streamify, a digital streaming company that offers on-demand video content through monthly and yearly subscriptions.

Problem Statement:

1. To  understand how different customer segments and subscription plans affect revenue and customer retention

2. To perform an analysis on subscription data from the last year and provide insights on:

  *   Subscription and cancellation trends
  *   Revenue performance by plan type and region
  *   Customer engagement metrics

In [1]:
#data simulation, to generate dataset with sample size 2000
n <- 2000

#Attributes
CustomerID <- paste0("C", sprintf("%04d", 1:n))
JoinDate <- sample(seq(as.Date('2024-01-01'),as.Date('2024-12-31'), by="day"), n, replace = TRUE)
ActiveMonths <- sample(1:12, n, replace = TRUE)
library(dplyr)

CancelDate <- if_else(runif(n) < 0.25, JoinDate + ActiveMonths*30, as.Date(NA))   # 25% cancellations
Region <- sample(c("North", "South", "East", "West"), n, replace = TRUE)
SubscriptionType <- sample(c("Basic", "Standard", "Premium"), n, replace = TRUE, prob=c(0.4, 0.35, 0.25))
MonthlyFee <- ifelse(SubscriptionType == "Basic", 10,
                     ifelse(SubscriptionType == "Standard", 20, 30))
TotalStreams <- round(rnorm(n, mean=150, sd=60))
DeviceType <- sample(c("Mobile", "Smart TV", "Laptop", "Tablet"), n, replace = TRUE)
PaymentMethod <- sample(c("Card", "Online Wallet", "NetBanking"), n, replace = TRUE)
stream_data <- data.frame(CustomerID, JoinDate, CancelDate, Region, SubscriptionType, MonthlyFee,
                          ActiveMonths, TotalStreams, DeviceType, PaymentMethod)
stream_data$Revenue <- stream_data$MonthlyFee * stream_data$ActiveMonths
#head(stream_data)
#write.csv(stream_data, "stream_data.csv", row.names = FALSE)  #simulated dataset
getwd()




Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Importing CSV using read.csv
data1 <- read.csv("stream_data.csv", header=TRUE, stringsAsFactors=TRUE)
head(data1)
str(data1)

#Task 1 - Preprocessing

#check & handle missing value
na_counts <- colSums(is.na(data1))
print(na_counts) #check no. of missing value in each attribute

data1$CancelDate <- as.Date(as.character(data1$CancelDate))
data1$CancelDate[is.na(data1$CancelDate)] <- as.Date("2030-12-31") #replace NA with future date #change data type to Date
na_counts <- colSums(is.na(data1))
print(na_counts) #check no. of missing value


#convert data type
str(data1)
data1$JoinDate <- as.Date(data1$JoinDate, origin = "1970-01-01")  #change the data type to Date


#create new variables
data1$IsActive <- data1$CancelDate == as.Date("2030-12-31")  #TRUE if CancelDate=NA which have been replaced by FutureDate
library(lubridate)
data1$MonthJoined <- month(data1$JoinDate, label = TRUE, abbr = TRUE)  #return Months in Mmm format in order.

#head(data1)
str(data1)

#Task 2 - Business Metric Calculations
#Total Revenue

#Average Revenue per User (ARPU)

#Revenue by Subscription Type

#Churn Rate - % of customers who cancelled during the year.

#Regional Revenue - Total revenue by region.

#Average Engagement (Streams per Active Month) - Average number of videos watched per month by customers.

#Monthly Join Trend - Number of new customers joining per month.

#Device Usage Breakdown - Most common devices used

#Task 3 - Visualization
# Revenue by Subscription Type
# Revenue by Region
# Monthly Join Trend
# Device Usage
# TotalStreams distribution (to show engagement)


“cannot open file 'stream_data.csv': No such file or directory”


ERROR: Error in file(file, "rt"): cannot open the connection


# 2. Business Metric Calculations

In [3]:
## (1.) Total Revenue - Total Revenue generated during the year.

TotalRevenue <- sum(stream_data$Revenue, na.rm = TRUE)
TotalRevenue


In [4]:
## (2.) Average Revenue per User (ARPU) – Average revenue per customer.

ARPU <- mean(stream_data$Revenue, na.rm = TRUE)
ARPU

In [5]:
## (3.) Revenue by Subscription Type - Compare revenue generated by Basic,
## Standard, and Premium Plans.

RevenueByType <- aggregate(Revenue ~ SubscriptionType,
                           data = stream_data, sum)
RevenueByType



SubscriptionType,Revenue
<chr>,<dbl>
Basic,52400
Premium,96660
Standard,88440


In [6]:
## (4.) Churn Rate - % of customers who cancelled during the year.

ChurnRate <- sum(!is.na(stream_data$CancelDate)) / nrow(stream_data)
ChurnRate

In [7]:
## (5.) Regional Revenue - Total revenue by region.

RegionalRevenue <- aggregate(Revenue ~ Region,
                             data = stream_data, sum)
RegionalRevenue

Region,Revenue
<chr>,<dbl>
East,62620
North,57790
South,59700
West,57390


In [8]:
## (6.) Average Engagement (Streams per Active Month) - Average number of videos
## watched per month by customers.

stream_data$Engagement <- stream_data$TotalStreams / stream_data$ActiveMonths
AvgEngagement <- mean(stream_data$Engagement, na.rm = TRUE)
AvgEngagement

In [9]:
## (7.) Monthly Join Trend - Number of new customers joining per months.

stream_data$MonthJoined <- format(stream_data$JoinDate, "%m")
MonthlyJoinTrend <- table(stream_data$MonthJoined)
MonthlyJoinTrend


 01  02  03  04  05  06  07  08  09  10  11  12 
172 154 169 169 180 147 166 171 169 180 163 160 

In [10]:
## (8.) Device Usage Breakdown - Most common devices used to access the
## platform.

DeviceUsage <- table(stream_data$DeviceType)
DeviceUsage


  Laptop   Mobile Smart TV   Tablet 
     497      513      487      503 