### More ad-hoc analysis using SQL, followed by Retention and Churn Rate analytics (also in SQL):

In [0]:
# Loading required libraries if needed
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import to_date, current_timestamp

#### User table analytics:
##### 1. Language + platform matrix: finds language-platform affinity patterns

In [0]:
%sql
-- Work against the `gold` schema of the `inshort_cata` catalog.
USE CATALOG inshort_cata;
USE SCHEMA gold;

-- Count active users for every (language, platform) pair, largest group first.
WITH active_users AS (
    SELECT lang, platform
    FROM users
    WHERE is_active = TRUE
)
SELECT
    lang,
    platform,
    COUNT(*) AS users
FROM active_users
GROUP BY lang, platform
ORDER BY users DESC;

lang,platform,users
en,ANDROID,28338
en,IOS,7463
hi,ANDROID,6504
hi,IOS,203


##### 2. Temporal and Seasonal Analytics: Identifies peak install times

In [0]:
%sql
-- Active-user installs bucketed by weekday and hour of install_dt.
-- DAYOFWEEK numbers the days 1 = Sunday through 7 = Saturday.
-- Rows are listed hour by hour, then by weekday (not by install volume).
WITH install_slots AS (
    SELECT
        DAYOFWEEK(install_dt) AS day_of_week,
        HOUR(install_dt)      AS install_hour
    FROM users
    WHERE is_active = TRUE
)
SELECT
    day_of_week,
    install_hour,
    COUNT(*) AS installs
FROM install_slots
GROUP BY day_of_week, install_hour
ORDER BY install_hour, day_of_week

day_of_week,install_hour,installs
1,0,5392
2,0,6836
3,0,7313
4,0,7222
5,0,5235
6,0,5358
7,0,5152


##### 3. Monthly Growth Rate (MoM growth)

In [0]:
%sql
-- Month-over-month growth of active-user installs.
--   month         : install month (install_dt truncated to the first of the month)
--   monthly_users : active users installed in that month
--   prev_month    : monthly_users of the preceding row when ordered by month
--   growth_pct    : percentage change versus prev_month, rounded to 2 decimals
WITH monthly_installs AS (
    -- One row per install month.
    SELECT
        DATE_TRUNC('month', install_dt) AS month,
        COUNT(*)                        AS monthly_users
    FROM users
    WHERE is_active = TRUE
    GROUP BY DATE_TRUNC('month', install_dt)
),
with_previous AS (
    -- Compute the previous month's count once instead of repeating the window expression.
    SELECT
        month,
        monthly_users,
        LAG(monthly_users, 1) OVER (ORDER BY month) AS prev_month
    FROM monthly_installs
)
SELECT
    month,
    monthly_users,
    prev_month,
    -- prev_month is NULL for the earliest month; NULL propagates through the
    -- arithmetic and ROUND, so growth_pct is NULL there without an explicit CASE.
    ROUND(((monthly_users - prev_month) * 100.0) / prev_month, 2) AS growth_pct
FROM with_previous
ORDER BY month;

-- NOTE: prev_month and growth_pct are NULL in the current output because the data covers only one month so far.

month,monthly_users,prev_month,growth_pct
2024-01-01T00:00:00.000Z,42508,,


##### 4. User Segmentation and Profiling:

In [0]:
%sql
-- Segment active users by whether a district is recorded, then break the
-- segments down by platform and language.
--   user_segment   : 'Nationwide' when district is NULL or 'N/A', otherwise 'Regional'
--   users          : number of active users in the group
--   campaigns_used : number of distinct campaign_id values in the group
WITH segmented_users AS (
    -- Define the segment label once so SELECT and GROUP BY cannot drift apart.
    SELECT
        CASE
            WHEN district IS NULL OR district = 'N/A' THEN 'Nationwide'
            ELSE 'Regional'
        END AS user_segment,
        platform,
        lang,
        campaign_id
    FROM users
    WHERE is_active = TRUE
)
SELECT
    user_segment,
    platform,
    lang,
    COUNT(*)                    AS users,
    COUNT(DISTINCT campaign_id) AS campaigns_used
FROM segmented_users
GROUP BY user_segment, platform, lang
ORDER BY users DESC

user_segment,platform,lang,users,campaigns_used
Nationwide,ANDROID,en,27240,8
Nationwide,IOS,en,7253,3
Regional,ANDROID,hi,5908,6
Regional,ANDROID,en,1098,5
Nationwide,ANDROID,hi,596,4
Regional,IOS,en,210,0
Nationwide,IOS,hi,133,1
Regional,IOS,hi,70,0


##### 5. High-Value User Profiles:

In [0]:
%sql
-- Per platform/language group of active users:
--   avg_age_days  : mean number of days between install_dt and today
--   geo_diversity : number of distinct district values
-- avg_age_days is measured from current_date(), so it changes from day to day.
WITH active_user_age AS (
    SELECT
        platform,
        lang,
        district,
        DATEDIFF(current_date(), install_dt) AS age_days
    FROM users
    WHERE is_active = TRUE
)
SELECT
    platform,
    lang,
    AVG(age_days)            AS avg_age_days,
    COUNT(DISTINCT district) AS geo_diversity
FROM active_user_age
GROUP BY platform, lang
ORDER BY avg_age_days DESC;

platform,lang,avg_age_days,geo_diversity
ANDROID,hi,684.7948954489544,501
ANDROID,en,683.7903521772885,295
IOS,en,683.3663406136942,56
IOS,hi,682.5911330049261,45


#### Content table based analytics:

##### 1. Content Publishing Analytics:

In [0]:
%sql
-- Number of active articles per calendar month number of createdAt.
-- NOTE(review): MONTH(...) yields only the month number, so the same month of
-- different years is pooled into one row; the DATE_TRUNC wrapper presumably
-- does not change the result. Confirm whether a per-year-month series was intended.
WITH article_months AS (
    SELECT MONTH(DATE_TRUNC('month', createdAt)) AS publish_month
    FROM content
    WHERE is_active = TRUE
)
SELECT
    publish_month,
    COUNT(*) AS articles_published
FROM article_months
GROUP BY publish_month
ORDER BY publish_month;

publish_month,articles_published
1,24227
2,25852
3,27573
4,26633
5,457
6,405
7,462
8,466
9,500
10,549


In [0]:
%sql
-- Peak publishing hours: active articles per hour-of-day of createdAt,
-- busiest hour first.
WITH article_hours AS (
    SELECT HOUR(createdAt) AS publish_hour
    FROM content
    WHERE is_active = TRUE
)
SELECT
    publish_hour,
    COUNT(*) AS articles
FROM article_hours
GROUP BY publish_hour
ORDER BY articles DESC

publish_hour,articles
11,11686
10,10463
9,9192
8,8748
12,8578
6,8519
7,8088
5,6745
13,6730
14,6176


##### 2. Language Distribution Analytics

In [0]:
%sql
-- Active articles and distinct contributing authors per news language,
-- most-published language first.
WITH active_content AS (
    SELECT newsLanguage, author
    FROM content
    WHERE is_active = TRUE
)
SELECT
    newsLanguage,
    COUNT(*)               AS articles,
    COUNT(DISTINCT author) AS authors_contributing
FROM active_content
GROUP BY newsLanguage
ORDER BY articles DESC;

newsLanguage,articles,authors_contributing
english,87905,420
hindi,25437,49
,178,45


##### 3. Content ID Pattern Analysis:

In [0]:
%sql
-- How many active content rows share the same first 8 characters of _id,
-- most frequent prefix first.
WITH content_prefixes AS (
    SELECT SUBSTRING(_id, 1, 8) AS id_prefix
    FROM content
    WHERE is_active = TRUE
)
SELECT
    id_prefix,
    COUNT(*) AS content_count
FROM content_prefixes
GROUP BY id_prefix
ORDER BY content_count DESC;

-- Context: this query grew out of a check for duplicate ids; the recurring pattern in the id values stood out and was researched a bit further.

id_prefix,content_count
nlccbwk9,3
c0wjvc3h,3
i1nhhgfd,3
fnengdz4,3
ndk0c2wc,3
c2g9gkrr,3
avchwsws,3
yapi0xyv,3
rooewctd,3
huggr0lx,3


##### 4. Summary

In [0]:
%sql
-- One-row summary of the active content catalogue.
--   total_articles : number of active rows
--   total_authors  : distinct author values
--   languages      : distinct newsLanguage values (COUNT(DISTINCT ...) ignores NULLs)
--   first_article / last_article : earliest and latest createdAt
--   avg_age_days   : mean days between createdAt and today (varies with the run date)
WITH active_content AS (
    SELECT
        author,
        newsLanguage,
        createdAt,
        DATEDIFF(current_date(), createdAt) AS age_days
    FROM content
    WHERE is_active = TRUE
)
SELECT
    COUNT(*)                     AS total_articles,
    COUNT(DISTINCT author)       AS total_authors,
    COUNT(DISTINCT newsLanguage) AS languages,
    MIN(createdAt)               AS first_article,
    MAX(createdAt)               AS last_article,
    AVG(age_days)                AS avg_age_days
FROM active_content;

total_articles,total_authors,languages,first_article,last_article,avg_age_days
113520,497,2,2015-07-23T18:09:57.000Z,2024-05-03T04:47:25.000Z,668.9296599718111
