<img src="https://www.teradata.com/Teradata/Images/Rebrand/Teradata_logo-two_color.png" alt="Teradata" width="400" align="right"/>

<br/>

# TELCO CHURN DEMO
# **PART 2: MODELLING (NAIVE BAYES)** 

In [1]:
%lsconnect

No connections defined. Use %addconnect to define a connection.


In [64]:
%connect demo

Success: 'demo' connection established


#### 1. Split data into training and testing sets

In [67]:
-- Drop the existing training table so the CREATE TABLE in the next cell can be re-run.
DROP TABLE TelcoChurn.model_dataset_train;

Success: 17 rows affected

In [68]:
-- Training split: a random sample of rows from the raw modelling dataset.
-- SAMPLE 0.7 asks for a 0.7 fraction of the rows returned by the query.
-- NOTE(review): the sample is taken over rows, not over customerid values, so
-- one customer's token rows may land in both train and test -- confirm intended.
CREATE MULTISET TABLE TelcoChurn.model_dataset_train AS (
    SELECT
        customerid,
        token,
        category
    FROM TelcoChurn.model_dataset_raw
    SAMPLE 0.7
) WITH DATA;

Success: 0 rows affected

In [69]:
-- Row count of the training split.
SELECT COUNT(*) FROM TelcoChurn.model_dataset_train;

Count(*)
753184


##### Note: The EXCEPT statement used to create the remaining TEST data needs a large amount of spool space, so the SPOOL allotment must be increased by the DBC user for each user who runs it.

In [75]:
-- Disabled: the statement below is wrapped in a block comment, so running this
-- cell leaves TelcoChurn.model_dataset_test untouched.
/* DROP TABLE TelcoChurn.model_dataset_test; */

Success: 0 rows affected

In [76]:
-- Disabled: the whole CREATE TABLE below sits inside a block comment, so this
-- cell does not rebuild TelcoChurn.model_dataset_test.
-- When enabled, the test split is every raw row that is not in the training table.
-- NOTE(review): EXCEPT without ALL returns distinct rows, so duplicate raw rows
-- would be collapsed in the test split -- confirm this is intended.
-- NOTE(review): SELECT * on both sides assumes model_dataset_raw has the same
-- columns as model_dataset_train (customerid, token, category) -- verify.
/* CREATE MULTISET TABLE TelcoChurn.model_dataset_test
AS (
    SELECT *
    FROM TelcoChurn.model_dataset_raw
    EXCEPT
    SELECT *
    FROM TelcoChurn.model_dataset_train)
WITH DATA;
*/

Success: 0 rows affected

In [77]:
-- Row count of the test split.
SELECT COUNT(*) FROM TelcoChurn.model_dataset_test;

Count(*)
255660


#### 2a. Perform Naive Bayes on associated text tokens for known PATHS (TRAINING TABLE ONLY)

In [78]:
-- Drop the existing model table so the CREATE TABLE in the next cell can be re-run.
DROP TABLE TelcoChurn.csi_telco_churn_model;

Success: 17 rows affected

In [79]:
-- Train the Naive Bayes text model on the training split and store it as a table.
--   Inner call: NaiveBayesTextClassifierInternal reads the training rows,
--               partitioned by category.
--   Outer call: NaiveBayesTextClassifierTrainer consumes that result in a
--               single partition (PARTITION BY 1).
CREATE MULTISET TABLE TelcoChurn.csi_telco_churn_model AS (
    SELECT *
    FROM NaiveBayesTextClassifierTrainer (
        ON (
            SELECT *
            FROM NaiveBayesTextClassifierInternal (
                ON (
                    SELECT *
                    FROM TelcoChurn.model_dataset_train
                ) AS "input"
                PARTITION BY category
                USING
                    TokenColumn ('token')
                    ModelType ('Bernoulli')
                    DocIDColumns ('customerid')
                    DocCategoryColumn ('category')
            ) AS token_stats
        )
        PARTITION BY 1
    ) AS nb_model
) WITH DATA;

Success: 0 rows affected

#### 2b.  Perform the Naive Bayes PREDICT function using the MLE on the TEST data

In [80]:
-- Drop the existing prediction table so the CREATE TABLE in the next cell can be re-run.
DROP TABLE TelcoChurn.csi_telco_churn_predict_test;

Success: 17 rows affected

In [81]:
-- Score the test split with the trained model and persist the predictions.
--   * Test rows are partitioned by customerid; the model table is passed as a
--     DIMENSION input.
--   * ModelType is 'Bernoulli', the same value used when the model was trained.
--   * TopK (2) is requested; the preview of this table shows two rows per
--     customerid (CHURN and NON CHURN), each with its own loglik.
-- MULTISET is stated explicitly to match the other tables in this notebook:
-- a bare CREATE TABLE can default to a SET table (depending on session mode),
-- which adds duplicate-row checking on load.
CREATE MULTISET TABLE TelcoChurn.csi_telco_churn_predict_test AS (
    SELECT *
    FROM NaiveBayesTextClassifierPredict@coprocessor (
        ON (
            SELECT *
            FROM TelcoChurn.model_dataset_test
        ) AS predicts
        PARTITION BY customerid
        ON TelcoChurn.csi_telco_churn_model AS model DIMENSION
        USING
            InputTokenColumn ('token')
            ModelType ('Bernoulli')
            DocIDColumns ('customerid')
            TopK (2)
    ) AS dt
) WITH DATA;

Success: 0 rows affected

In [82]:
-- Peek at 10 prediction rows. There is no ORDER BY, so which 10 rows come back
-- is not fixed.
SELECT TOP 10 *
FROM TelcoChurn.csi_telco_churn_predict_test;

customerid,prediction,loglik
558168,NON CHURN,-7.749040059469703
673933,CHURN,-18.79156531806624
673933,NON CHURN,-7.8751827699268
493913,NON CHURN,-4.027077821614265
378617,CHURN,-17.273757728478063
378617,NON CHURN,-4.4731268381743305
493913,CHURN,-15.29505775659976
558168,CHURN,-13.75461271565261
417117,CHURN,-14.560237879639248
417117,NON CHURN,-4.030388153492816


#### 2c. Build the results view (predictions joined to actual categories) used by the confusion matrix

In [2]:
-- Results view: pairs each prediction row with the actual category from the
-- test split, exposing the columns customerid, prediction, loglik, actual.
-- Fixes over the previous version:
--   * REPACE -> REPLACE (the misspelled keyword made the statement unparseable).
--   * The view and its source tables are qualified with TelcoChurn, because the
--     ConfusionMatrix cell reads TelcoChurn.csi_telco_churn_results and should
--     not depend on the session's default database.
-- NOTE(review): the prediction table holds up to two rows per customerid
-- (TopK (2)) and the test table appears to hold one row per token, so this join
-- can return many rows per customer -- confirm whether the confusion matrix
-- should instead use one (top prediction, actual) row per customerid.
REPLACE VIEW TelcoChurn.csi_telco_churn_results AS
    SELECT
        P.customerid,
        P.prediction,
        P.loglik,
        T.category AS actual
    FROM TelcoChurn.csi_telco_churn_predict_test AS P
    INNER JOIN TelcoChurn.model_dataset_test AS T
        ON P.customerid = T.customerid;

ERROR: No active connection

#### 3. Create a confusion matrix to evaluate Naive Bayes model performance

In [89]:
-- Drop the three tables that the ConfusionMatrix call in the next cell names as
-- its OUT TABLE targets (CountTable, StatTable, AccuracyTable).
DROP TABLE TelcoChurn.count_output;
DROP TABLE TelcoChurn.stat_output;
DROP TABLE TelcoChurn.acc_output;

Success: 16 rows affected

In [92]:
-- Run ConfusionMatrix over the results view in a single partition, comparing
-- the 'actual' column (observed) with the 'prediction' column (predicted).
-- Results are written to the three OUT TABLEs named below.
SELECT *
FROM ConfusionMatrix (
    ON TelcoChurn.csi_telco_churn_results PARTITION BY 1
    OUT TABLE CountTable (TelcoChurn.count_output)
    OUT TABLE StatTable (TelcoChurn.stat_output)
    OUT TABLE AccuracyTable (TelcoChurn.acc_output)
    USING
        ObsColumn ('actual')
        PredictColumn ('prediction')
) AS dt;

message
Success !
The result has been outputted to output tables


In [93]:
-- Show the table passed to ConfusionMatrix as CountTable.
SELECT *
FROM TelcoChurn.count_output;

observation,0,1
1,21,17
0,153387,0


In [94]:
-- Show the table passed to ConfusionMatrix as StatTable.
SELECT *
FROM TelcoChurn.stat_output;

key,value
95% CI,"(0.9998, 0.9999)"
P-Value [Acc > NIR],0.0019
Mcnemar Test P-Value,0
Accuracy,0.9999
Null Error Rate,0.0002
Kappa,0.9665


In [95]:
-- Show the table passed to ConfusionMatrix as AccuracyTable.
SELECT *
FROM TelcoChurn.acc_output;

measure,0,1
Specificity,0.4474,1.0
Neg Pred Value,1.0,0.9999
Detection Rate,0.9998,0.0001
Balanced Accuracy,0.7237,0.7237
Sensitivity,1.0,0.4474
Pos Pred Value,0.9999,1.0
Prevalence,0.9998,0.0002
Detection Prevalence,0.9999,0.0001
