In [None]:
## Churn prediction for high-net-worth prepaid customers, using a usage-based churn definition

# imports 
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start one for this notebook.
spark = SparkSession.builder.appName('sumad_test').getOrCreate()

from pyspark.sql import functions as F

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
#from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
%matplotlib notebook 
plt.style.use('bmh')

## 1. Data Preparation
#### Read Data, Data Quality Check

In [None]:
#### had to pull data to local file system and then read

# CSV reader: first line is the header, column types are inferred from the data.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)

# Load the churn file and reduce it to 5 partitions.
df_raw = csv_reader.load("/user/sumad/telecom_churn_data.csv").coalesce(5)

In [None]:
# Number of rows in the raw data (count() is a Spark action, so this runs a job).
df_raw.count()

# Number of columns in the raw data.
len(df_raw.columns)

In [None]:
# Bring the first five rows to the driver as a pandas frame and save them
# next to the notebook for a quick look at the columns.
first_rows = df_raw.limit(5)
df_sample = first_rows.toPandas()

df_sample.to_csv('sample_telco.csv')

#### Filter high networth individuals

In [None]:
# Average recharge amount over months 6 and 7.
avg_rech_expr = (F.col('total_rech_amt_6') + F.col('total_rech_amt_7')) / 2
df_raw_1 = df_raw.withColumn('avg_rech_months_6_7', avg_rech_expr)

# summary('70%') returns one row: (statistic label, 70th percentile as a string).
th_amt = df_raw_1.select('avg_rech_months_6_7').summary('70%').collect()
hni_threshold = float(th_amt[0][1])

# Flag customers at or above the threshold and keep only those rows.
hni_flag = F.when(F.col('avg_rech_months_6_7') >= hni_threshold, 1).otherwise(0)
df_raw_2 = df_raw_1.withColumn('HNI_Ind', hni_flag).filter(F.col('HNI_Ind') == 1)
df_raw_2.count()

In [None]:
#### Create churn indicator

# Churn = 1 only when every one of these month-9 usage columns is exactly 0;
# any other case (including a null in one of them) falls through to 0.
month_9_usage_cols = ["total_ic_mou_9", "total_og_mou_9", "vol_2g_mb_9", "vol_3g_mb_9"]

no_usage_in_month_9 = F.col(month_9_usage_cols[0]) == 0
for usage_col in month_9_usage_cols[1:]:
    no_usage_in_month_9 = no_usage_in_month_9 & (F.col(usage_col) == 0)

df_raw_3 = df_raw_2.withColumn('Churn', F.when(no_usage_in_month_9, 1).otherwise(0))

# Class balance of the new label.
df_raw_3.groupBy('Churn').count().show()


In [None]:
##### Churn rate of 9% in overall data can be seen

#### Drop all month 9 variables

# The label was built from month-9 usage, so month-9 columns are removed
# before modelling.
# NOTE(review): the match is "name ends with '9'", which would also catch a
# name ending in e.g. '19' and would miss a month-9 column named differently
# - confirm against the actual column list.
all_cols = df_raw_3.columns
month_9_cols = [name for name in all_cols if name.endswith('9')]
print(len(all_cols))
print(len(month_9_cols))

cols_to_keep_1 = [name for name in all_cols if name not in month_9_cols]
df_raw_4 = df_raw_3.select(cols_to_keep_1)

len(df_raw_4.columns)

In [None]:
#### Data Quality Check

# Identifier columns and the label belong to neither feature group.
id_cols = ['mobile_number', 'circle_id']
target_col = 'Churn'
excluded_cols = id_cols + [target_col]

# Columns with 'date' in their name go into cat_cols; every other
# non-excluded column is treated as numeric.
cat_cols = [name for name in df_raw_4.columns
            if 'date' in name and name not in excluded_cols]
num_cols = [name for name in df_raw_4.columns
            if name not in cat_cols + excluded_cols]

print(len(cat_cols))
print(len(num_cols))

# Spark summary statistics, collected to pandas; the numeric summary is also
# written to disk.
num_summary = df_raw_4.select(num_cols).summary().toPandas()
num_summary.to_csv('num_summary.csv')

cat_summary = df_raw_4.select(cat_cols).summary().toPandas()
cat_summary

#### Fix data issues

In [None]:
#### Remove numerical variables that 
- have any missing values 
- have 0 standard deviation 
#### Remove all date columns

num_summary.head()

# Index by statistic name so each statistic can be read as a row.
num_summary_ = num_summary.set_index('summary')

num_summary_.head()

# summary() hands every statistic back as a string, so convert to numbers
# before comparing (unparseable values become NaN and fail both checks).
stddev_by_col = pd.to_numeric(num_summary_.loc['stddev'], errors='coerce')
count_by_col = pd.to_numeric(num_summary_.loc['count'], errors='coerce')

# A column has no missing values when its non-null count equals the row count.
n_rows = df_raw_4.count()

# Keep a column only if it is fully populated AND has non-zero spread.
# (The two conditions must be combined with "and": combining them with "or"
# keeps constant columns and columns with missing values.)
mask = (stddev_by_col > 0) & (count_by_col == n_rows)

sum(mask)

num_cols_sub = list(num_summary_.columns[mask].values)

def drop_null_columns(df):
    """
    Return ``df`` without the columns that contain at least one null.

    :param df: A PySpark DataFrame
    :return: A DataFrame with only the columns of ``df`` that have no nulls
    """
    # Single aggregation: for every column, count the rows where it is null.
    null_count_exprs = []
    for name in df.columns:
        is_null_marker = F.when(F.col(name).isNull(), name)
        null_count_exprs.append(F.count(is_null_marker).alias(name))
    counts_row = df.select(null_count_exprs).collect()[0]

    columns_with_nulls = [name for name, n_null in counts_row.asDict().items() if n_null > 0]
    return df.drop(*columns_with_nulls)

# Keep only the columns of df_raw_4 that have no null values.
final_df = drop_null_columns(df_raw_4)

len(final_df.columns)

#### 2. Feature Engineering 

In [None]:
# Engineered feature: ARPU of months 6 and 7 added together (a sum, not an average).
final_df = final_df.withColumn('arpu_6_7',
                               (F.col('arpu_6') + F.col('arpu_7')))

# Model inputs: columns that survived the null filter, passed the numeric
# quality mask, and are neither identifiers nor the label.
# NOTE(review): num_cols_sub was derived before 'arpu_6_7' existed, so the new
# column is not selected here - decide whether it should be a model input.
non_feature_cols = id_cols + [target_col]
final_features_1 = [c for c in final_df.columns
                    if c not in non_feature_cols and c in num_cols_sub]

# Expose the list under both names used in this notebook; previously
# `final_features` was referenced without ever being assigned.
final_features = final_features_1
len(final_features)

In [None]:
#### Split into training and test set 

from pyspark.ml.feature import VectorAssembler

# Pack the selected columns into the single vector column Spark ML expects.
# `final_features_1` is the list built in the feature-engineering step, and
# `final_df` is the frame that list was derived from.
assembler = VectorAssembler(inputCols=final_features_1, outputCol="features")
output = assembler.transform(final_df).select('features', 'Churn')
output.show(5)

# 70/30 train/test split; the fixed seed makes reruns produce the same split.
train_data, test_data = output.randomSplit([0.7, 0.3], seed=42)

In [None]:
#### 3. Model Training  
- Model categories to be considered  
- Handling class imbalance 
- Parameter tuning using cross-validation, and choice of evaluation metric

# Evaluators used for model selection (default metric: area under the ROC curve).
# They read the continuous `rawPrediction` column produced by the classifiers.
# Scoring the hard 0/1 `prediction` column instead collapses the ROC curve to a
# single operating point and understates how well a model ranks customers.
dtc_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='Churn')
reg_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='Churn')


log_reg = LogisticRegression(featuresCol='features', labelCol='Churn')

# Candidate regularisation strengths for the logistic regression.
paramGrid_reg = ParamGridBuilder()\
    .addGrid(log_reg.regParam, [0.1, 0.01]) \
    .build()

# 5-fold cross-validation over the grid, fitting two candidates at a time.
crossval_reg = CrossValidator(estimator=log_reg,
                              estimatorParamMaps=paramGrid_reg,
                              evaluator=reg_eval,
                              numFolds=5, parallelism=2)
cvModel_reg = crossval_reg.fit(train_data)


In [None]:
# Decision tree on the assembled feature vector, with 'Churn' as the label.
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='Churn')

# Candidate tree depths to compare.
depth_grid_builder = ParamGridBuilder().addGrid(dtc.maxDepth, [2, 5])
paramGrid_dtc = depth_grid_builder.build()

# 5-fold cross-validation over the depth grid, fitting two candidates at a time.
crossval_dtc = CrossValidator(
    estimator=dtc,
    estimatorParamMaps=paramGrid_dtc,
    evaluator=dtc_eval,
    numFolds=5,
    parallelism=2,
)
cvModel_dtc = crossval_dtc.fit(train_data)

In [None]:
#### 4. Model Evaluation on test
- Use a single metric 
- Visualize where each model performs better

# Score the held-out test set with the best model found by each cross-validation.
result_dtc = cvModel_dtc.transform(test_data)
result_reg = cvModel_reg.transform(test_data)

# Area under the ROC curve (the evaluator's default metric), computed from the
# continuous `rawPrediction` column rather than the hard 0/1 `prediction`.
dtc_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='Churn')
reg_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='Churn')

# Test-set AUC for the logistic regression and the decision tree, in that order.
AUC_reg = reg_eval.evaluate(result_reg)
AUC_dtc = dtc_eval.evaluate(result_dtc)
print(AUC_reg, AUC_dtc)