In [1]:
import findspark

# Point findspark at the local Spark installation before any pyspark import.
SPARK_INSTALL_DIR = '/usr/local/spark'
findspark.init(SPARK_INSTALL_DIR)

In [60]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

import os
from pyspark.sql.functions import isnull, count, log10, col
import pyspark.sql.functions as F
import numpy as np

# Defining Spark Session and Context objects

In [3]:
# Build the session first so the application name is part of the configuration
# the SparkContext is created with. Creating a bare SparkContext beforehand
# makes the builder reuse that already-running context, which keeps its default
# application name.
spark = SparkSession.builder.appName('ckpt2_spark').getOrCreate()

# The context that backs the session (the same object getOrCreate() would return).
sc = spark.sparkContext

# Loading Data into Spark Session object

In [4]:
# Resolve the data directory relative to the notebook's working directory.
main_data_dir_path = os.path.abspath('Output_telecomData')
subdirectory_name = 'WithNaNs'

data_dir_path = os.path.join(main_data_dir_path, subdirectory_name)
assert os.path.isdir(data_dir_path), 'Data directory not found: ' + data_dir_path

# os.listdir() gives no ordering guarantee, so sort case-insensitively to keep
# positional indexing into datafile_paths stable across machines
# (account info, churn, demographics, services). Non-CSV entries such as
# checkpoint folders are skipped so they cannot shift the positions.
datafile_names = sorted(
    (name for name in os.listdir(data_dir_path) if name.lower().endswith('.csv')),
    key=str.lower,
)
datafile_paths = [os.path.join(data_dir_path, datafile) for datafile in datafile_names]

datafile_paths

['/home/hduser/Deloitte_capstone_project/Output_telecomData/WithNaNs/Customer_account_info.csv',
 '/home/hduser/Deloitte_capstone_project/Output_telecomData/WithNaNs/Customer_Churn.csv',
 '/home/hduser/Deloitte_capstone_project/Output_telecomData/WithNaNs/Customer_demographics.csv',
 '/home/hduser/Deloitte_capstone_project/Output_telecomData/WithNaNs/Customer_services.csv']

In [70]:
def _read_telecom_csv(file_name):
    """Load one CSV file from ``data_dir_path`` into a Spark DataFrame.

    The file is read with a header row, comma separator and inferred column types.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir_path``.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    return spark.read.load(
        os.path.join(data_dir_path, file_name),
        format='csv', sep=',', inferSchema=True, header=True
    )


# Load every table by its file name rather than by its position in a directory
# listing: listing order is not guaranteed, and a positional mix-up would
# silently bind the wrong table to a variable.
cust_acc_df = _read_telecom_csv('Customer_account_info.csv')
cust_churn_df = _read_telecom_csv('Customer_Churn.csv')
cust_demo_df = _read_telecom_csv('Customer_demographics.csv')
cust_serv_df = _read_telecom_csv('Customer_services.csv')

In [71]:
# Print the column names and inferred types of the account-info table.
cust_acc_df.printSchema()

root
 |-- customerID: integer (nullable = true)
 |-- Tenure: double (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)



In [72]:
# Print the column names and inferred types of the churn table.
cust_churn_df.printSchema()

root
 |-- customerID: integer (nullable = true)
 |-- Churn: string (nullable = true)



In [73]:
# Print the column names and inferred types of the demographics table.
cust_demo_df.printSchema()

root
 |-- customerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- SeniorCitizen: double (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)



In [74]:
# Print the column names and inferred types of the services table.
cust_serv_df.printSchema()

root
 |-- customerID: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)



In [75]:
# Number of rows in the churn table.
cust_churn_df.count()

34413

In [76]:
# Expose the account-info table to spark.sql() under the name 'acc_df'.
# NOTE(review): the view is registered from cust_acc_df as it is at this point;
# later reassignments of cust_acc_df (e.g. after imputation) are presumably not
# reflected in the view — re-register it if SQL queries should see imputed data.
cust_acc_df.createOrReplaceTempView('acc_df')

# NaN Value Treatment

In [94]:
def get_nan_count(df, col_name):
    """Count the rows of ``df`` whose ``col_name`` value compares equal to NaN.

    The comparison is built as a column expression and evaluated by Spark, not
    by Python, so it does match NaN rows (see the counts printed further down).

    Parameters
    ----------
    df : DataFrame to inspect.
    col_name : str, name of the column to check.

    Returns
    -------
    int, number of matching rows.
    """
    is_nan = df[col_name] == np.nan
    nan_rows = df.filter(is_nan)
    return nan_rows.count()

def replace_nan_in_col(df, col_name, by_value=0):
    """Return ``df`` with the NaN marker of ``col_name`` replaced by ``by_value``.

    For a column whose dtype is 'string' the marker is the literal text 'NaN';
    for every other dtype it is the floating-point NaN.

    Parameters
    ----------
    df : DataFrame to clean.
    col_name : str, column in which the replacement is made.
    by_value : replacement value (default 0).

    Returns
    -------
    The DataFrame returned by ``df.replace``.

    Raises
    ------
    KeyError if ``col_name`` is not a column of ``df``.
    """
    is_string_col = dict(df.dtypes)[col_name] == 'string'
    nan_marker = 'NaN' if is_string_col else np.nan
    return df.replace(nan_marker, by_value, col_name)

def get_col_mean(df, col_name):
    """Return the mean of ``col_name`` computed over its non-NaN rows only.

    NaN rows are excluded from the average. Zero-filling them first would keep
    them in the denominator and pull the mean (and every value imputed from it)
    towards 0.

    Parameters
    ----------
    df : DataFrame holding the column.
    col_name : str, name of a numeric column.

    Returns
    -------
    float mean of the non-NaN values, or None when there are no such values.
    """
    # Only the requested column is examined; other columns are left untouched.
    valid_rows = df.filter(~F.isnan(df[col_name]))
    mean_row = valid_rows.agg({col_name: 'avg'}).collect()[0]
    return mean_row[0]

### Imputing Tenure and Monthly Charges features:
As shown in **task1.1**, tenure and monthly charges both show a normal-like distribution, with values piled up at their respective highs and lows.

<h4> Basic Treatment Strategy </h4> Mean imputation will work fine, as the data is mostly normal-like.<br>
<h4> Complex Treatment Strategy </h4> We can categorize the data into three parts: the lows, the highs, and the rest. A model such as a random forest can classify the NaNs of both features into these three categories; after classification, we can impute the highs and lows with their respective values and the rest with the mean. (<b>But I feel it's a bit of an overkill, as the NaN count is not that high.</b>)

In [79]:
# Count the NaNs of both features in one pass over the feature names.
nan_count_tenure, nan_count_monthlyc = [
    get_nan_count(cust_acc_df, feature) for feature in ('Tenure', 'MonthlyCharges')
]

print("No. of NaN values in Tenure:", nan_count_tenure)
print("No. of NaN values in MonthlyCharges:", nan_count_monthlyc)

No. of NaN values in Tenure: 16
No. of NaN values in MonthlyCharges: 13


In [80]:
# Basic strategy: replace the NaNs of each feature with that feature's mean.

# The two means are independent of each other, so compute both up front.
mean_tenure = get_col_mean(cust_acc_df, 'Tenure')
mean_monthlyc = get_col_mean(cust_acc_df, 'MonthlyCharges')

for feature_name, feature_mean in [('Tenure', mean_tenure), ('MonthlyCharges', mean_monthlyc)]:
    cust_acc_df = replace_nan_in_col(cust_acc_df, feature_name, feature_mean)

## Imputing Total Charges feature:
As TotalCharges has an exponentially decreasing distribution, we can apply a log transform to it to normalize the distribution.

<h4> Imputation Strategy: </h4> Replace the NaNs with the mean of the log-transformed feature.

In [81]:
# NaN count of TotalCharges before any treatment of that feature.
nan_count_totalc = get_nan_count(df=cust_acc_df, col_name='TotalCharges')

print("No. of NaN values in TotalCharges:", nan_count_totalc)

No. of NaN values in TotalCharges: 22


In [82]:
# Add the log10-transformed charges as a new column, then fill the NaNs of that
# new column with its own mean.
# NOTE(review): only logTotalCharges is imputed here; the raw TotalCharges
# column is left as it is.
log_total_charges = log10(col('TotalCharges'))
cust_acc_df = cust_acc_df.withColumn('logTotalCharges', log_total_charges)

mean_logtotalc = get_col_mean(cust_acc_df, 'logTotalCharges')
cust_acc_df = replace_nan_in_col(cust_acc_df, col_name='logTotalCharges', by_value=mean_logtotalc)

In [83]:
# Check that no NaN is left in the transformed column after imputation.
nan_count_logtotalc = get_nan_count(df=cust_acc_df, col_name='logTotalCharges')

print("No. of NaN values in logTotalCharges:", nan_count_logtotalc)

No. of NaN values in logTotalCharges: 0


## Imputing Contract feature:

In [88]:
# Show how often each Contract value occurs, including the 'NaN' entries.
# NOTE(review): ACC_DF is presumably the 'acc_df' temp view registered earlier
# from cust_acc_df, i.e. before the imputations above. That looks harmless here
# because Contract has not been modified yet — confirm.
spark.sql('SELECT CONTRACT, COUNT(*) CONTRACT_COUNT FROM ACC_DF GROUP BY CONTRACT').show()

+--------------+--------------+
|      CONTRACT|CONTRACT_COUNT|
+--------------+--------------+
|Month-to-month|         19693|
|      One year|          4890|
|      Two year|          9823|
|           NaN|             7|
+--------------+--------------+



In [98]:
# 'Month-to-month' is the most frequent Contract value in the counts above, so
# it is used as the replacement for the 'NaN' entries.
cust_acc_df = replace_nan_in_col(cust_acc_df, col_name='Contract', by_value='Month-to-month')