## ETL (Extract, Transform, Load) + Data Cleaning

In [2]:
%pip install pyspark

Collecting pyspark
  Using cached pyspark-3.5.5.tar.gz (317.2 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py4j==0.10.9.7 (from pyspark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): still running...
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.5.5-py2.py3-none-any.whl size=317747965 sha256=f29fefcc93cc0273be420be3fa927d9b83e4c9b60f3ba3d55582b390d3ceb8bb
  Stored in directory: c:\users\berat\appdata\local\pip\cache\wheels\8f\cb\c0\cc57eb1bf0f9dc87cdaf2b0dbac49e58a210ff68d21d6fc709
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.5
Note: you may need to restart 

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

### Start SparkSession

In [4]:
# Obtain a SparkSession for this notebook, registered under the
# application name "ChurnETL".
spark = (
    SparkSession.builder
    .appName("ChurnETL")
    .getOrCreate()
)

### Load the dataset

In [5]:
# Read the Telco churn CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
)

### Clean TotalCharges column

In [12]:
# Single-space entries in TotalCharges stand in for missing values; turn them
# into real nulls first so the numeric cast below never sees a non-numeric
# string.
df = df.replace(" ", None, subset=["TotalCharges"])

# Cast to double (64-bit) rather than float (32-bit): single precision cannot
# represent monetary totals exactly enough for later aggregation. Casting a
# null yields null, so no explicit null guard is needed around the cast.
df = df.withColumn("TotalCharges", col("TotalCharges").cast("double"))

# Drop every row that still contains a null in any column; this includes the
# rows whose TotalCharges was blank.
df = df.na.drop()

### Create binary label column from Churn

In [13]:
# Encode the target as an integer: 1 when Churn equals "Yes", 0 otherwise.
churn_flag = when(col("Churn") == "Yes", 1).otherwise(0)
df = df.withColumn("label", churn_flag)

### Drop irrelevant columns

In [14]:
# Remove the customer identifier and the raw Churn column from the frame.
df = df.drop("customerID").drop("Churn")

### Save cleaned DataFrame to CSV / Parquet (currently disabled)

In [17]:
# Export is currently disabled: both writes below are commented out, so this
# cell does nothing when run. Uncomment one of them to persist `df`
# (mode("overwrite") replaces any existing output at that path).
# df.write.mode("overwrite").csv("cleaned_churn.csv", header=True)
# df.write.mode("overwrite").parquet("cleaned_churn.parquet")

### Preview the cleaned DataFrame

In [18]:
# Print the first five rows of the cleaned frame as a quick sanity check.
df.show(n=5)

+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|label|
+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|Female|            0|    Yes|        No|     1|          No|No phone service|            DSL|            No|         Yes|              No|         No|         No|             No|Month-to-month|     

### Stop the SparkSession

In [19]:
# Shut down the SparkSession now that the ETL steps are finished.
spark.stop()