<a href="https://colab.research.google.com/github/WKhisa/Machine-Learning-using-PySpark/blob/main/Machine_Learning_using_PySpark_IPP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Install Pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=ac001e60db545cb1d126a260a6a833048ab5c52840dc7579a6ee51d96b69b397
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [2]:
#load prerequisite libraries
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import MinMaxScaler, VectorAssembler

In [6]:
#Create Spark session
spark = SparkSession.builder.appName("telecomdata").getOrCreate()

In [12]:
# Download, upload and load dataset
df = spark.read.csv('telecom_dataset.csv',header=True, inferSchema=True)
df.show()

+----------+------+---+--------------+--------------+------------+-----+
|CustomerID|Gender|Age|      Contract|MonthlyCharges|TotalCharges|Churn|
+----------+------+---+--------------+--------------+------------+-----+
|         1|Female| 25|Month-to-Month|          65.7|       156.5|   No|
|         2|  Male| 37|      One Year|          89.0|      2356.8|   No|
|         3|  Male| 52|      Two Year|         115.5|      5408.6|   No|
|         4|Female| 30|Month-to-Month|          75.9|       129.4|  Yes|
|         5|  Male| 45|      One Year|          98.2|      3142.0|   No|
|         6|Female| 55|      Two Year|          99.9|      6541.5|   No|
|         7|  Male| 32|Month-to-Month|          82.1|       267.7|  Yes|
|         8|Female| 28|Month-to-Month|          61.5|       346.9|   No|
|         9|  Male| 48|      One Year|         101.8|      5149.6|  Yes|
|        10|Female| 60|      Two Year|         108.1|      6742.8|  Yes|
|        11|  Male| 42|Month-to-Month|          78.

In [13]:
# Define the numerical columns
numerical_columns = ['Age', 'MonthlyCharges', 'TotalCharges']

# Convert string columns to numerical
for column in numerical_columns:
    df = df.withColumn(column, col(column).cast('double'))

# Assemble the numerical columns into a vector column
assembler = VectorAssembler(inputCols=numerical_columns, outputCol="features")
assembled_df = assembler.transform(df)

# Create a MinMaxScaler object
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# Fit the scaler to the data
scaler_model = scaler.fit(assembled_df)

# Transform the data
scaled_df = scaler_model.transform(assembled_df)

# View the scaled data
scaled_df.show()

+----------+------+----+--------------+--------------+------------+-----+-------------------+--------------------+
|CustomerID|Gender| Age|      Contract|MonthlyCharges|TotalCharges|Churn|           features|     scaled_features|
+----------+------+----+--------------+--------------+------------+-----+-------------------+--------------------+
|         1|Female|25.0|Month-to-Month|          65.7|       156.5|   No|  [25.0,65.7,156.5]|[0.0,0.0777777777...|
|         2|  Male|37.0|      One Year|          89.0|      2356.8|   No| [37.0,89.0,2356.8]|[0.34285714285714...|
|         3|  Male|52.0|      Two Year|         115.5|      5408.6|   No|[52.0,115.5,5408.6]|[0.77142857142857...|
|         4|Female|30.0|Month-to-Month|          75.9|       129.4|  Yes|  [30.0,75.9,129.4]|[0.14285714285714...|
|         5|  Male|45.0|      One Year|          98.2|      3142.0|   No| [45.0,98.2,3142.0]|[0.57142857142857...|
|         6|Female|55.0|      Two Year|          99.9|      6541.5|   No| [55.0,