In [None]:
# Installing PySpark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, GBTClassifier, LinearSVC
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:

# Start (or reuse) the Spark session that every later cell works with
spark = (
    SparkSession.builder
    .appName("TelecomChurnPrediction")
    .getOrCreate()
)

In [None]:

# Read the telecom CSV (first row is the header, column types are inferred)
# and keep only the rows that contain no missing values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/telecom_dataset (1).csv")
    .na.drop()
)


In [None]:
# Preview the cleaned data: first 20 rows, long cell values truncated
data.show(n=20, truncate=True)

+----------+------+---+--------------+--------------+------------+-----+
|CustomerID|Gender|Age|      Contract|MonthlyCharges|TotalCharges|Churn|
+----------+------+---+--------------+--------------+------------+-----+
|         1|Female| 25|Month-to-Month|          65.7|       156.5|   No|
|         2|  Male| 37|      One Year|          89.0|      2356.8|   No|
|         3|  Male| 52|      Two Year|         115.5|      5408.6|   No|
|         4|Female| 30|Month-to-Month|          75.9|       129.4|  Yes|
|         5|  Male| 45|      One Year|          98.2|      3142.0|   No|
|         6|Female| 55|      Two Year|          99.9|      6541.5|   No|
|         7|  Male| 32|Month-to-Month|          82.1|       267.7|  Yes|
|         8|Female| 28|Month-to-Month|          61.5|       346.9|   No|
|         9|  Male| 48|      One Year|         101.8|      5149.6|  Yes|
|        10|Female| 60|      Two Year|         108.1|      6742.8|  Yes|
|        11|  Male| 42|Month-to-Month|          78.

In [None]:

# Derive two extra model features from the billing columns.
# Columns are addressed through the DataFrame itself (data["..."]) so this
# cell does not depend on a `col` helper being imported elsewhere.
#
# NOTE: "call_duration" is not computed from call timestamps; it is a
# charge-based proxy: (TotalCharges - MonthlyCharges) / 60.
data = data.withColumn(
    "call_duration",
    (data["TotalCharges"] - data["MonthlyCharges"]) / 60,
)

# "average_monthly_spend" is a straight copy of MonthlyCharges
data = data.withColumn("average_monthly_spend", data["MonthlyCharges"])

# Display the updated dataset with new features
data.show()


+----------+------+---+--------------+--------------+------------+-----+------------------+---------------------+
|CustomerID|Gender|Age|      Contract|MonthlyCharges|TotalCharges|Churn|     call_duration|average_monthly_spend|
+----------+------+---+--------------+--------------+------------+-----+------------------+---------------------+
|         1|Female| 25|Month-to-Month|          65.7|       156.5|   No|1.5133333333333332|                 65.7|
|         2|  Male| 37|      One Year|          89.0|      2356.8|   No| 37.79666666666667|                 89.0|
|         3|  Male| 52|      Two Year|         115.5|      5408.6|   No| 88.21833333333333|                115.5|
|         4|Female| 30|Month-to-Month|          75.9|       129.4|  Yes|0.8916666666666667|                 75.9|
|         5|  Male| 45|      One Year|          98.2|      3142.0|   No|50.730000000000004|                 98.2|
|         6|Female| 55|      Two Year|          99.9|      6541.5|   No|            107.

In [None]:
# Print the name, type and nullability of every column currently in `data`
data.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Contract: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)
 |-- Churn: string (nullable = true)
 |-- call_duration: double (nullable = true)
 |-- average_monthly_spend: double (nullable = true)



In [None]:
# Encoding categorical variables
# Each string column is mapped to a numeric "<column>_index" column.
# The indexers are handed to the Pipeline unfitted: the Pipeline fits them,
# so fitting each one beforehand would only scan the data a second time.
# The loop variable is deliberately not called `col`, a name commonly used
# for Spark's column function.
categorical_cols = ['Gender', 'Contract', 'Churn']
indexers = [
    StringIndexer(inputCol=name, outputCol=name + '_index')
    for name in categorical_cols
]
pipeline = Pipeline(stages=indexers)
dataset = pipeline.fit(data).transform(data)

In [None]:
# Feature assembly and scaling
# The listed columns are packed into a single 'features' vector, which is then
# passed through a StandardScaler (default settings) to give 'scaled_features'.
# NOTE(review): 'Contract_index' is not part of the feature list — confirm
# that leaving it out is intended.
feature_cols = ['Age', 'average_monthly_spend', 'call_duration', 'Gender_index']

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

dataset = assembler.transform(dataset)
scaler_model = scaler.fit(dataset)
dataset = scaler_model.transform(dataset)

In [None]:
# Randomly split the rows roughly 80/20 into training and test sets (fixed seed)
train_data, test_data = dataset.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Model selection, training and evaluation
SEED = 42  # shared seed for the tree ensembles and the cross-validation folds

# Each estimator is created once and kept in a variable because the parameter
# grids below must reference the params of these *instances*. Class-level
# params (e.g. LogisticRegression.regParam) are not owned by any estimator, so
# grid entries built from them are not applied when the estimator is fitted.
lr = LogisticRegression(labelCol='Churn_index', featuresCol='scaled_features')
rf = RandomForestClassifier(labelCol='Churn_index', featuresCol='scaled_features', seed=SEED)
gbt = GBTClassifier(labelCol='Churn_index', featuresCol='scaled_features', seed=SEED)
svc = LinearSVC(labelCol='Churn_index', featuresCol='scaled_features')

classifiers = [lr, rf, gbt, svc]

# Defining the parameter grid for each classifier (same order as `classifiers`)
paramGrids = [
    ParamGridBuilder()
        .addGrid(lr.regParam, [0.1, 0.01])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build(),
    ParamGridBuilder()
        .addGrid(rf.numTrees, [10, 20, 30])
        .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt'])
        .build(),
    ParamGridBuilder()
        .addGrid(gbt.maxDepth, [5, 10])
        .addGrid(gbt.maxIter, [20, 30])
        .build(),
    ParamGridBuilder()
        .addGrid(svc.maxIter, [10, 20])
        .addGrid(svc.regParam, [0.1, 0.01])
        .build()
]

# The metric is spelled out: this evaluator reports the area under the ROC
# curve (AUC), not accuracy.
evaluator = BinaryClassificationEvaluator(labelCol='Churn_index', metricName='areaUnderROC')

best_model = None
best_cv_auc = float('-inf')

# Iterating over classifiers and parameter grids
for classifier, paramGrid in zip(classifiers, paramGrids):
    pipeline = Pipeline(stages=[classifier])
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5,
                              seed=SEED)
    cv_model = crossval.fit(train_data)

    # avgMetrics holds the mean cross-validated AUC of every grid point; the
    # largest one belongs to the configuration that bestModel was refit with.
    cv_auc = max(cv_model.avgMetrics)

    # The test-set score is reported for information only. It is not used to
    # pick the winner, so the test set stays an unbiased hold-out.
    predictions = cv_model.transform(test_data)
    test_auc = evaluator.evaluate(predictions)

    print(f"{classifier.__class__.__name__}: cross-validated AUC = {cv_auc}, test AUC = {test_auc}")

    # `best_model is None` guarantees a model is kept even if every score is 0
    if best_model is None or cv_auc > best_cv_auc:
        best_model = cv_model.bestModel
        best_cv_auc = cv_auc

# Getting the best model and its parameters
print("Best Model:")
print(best_model.stages[0])

# Use the best model for predictions
best_predictions = best_model.transform(test_data)

# Performing evaluation on the best model.
# The variable name is kept for compatibility; the value is the test-set AUC.
best_accuracy = evaluator.evaluate(best_predictions)
print("Best Model Test AUC:", best_accuracy)

Accuracy for LogisticRegression: 0.5
Accuracy for RandomForestClassifier: 0.5
Accuracy for GBTClassifier: 0.3333333333333333
Accuracy for LinearSVC: 0.16666666666666666
Best Model:
LogisticRegressionModel: uid=LogisticRegression_91993ed58702, numClasses=2, numFeatures=4
Best Model Accuracy: 0.5


In [None]:
# Score the best model's test predictions with a MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='Churn_index', predictionCol='prediction')

# One evaluate() call per metric; the metric is switched through the param map
metric_names = ('accuracy', 'weightedPrecision', 'weightedRecall', 'f1')
accuracy, precision, recall, f1_score = (
    evaluator.evaluate(best_predictions, {evaluator.metricName: metric})
    for metric in metric_names
)

# Printing the evaluation results
print("Evaluation Metrics:")
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1_score),
):
    print(f"{label}: {value}")

Evaluation Metrics:
Accuracy: 0.2
Precision: 0.1
Recall: 0.2
F1-Score: 0.13333333333333333


**Documentation: Telecom Customer Churn Prediction**
1. **Introduction**
The objective of this project is to predict customer churn in a telecom company using machine learning techniques implemented in PySpark. By accurately identifying customers at risk of churning, the company can take proactive measures to retain them and improve overall business performance.

2. **Dataset**
The telecom customer dataset used in this project was loaded from the file "telecom_dataset (1).csv". It contains customer demographics (Gender, Age), the contract type (Contract), billing amounts (MonthlyCharges, TotalCharges), and churn status (Churn); the loaded schema has no call-detail or customer-complaint columns.

3. **Preprocessing**
The dataset underwent several preprocessing steps:

**Missing Value Handling**: Rows with missing values were dropped using the dropna() method to ensure data integrity.

**Feature Engineering**: Additional features were created to capture relevant information. Because the dataset contains no call records, the "call duration" feature is a proxy, calculated as the difference between the total charges and monthly charges divided by 60. The average monthly spend feature was created by copying the monthly charges.
4. **Feature Encoding and Scaling**
Categorical variables, including "Gender", "Contract", and "Churn", were encoded using StringIndexer to convert them into numerical values. Feature scaling was applied using the StandardScaler to ensure that all features are on a similar scale.

5. **Data Splitting**
The data was split into training and testing sets with a ratio of 80:20 using the randomSplit() method.

6. **Model Selection and Training**
Four machine learning models were considered for churn prediction: Logistic Regression, Random Forest Classifier, Gradient Boosting Tree Classifier, and Linear Support Vector Classifier. For each model, a parameter grid was defined to tune the hyperparameters. Cross-validation with 5 folds was performed for each model, scored with a BinaryClassificationEvaluator, whose default metric is the area under the ROC curve (AUC) rather than accuracy; the models were then compared on that metric to select the best one.

7. **Model Evaluation**
The selected model's performance was evaluated using a BinaryClassificationEvaluator. The best model was used to make predictions on the test data, and its accuracy was evaluated using a MulticlassClassificationEvaluator. The evaluation metrics calculated were accuracy, precision, recall, and F1-score.

8. **Project Findings**
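Logistic Regression achieved the highest score, 0.5, tied with Random Forest and ahead of the other two algorithms. This score is the area under the ROC curve (it is labelled "Accuracy" in the recorded output), and a value of 0.5 is no better than random guessing.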
The evaluation metrics (precision, recall, and F1-score) were low for the selected model, indicating room for improvement.
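The test-set accuracy of the best model was 0.2. This figure cannot be compared directly with the 0.5 reported above, which is an area under the ROC curve rather than an accuracy, and no training-set accuracy was computed.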
The dataset and preprocessing steps might need further investigation to understand the reasons for low model performance.
9. **Challenges Faced**
Dealing with missing values required careful consideration of the best strategy, and in this case, dropping rows with missing values was chosen.
Selecting the most suitable machine learning algorithm and tuning hyperparameters were challenging due to the lack of insights into the dataset and the specific problem domain.
The low accuracy of the best model on the test data suggests potential issues with the data or feature engineering process.
10. **Lessons Learned**
Proper handling of missing values is crucial for accurate predictions, and alternative strategies like imputation can be explored.
Feature engineering plays a significant role in improving model performance, but it requires a deep understanding of the domain and relevant features.
Regular evaluation and comparison of different models using appropriate metrics help in selecting the best-performing model.
Understanding the underlying reasons for low model performance is essential for model improvement and future iterations of the project.
In conclusion, this project aimed to predict telecom customer churn using PySpark. Although the selected model did not perform well in terms of accuracy and evaluation metrics, the project provides valuable insights into the challenges and considerations involved in building churn prediction models. Further exploration and improvements can be made by refining the preprocessing steps, exploring additional features, and trying different machine learning algorithms and hyperparameter configurations.