In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=b1ca7abf027f16d12ebc36dde6ee9db337fc7284bb771217d9e026d4f06c2316
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, array, array_contains, when
from pyspark.ml.fpm import FPGrowth

# FP-Growth over customer demographics.
#
# Pipeline:
#   1. Read the two Telco CSV exports and outer-join them on CustomerID.
#   2. Persist the merged table as CSV and read it back.
#   3. Turn each demographic attribute into a "<column>: <value>" item string.
#   4. Mine frequent itemsets / association rules and keep only the rules that
#      have a churn-risk label in the consequent and lift > 1.

spark = SparkSession.builder \
    .appName("FP-Growth Demographic Analysis") \
    .getOrCreate()

file1 = '/content/drive/MyDrive/Telco_customer_churn.csv'
file2 = '/content/drive/MyDrive/Telco_customer_churn_demographics.csv'
merged_file = '/content/drive/MyDrive/merged_file.csv'

# A customer whose "Churn Score" is strictly above this value is labelled high risk.
churn_score_threshold = 50

# Columns that become the items of each FP-Growth transaction. Defined once so
# the prefixing loop, the select and the array() call cannot drift apart.
item_columns = [
    'High Churn Risk',
    'Senior Citizen',
    'Gender',
    'Age',
    'Married',
    'Number of Dependents',
    'Under 30',
]

df1 = spark.read.csv(file1, header=True, inferSchema=True)
df2 = spark.read.csv(file2, header=True, inferSchema=True)

# Remove these columns from the demographics table before joining; "Gender" and
# "Senior Citizen" are read from the merged frame below, so they are taken from
# df1. Presumably 'count' and 'dependents' are duplicated in df1 as well --
# TODO confirm against the CSV headers.
df2 = df2.drop('count', 'dependents', 'gender', 'senior citizen')
merged_df = df1.join(df2, on='CustomerID', how='outer')

# Persist the merged table, then reload it so the rest of the analysis runs
# against the file that was written.
merged_df.write.csv(merged_file, header=True, mode='overwrite')
df = spark.read.csv(merged_file, header=True, inferSchema=True)

# NOTE(review): a null "Churn Score" (possible after the outer join) makes the
# condition null, so such rows fall into otherwise() and are labelled
# "Low Churn Risk" -- confirm this is intended.
df = df.withColumn(
    "High Churn Risk",
    when(col("Churn Score") > churn_score_threshold, "High Churn Risk")
    .otherwise("Low Churn Risk"),
)

# NOTE(review): this assumes "Senior Citizen" is encoded as 0/1. If the CSV
# stores "Yes"/"No" instead, every row ends up "Non-Senior" -- verify against
# the data.
df = df.withColumn(
    "Senior Citizen",
    when(col("Senior Citizen") == 1, "Senior").otherwise("Non-Senior"),
)

# Prefix every value with its column name so that identical raw values from
# different columns (e.g. "Yes") stay distinct items inside a transaction.
for column in item_columns:
    df = df.withColumn(column, concat(lit(column + ': '), col(column).cast('string')))

# concat() yields null when the underlying value is null, so na.drop() removes
# every row that is missing any of the item columns.
df = df.select(*item_columns).na.drop()
df = df.withColumn("features", array(*item_columns))

fp_growth = FPGrowth(itemsCol="features", minSupport=0.05, minConfidence=0.1)
model = fp_growth.fit(df)

frequent_itemsets = model.freqItemsets

# Keep only positively associated rules (lift > 1) that predict a churn label.
association_rules = model.associationRules.filter(
    (col("lift") > 1) &
    (array_contains(col("consequent"), "High Churn Risk: High Churn Risk") |
     array_contains(col("consequent"), "High Churn Risk: Low Churn Risk"))
)

print("Frequent Itemsets:")
frequent_itemsets.show(truncate=False)

print("Specific Association Rules with Churn Risk in Consequent:")
association_rules.show(truncate=False)

spark.stop()

Frequent Itemsets:
+-------------------------------------------------------------------------------------------------------------------------------+----+
|items                                                                                                                          |freq|
+-------------------------------------------------------------------------------------------------------------------------------+----+
|[High Churn Risk: Low Churn Risk]                                                                                              |2596|
|[High Churn Risk: Low Churn Risk, Number of Dependents: 0]                                                                     |1811|
|[High Churn Risk: Low Churn Risk, Number of Dependents: 0, Under 30: No]                                                       |1444|
|[High Churn Risk: Low Churn Risk, Number of Dependents: 0, Under 30: No, Senior Citizen: Non-Senior]                           |1444|
|[High Churn Risk: Low Churn Risk, N