In [0]:
from pyspark.sql.functions import col, when, avg, min, max, count

# Source table for the whole notebook; every later cell reads from `df`.
TELCO_TABLE = "workspace.default.telco"

df = spark.read.table(TELCO_TABLE)

# Preview of the raw table.
display(df)

1. Create a dataframe containing 'customerID' and 'MonthlyCharges'

In [0]:
# Project the source table down to the two requested columns.
cust_monthly_columns = ["customerID", "MonthlyCharges"]
cust_monthly_df = df.select(cust_monthly_columns)

display(cust_monthly_df)

2. Show on a pie chart the proportions of possible types of 'Contract'

In [0]:
# Number of customers per contract type (one row per distinct `Contract`).
# Spark gives no ordering guarantee for groupBy output, so sort by contract
# name to keep this table -- and any chart built from it -- stable across runs.
contract_counts = (
    df.groupBy("Contract")
      .agg(count("*").alias("num_customers"))
      .orderBy("Contract")
)

display(contract_counts)

import matplotlib.pyplot as plt

# Pull the per-contract summary (one row per contract type) to the driver,
# since matplotlib works on local pandas data rather than Spark DataFrames.
pdf = contract_counts.toPandas()

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    pdf["num_customers"],
    labels=pdf["Contract"],
    autopct="%1.1f%%",  # percentage label with one decimal on each slice
    startangle=90,
)
ax.set_title("Proportions of Contract Types")
ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.show()

3. Get the number of customers that have Multiple Lines. Show the result as the value of a single variable

In [0]:
# Rows whose MultipleLines value is exactly "Yes"; any other value
# (or null) is not counted.
has_multiple_lines = col("MultipleLines") == "Yes"

multiple_lines_count = df.where(has_multiple_lines).count()
multiple_lines_count

4. What are the minimum and maximum values of tenure? Show these as single variables

In [0]:
# NOTE: `min` / `max` here are the pyspark.sql.functions aggregates imported at
# the top of the notebook (they shadow the Python builtins of the same name).
# Selecting only aggregate expressions produces a single summary row.
tenure_stats = df.select(
    min("tenure").alias("min_tenure"),
    max("tenure").alias("max_tenure"),
).first()

# A Row unpacks like a tuple, in the order the columns were selected.
min_tenure, max_tenure = tenure_stats

min_tenure, max_tenure

5. What are the monthly charges depending on the gender of the customer?

In [0]:
# Per-gender summary: mean monthly charge plus the group size, so the
# averages can be read alongside how many customers they are based on.
gender_aggregates = [
    avg("MonthlyCharges").alias("avg_monthly_charges"),
    count("*").alias("num_customers"),
]

monthly_by_gender = df.groupBy("gender").agg(*gender_aggregates)

display(monthly_by_gender)

6. Do customers who have a yearly contract pay more on average than the others? Return the result as a single bool variable. Note that "others" in this context means the Two year and Month-to-month customers treated together as one group

In [0]:
# Compare the mean MonthlyCharges of "One year" customers with that of all
# other customers (every non-null Contract value that is not "One year").
#
# Both averages are computed in a single aggregation instead of two separate
# filter + collect jobs. `when(...)` without `otherwise` yields null for
# non-matching rows and `avg` ignores nulls, so each average only sees its own
# group. Rows with a null Contract match neither condition, exactly as they
# matched neither the `==` nor the `!=` filter.
is_one_year = col("Contract") == "One year"

contract_avgs = df.agg(
    avg(when(is_one_year, col("MonthlyCharges"))).alias("avg_yearly"),
    avg(when(~is_one_year, col("MonthlyCharges"))).alias("avg_others"),
).collect()[0]

avg_yearly = contract_avgs["avg_yearly"]
avg_others = contract_avgs["avg_others"]

# Single bool answer. If either group is empty its average is None and the
# comparison raises TypeError rather than silently returning a wrong answer.
yearly_pays_more = bool(avg_yearly > avg_others)
yearly_pays_more

7. Create a column that shows the average charges, computed as Total Charges divided by tenure

In [0]:
# Average charge per unit of tenure. Rows where tenure is not > 0 get null
# instead of a division by zero: `when` without an `otherwise` branch already
# yields null for rows that do not match the condition.
has_positive_tenure = col("tenure") > 0
avg_charges_expr = when(has_positive_tenure, col("TotalCharges") / col("tenure"))

df = df.withColumn("avg_charges", avg_charges_expr)

# Show the new column next to the inputs it was derived from.
display(df.select("customerID", "tenure", "TotalCharges", "avg_charges"))