<a href="https://colab.research.google.com/github/Silvio-0-1/ey-technical-training/blob/main/Telecom_Network_Quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Initialize Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("TelecomNetworkQuality") \
    .getOrCreate()

In [72]:
# from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql.window import Window
from pyspark.sql.functions import trim, col, when, to_date, sum as spark_sum, avg, desc, rank, lit, coalesce, isnull,try_to_timestamp,regexp_extract,initcap

## **PHASE 1: INGESTION**


In [58]:
# 1. Read network_logs.csv as all StringType
schema_string = StructType([
    StructField("event_id", StringType(), True),
    StructField("subscriber_id", StringType(), True),
    StructField("tower_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("network_type", StringType(), True),
    StructField("signal_strength", StringType(), True),
    StructField("download_speed_mbps", StringType(), True),
    StructField("upload_speed_mbps", StringType(), True),
    StructField("latency_ms", StringType(), True),
    StructField("call_drop", StringType(), True),
    StructField("event_time", StringType()),
    StructField("device_type", StringType(), True)
])

df_raw = spark.read.csv("/content/network_logs.csv", header=True, schema=schema_string)

In [59]:
# 2. Print schema and row count
df_raw.printSchema()
print(f"Raw Row Count: {df_raw.count()}")

root
 |-- event_id: string (nullable = true)
 |-- subscriber_id: string (nullable = true)
 |-- tower_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- network_type: string (nullable = true)
 |-- signal_strength: string (nullable = true)
 |-- download_speed_mbps: string (nullable = true)
 |-- upload_speed_mbps: string (nullable = true)
 |-- latency_ms: string (nullable = true)
 |-- call_drop: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- device_type: string (nullable = true)

Raw Row Count: 180000


In [60]:
# 3. Show sample rows
df_raw.show(5)

+--------+-------------+--------+---------+------------+---------------+-------------------+-----------------+----------+---------+-------------------+------------+
|event_id|subscriber_id|tower_id|     city|network_type|signal_strength|download_speed_mbps|upload_speed_mbps|latency_ms|call_drop|         event_time| device_type|
+--------+-------------+--------+---------+------------+---------------+-------------------+-----------------+----------+---------+-------------------+------------+
| E100000|        S5975|    T837|Bangalore|          3G|        invalid|               NULL|             NULL|   invalid|      YES|01/01/2026 00:00:00|     Android|
| E100001|        S3537|    T283|Hyderabad|          5G|            -83|             124.07|            41.26|       114|       NO|2026-01-01 00:00:03|FeaturePhone|
| E100002|        S1629|    T877|     Pune|          4G|            -72|              41.01|             3.36|       221|       NO|2026-01-01 00:00:06|FeaturePhone|
| E100003|

## **PHASE 2: CLEANING**

In [61]:
# 1. Trim string columns
df_trimmed = df_raw.select([F.trim(F.col(c)).alias(c) for c in df_raw.columns])

df_trimmed.show(5)

+--------+-------------+--------+---------+------------+---------------+-------------------+-----------------+----------+---------+-------------------+------------+
|event_id|subscriber_id|tower_id|     city|network_type|signal_strength|download_speed_mbps|upload_speed_mbps|latency_ms|call_drop|         event_time| device_type|
+--------+-------------+--------+---------+------------+---------------+-------------------+-----------------+----------+---------+-------------------+------------+
| E100000|        S5975|    T837|Bangalore|          3G|        invalid|               NULL|             NULL|   invalid|      YES|01/01/2026 00:00:00|     Android|
| E100001|        S3537|    T283|Hyderabad|          5G|            -83|             124.07|            41.26|       114|       NO|2026-01-01 00:00:03|FeaturePhone|
| E100002|        S1629|    T877|     Pune|          4G|            -72|              41.01|             3.36|       221|       NO|2026-01-01 00:00:06|FeaturePhone|
| E100003|

In [62]:
# 2. Normalize string fields (city, network_type, device_type, call_drop)
df_normalized = df_trimmed \
    .withColumn("city", F.lower(F.col("city"))) \
    .withColumn("network_type", F.upper(F.col("network_type"))) \
    .withColumn("device_type", F.lower(F.col("device_type"))) \
    .withColumn("call_drop", F.upper(F.col("call_drop")))

df_normalized.show(5)

+--------+-------------+--------+---------+------------+---------------+-------------------+-----------------+----------+---------+-------------------+------------+
|event_id|subscriber_id|tower_id|     city|network_type|signal_strength|download_speed_mbps|upload_speed_mbps|latency_ms|call_drop|         event_time| device_type|
+--------+-------------+--------+---------+------------+---------------+-------------------+-----------------+----------+---------+-------------------+------------+
| E100000|        S5975|    T837|bangalore|          3G|        invalid|               NULL|             NULL|   invalid|      YES|01/01/2026 00:00:00|     android|
| E100001|        S3537|    T283|hyderabad|          5G|            -83|             124.07|            41.26|       114|       NO|2026-01-01 00:00:03|featurephone|
| E100002|        S1629|    T877|     pune|          4G|            -72|              41.01|             3.36|       221|       NO|2026-01-01 00:00:06|featurephone|
| E100003|

In [63]:
# 3. Clean numeric fields safely (Invalid values become null)
df_cleaned = df_normalized \
    .withColumn("signal_strength_clean", F.col("signal_strength").cast(IntegerType())) \
    .withColumn("download_speed_clean", F.col("download_speed_mbps").cast(DoubleType())) \
    .withColumn("upload_speed_clean", F.col("upload_speed_mbps").cast(DoubleType())) \
    .withColumn("latency_clean", F.col("latency_ms").cast(IntegerType()))

df_cleaned.printSchema()

root
 |-- event_id: string (nullable = true)
 |-- subscriber_id: string (nullable = true)
 |-- tower_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- network_type: string (nullable = true)
 |-- signal_strength: string (nullable = true)
 |-- download_speed_mbps: string (nullable = true)
 |-- upload_speed_mbps: string (nullable = true)
 |-- latency_ms: string (nullable = true)
 |-- call_drop: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- signal_strength_clean: integer (nullable = true)
 |-- download_speed_clean: double (nullable = true)
 |-- upload_speed_clean: double (nullable = true)
 |-- latency_clean: integer (nullable = true)



In [66]:
df_cleaned.describe()

DataFrame[summary: string, event_id: string, subscriber_id: string, tower_id: string, city: string, network_type: string, signal_strength: string, download_speed_mbps: string, upload_speed_mbps: string, latency_ms: string, call_drop: string, event_time: string, device_type: string, signal_strength_clean: string, download_speed_clean: string, upload_speed_clean: string, latency_clean: string]

In [75]:

from pyspark.sql.functions import expr, coalesce

df_cleaned_date = df_cleaned.withColumn(
    "event_time_new",
    coalesce(
        expr(to_date(try_to_timestamp(col("event_time"), 'yyyy-MM-dd'))),
        expr(to_date(try_to_timestamp(col("event_time"), 'dd/MM/yyyy'))),
        expr(to_date(try_to_timestamp(col("event_time"), 'yyyy/MM/dd')))
    )
)

df_cleaned_date.show()


PySparkTypeError: [NOT_ITERABLE] Column is not iterable.

## **PHASE 3: VALIDATION**

In [34]:
# 1. Count invalid values for each numeric field (where result is null but original wasn't null/empty)
# 2. Count invalid timestamps
invalid_counts = df_cleaned.select(
    F.sum(F.when(F.col("signal_strength_clean").isNull() & F.col("signal_strength").isNotNull(), 1).otherwise(0)).alias("invalid_signal"),
    F.sum(F.when(F.col("download_speed_clean").isNull() & F.col("download_speed_mbps").isNotNull(), 1).otherwise(0)).alias("invalid_download"),
    F.sum(F.when(F.col("upload_speed_clean").isNull() & F.col("upload_speed_mbps").isNotNull(), 1).otherwise(0)).alias("invalid_upload"),
    F.sum(F.when(F.col("latency_clean").isNull() & F.col("latency_ms").isNotNull(), 1).otherwise(0)).alias("invalid_latency"),
    F.sum(F.when(F.col("event_time_clean").isNull() & F.col("event_time").isNotNull(), 1).otherwise(0)).alias("invalid_time")
)
print("Invalid Value Counts:")
invalid_counts.show()

Invalid Value Counts:


{"ts": "2026-01-19 09:13:15.362", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"INT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 4 in cell [29]", "line": "", "fragment": "cast", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o880.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"INT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"cast\" was called from\nline 4 in cell [29]\n\n\tat org.apache.spark.sql.errors.QueryExecution

NumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 4 in cell [29]


In [None]:
# 3. Remove duplicate logs based on event_id
df_deduped = df_cleaned.dropDuplicates(["event_id"])
print(f"Count after deduplication: {df_deduped.count()}")

## **PHASE 4: NETWORK KPIS**

In [None]:
# Helper: Create a numeric flag for call drop for easy averaging
df_kpi_base = df_deduped.withColumn("is_call_drop", F.when(F.col("call_drop") == "YES", 1).otherwise(0))

# 1. Average download speed per city
# 2. Average latency per city
# 3. Call drop rate per city
city_kpis = df_kpi_base.groupBy("city").agg(
    F.avg("download_speed_clean").alias("avg_download_speed"),
    F.avg("latency_clean").alias("avg_latency"),
    (F.sum("is_call_drop") / F.count("*")).alias("call_drop_rate")
)
print("KPIs per City:")
city_kpis.show()

In [None]:
# 4. Call drop rate per tower
tower_kpis = df_kpi_base.groupBy("tower_id", "city").agg(
    F.avg("download_speed_clean").alias("avg_download_speed"),
    F.avg("latency_clean").alias("avg_latency"),
    (F.sum("is_call_drop") / F.count("*")).alias("call_drop_rate")
)

In [None]:
# 5. Identify top 10 worst towers
# Criteria: High Drop Rate, High Latency, Low Download Speed
# We can order by these columns descending/ascending respectively.
worst_towers = tower_kpis.orderBy(
    F.desc("call_drop_rate"),
    F.desc("avg_latency"),
    F.asc("avg_download_speed")
).limit(10)

print("Top 10 Worst Towers:")
worst_towers.show()

## **PHASE 5: CUSTOMER EXPERIENCE**

In [None]:
# Compute metrics for each subscriber_id
subscriber_stats = df_kpi_base.groupBy("subscriber_id").agg(
    F.count("event_id").alias("total_events"),
    F.avg("download_speed_clean").alias("sub_avg_download"),
    F.avg("latency_clean").alias("sub_avg_latency"),
    F.sum("is_call_drop").alias("call_drop_count")
)

# 5. Identify subscribers with poor experience
# Logic defined: High drops (>3), Low speed (<5 Mbps), OR High latency (>100ms) - Example thresholds
poor_experience_subs = subscriber_stats.filter(
    (F.col("call_drop_count") > 3) |
    (F.col("sub_avg_download") < 5.0) |
    (F.col("sub_avg_latency") > 100)
)

print("Sample Subscribers with Poor Experience:")
poor_experience_subs.show(5)

## **PHASE 6: WINDOW FUNCTIONS**

In [None]:
# 1. Rank towers within each city by call drop rate
window_city_tower = Window.partitionBy("city").orderBy(F.desc("call_drop_rate"))

tower_ranks = tower_kpis.withColumn("rank_in_city", F.rank().over(window_city_tower))
print("Towers Ranked by Drop Rate (per City):")
tower_ranks.show(5)

In [None]:
# 2. Rank subscribers within each city by worst experience
# We need city info at subscriber level. Joining back or carrying city in aggregation.
# For simplicity, let's assume we aggregate by subscriber AND city (assuming subs stay in one city mostly)
sub_city_stats = df_kpi_base.groupBy("subscriber_id", "city").agg(
    F.sum("is_call_drop").alias("call_drop_count")
)
window_city_sub = Window.partitionBy("city").orderBy(F.desc("call_drop_count"))

sub_ranks = sub_city_stats.withColumn("worst_exp_rank", F.rank().over(window_city_sub))
print("Subscribers Ranked by Worst Experience (per City):")
sub_ranks.show(5)

In [None]:
# 3. Use lag() to detect sudden deterioration in signal_strength for a tower
# We need time-series data per tower
window_tower_time = Window.partitionBy("tower_id").orderBy("event_time_clean")

df_signal_lag = df_kpi_base.withColumn(
    "prev_signal",
    F.lag("signal_strength_clean").over(window_tower_time)
)

# Detect if signal dropped by more than 10 dBm compared to previous log
df_signal_deterioration = df_signal_lag.withColumn(
    "signal_drop",
    F.col("prev_signal") - F.col("signal_strength_clean")
).filter(F.col("signal_drop") > 10)

print("Events with Sudden Signal Deterioration:")
df_signal_deterioration.select("tower_id", "event_time_clean", "signal_strength_clean", "prev_signal", "signal_drop").show(5)

## **PHASE 7: ANOMALY DETECTION**

In [None]:
# Detect towers where: Latency spikes, Download speed drops, Call drops spike
# Using Window functions and LAG

df_anomalies = df_kpi_base.withColumn(
    "prev_latency", F.lag("latency_clean").over(window_tower_time)
).withColumn(
    "prev_speed", F.lag("download_speed_clean").over(window_tower_time)
).withColumn(
    "prev_drops", F.lag("is_call_drop").over(window_tower_time) # lagging the binary flag
)

# Logic for Anomaly:
# 1. Latency spikes: Current > 2x Previous
# 2. Speed drops: Current < 0.5x Previous
# 3. Call drops spike: Current is drop, previous was NOT drop (transition to failure)
anomalous_events = df_anomalies.filter(
    (F.col("latency_clean") > 2 * F.col("prev_latency")) |
    (F.col("download_speed_clean") < 0.5 * F.col("prev_speed")) |
    ((F.col("is_call_drop") == 1) & (F.col("prev_drops") == 0))
)

print("Anomalous Tower Events:")
anomalous_events.select("tower_id", "event_time_clean", "latency_clean", "prev_latency", "download_speed_clean", "is_call_drop").show(5)