Load and Feature Engineering

In [2]:
import pandas as pd 

# Load the network dataset from a CSV file; the relative path is resolved
# against the notebook's working directory.
df=pd.read_csv('cleaned_network_data.csv')
# Show the column index as the cell output to see which fields are available.
df.columns

Index(['timestamp', 'tower_id', 'latency_sec', 'bandwidth', 'dropped_calls',
       'total_calls', 'uptime_percent', 'network_type', 'operator',
       'users_connected', 'download_speed_mbps', 'signal_strength_dbm',
       'tower_load_percent', 'average_call_duration_sec',
       'handover_success_rate', 'packet_loss_percent', 'jitter_ms',
       'tower_temperature_c', 'battery_backup_hours', 'tower_age_years',
       'maintenance_due', 'upload_speed_mbps', 'call_drop_reason',
       'weather_condition', 'technician_notes', 'last_maintenance',
       'tower_color', 'is_test_tower', 'tower_height_m', 'signal_icon',
       'internal_code', 'notes', 'extra_flag', 'location.latitude',
       'location.longitude', 'signal_strength.RSSI', 'signal_strength.RSRP',
       'signal_strength.SINR', 'voip_metrics.jitter_ms',
       'voip_metrics.packet_loss_percent', 'call_drop_rate', 'bandwidth_mbps'],
      dtype='object')

Drop columns that are not required

In [3]:
# Columns the analysis below does not use; they are removed up front so the
# working frame only carries the fields that matter.
irrelevant_cols = [
    "weather_condition",
    "technician_notes",
    "last_maintenance",
    "tower_color",
    "is_test_tower",
    "tower_height_m",
    "signal_icon",
    "internal_code",
    "notes",
    "extra_flag"
]

# errors="ignore" keeps this cell re-runnable: because `df` is rebound to the
# trimmed frame, a second execution would otherwise raise KeyError for
# columns that have already been dropped.
df = df.drop(columns=irrelevant_cols, errors="ignore")

In [5]:
# Display the remaining columns to confirm the drop above took effect.
df.columns

Index(['timestamp', 'tower_id', 'latency_sec', 'bandwidth', 'dropped_calls',
       'total_calls', 'uptime_percent', 'network_type', 'operator',
       'users_connected', 'download_speed_mbps', 'signal_strength_dbm',
       'tower_load_percent', 'average_call_duration_sec',
       'handover_success_rate', 'packet_loss_percent', 'jitter_ms',
       'tower_temperature_c', 'battery_backup_hours', 'tower_age_years',
       'maintenance_due', 'upload_speed_mbps', 'call_drop_reason',
       'location.latitude', 'location.longitude', 'signal_strength.RSSI',
       'signal_strength.RSRP', 'signal_strength.SINR',
       'voip_metrics.jitter_ms', 'voip_metrics.packet_loss_percent',
       'call_drop_rate', 'bandwidth_mbps'],
      dtype='object')

In [19]:
def needs_optimization_relaxed(row, min_violations=2):
    """Flag a row as needing optimization when enough KPI checks fail.

    Five independent checks are evaluated against the row:

    1. ``uptime_percent`` is below 95.
    2. ``dropped_calls / total_calls`` is above 0.1 (the denominator is
       floored at 1 so rows with zero calls do not divide by zero).
    3. ``download_speed_mbps`` is below 5 or ``upload_speed_mbps`` is below 2.
    4. ``packet_loss_percent`` is above 3 or ``jitter_ms`` is above 18.
    5. ``signal_strength.RSSI`` is below -105, ``signal_strength.RSRP`` is
       below -125, or ``signal_strength.SINR`` is below -5.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the column names above, e.g. a DataFrame row
        as passed by ``df.apply(..., axis=1)`` or a plain dict.
    min_violations : int, default 2
        Minimum number of failed checks required to flag the row. The
        default reproduces the original "relaxed" rule (two or more).

    Returns
    -------
    bool
        True when at least ``min_violations`` of the five checks fail.
    """
    # Floor the denominator so a row with no calls yields a ratio of 0
    # instead of raising ZeroDivisionError.
    call_denominator = max(row['total_calls'], 1)

    violations = (
        row['uptime_percent'] < 95,
        row['dropped_calls'] / call_denominator > 0.1,
        row['download_speed_mbps'] < 5 or row['upload_speed_mbps'] < 2,
        row['packet_loss_percent'] > 3 or row['jitter_ms'] > 18,
        (
            row['signal_strength.RSSI'] < -105
            or row['signal_strength.RSRP'] < -125
            or row['signal_strength.SINR'] < -5
        ),
    )
    # Each check is a boolean, so summing them counts the failed checks.
    return sum(violations) >= min_violations

# Evaluate the rule once per row (axis=1) and store the verdict in a new
# 'needs_optimization' column.
df['needs_optimization'] = df.apply(needs_optimization_relaxed, axis=1)

In [20]:
# Preview the first rows, including the newly added needs_optimization column.
df.head()

Unnamed: 0,timestamp,tower_id,latency_sec,bandwidth,dropped_calls,total_calls,uptime_percent,network_type,operator,users_connected,...,location.latitude,location.longitude,signal_strength.RSSI,signal_strength.RSRP,signal_strength.SINR,voip_metrics.jitter_ms,voip_metrics.packet_loss_percent,call_drop_rate,bandwidth_mbps,needs_optimization
0,2025-08-22 00:00:00,TWR1062,0.363,64.56 Gbps,3,129,97.15,5G,Three,478,...,57.144657,-2.092696,-110.41,-135.7,29.61,19.47,0.76,2.325581,64560.0,False
1,2025-08-22 00:05:00,TWR1077,0.11,81.71 Mbps,9,118,97.7,4G,O2,308,...,52.493125,-1.897314,-73.55,-93.07,12.72,46.3,3.71,7.627119,81.71,False
2,2025-08-22 00:10:00,TWR1056,0.965,98.17 Mbps,0,69,96.56,4G,EE,272,...,53.408641,-2.987669,-91.22,-82.46,21.31,38.92,1.36,0.0,98.17,False
3,2025-08-22 00:15:00,TWR1043,0.364,12.41 Gbps,8,191,99.28,4G,Vodafone UK,88,...,51.744629,-1.264011,-99.08,-90.35,-9.89,12.17,0.89,4.188482,12410.0,True
4,2025-08-22 00:20:00,TWR1062,0.211,98.31 Gbps,3,69,97.56,5G,O2,15,...,53.408299,-2.98309,-73.74,-115.27,24.99,11.5,2.02,4.347826,98310.0,False


In [21]:
# Shape of the subset flagged for optimization; the boolean column is used
# directly as the mask, so comparing it to True is unnecessary.
df[df['needs_optimization']].shape


(2485, 33)

In [22]:
# Shape of the subset NOT flagged for optimization; `~` inverts the boolean
# mask, which is the idiomatic form of comparing it to False.
df[~df['needs_optimization']].shape

(6515, 33)

Now we'll train the model with a RandomForest classifier

In [None]:
# RandomForestClassifier lives in sklearn.ensemble; scikit-learn has no
# `sklearn.models` subpackage, so the previous import could not succeed.
from sklearn.ensemble import RandomForestClassifier