Generating fake dataset


In [None]:
# Install dependencies
!pip install faker pandas

# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()

# Set seed for reproducibility
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
num_devices = 500
num_towers = 50
num_accounts = 1000
num_defaulters = 50
num_records = 10000

# Generate timestamps
timestamps = pd.date_range('2023-01-01', periods=num_records, freq='T')

# Generate Device-Tower connection logs
device_ids = np.random.choice(range(10000, 10000 + num_devices), size=num_records, replace=True)
tower_ids = np.random.choice([f'TWR_{i}' for i in range(100, 100 + num_towers)], size=num_records, replace=True)

tower_logs = pd.DataFrame({
    'timestamp': timestamps,
    'device_id': device_ids,
    'tower_id': tower_ids
})


# Save tower logs
tower_logs.to_csv('fake_tower_logs.csv', index=False)

# Generate Account to Device Mapping
account_ids = np.arange(20000, 20000 + num_accounts)
mapped_device_ids = np.random.choice(device_ids, size=num_accounts, replace=True)

account_device_mapping = pd.DataFrame({
    'account_id': account_ids,
    'device_id': mapped_device_ids
})

# Save account-device mapping
account_device_mapping.to_csv('fake_account_device_mapping.csv', index=False)

# Select random defaulters
defaulter_accounts = random.sample(list(account_ids), num_defaulters)
defaulters_df = pd.DataFrame({
    'account_id': defaulter_accounts,
    'is_defaulter': [1]*num_defaulters
})

# Save defaulters
defaulters_df.to_csv('fake_defaulters.csv', index=False)

# Display files to download (Colab)
from google.colab import files

print("Generated files:")
print("- fake_tower_logs.csv")
print("- fake_account_device_mapping.csv")
print("- fake_defaulters.csv")

# Optional: Auto-download generated files
files.download('fake_tower_logs.csv')
files.download('fake_account_device_mapping.csv')
files.download('fake_defaulters.csv')


Collecting faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.2
Generated files:
- fake_tower_logs.csv
- fake_account_device_mapping.csv
- fake_defaulters.csv


  timestamps = pd.date_range('2023-01-01', periods=num_records, freq='T')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from google.colab import files

fake = Faker()
Faker.seed(42)
np.random.seed(42)

# Read the connection log written by the data-generation cell
tower_logs = pd.read_csv('fake_tower_logs.csv')

# Distinct, non-null tower ids in order of first appearance
unique_towers = tower_logs['tower_id'].dropna().unique()
tower_count = len(unique_towers)

# Draw one random coordinate pair per tower.
# Latitudes are drawn before longitudes; the draw order fixes the seeded values.
latitudes = np.random.uniform(12.0, 37.0, size=tower_count)    # within India approx. lat range
longitudes = np.random.uniform(68.0, 97.0, size=tower_count)   # within India approx. lon range

tower_locations = pd.DataFrame({
    'tower_id': unique_towers,
    'latitude': latitudes,
    'longitude': longitudes
})

# Attach each tower's coordinates to every log row (left join keeps all log rows)
tower_logs_with_loc = tower_logs.merge(tower_locations, how='left', on='tower_id')

# Write the enriched log
tower_logs_with_loc.to_csv('fake_tower_logs_with_locations.csv', index=False)

# Trigger a browser download of the enriched log (Colab only)
files.download('fake_tower_logs_with_locations.csv')

print("✅ Enhanced file generated: fake_tower_logs_with_locations.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Enhanced file generated: fake_tower_logs_with_locations.csv


Sorting solution (no ML needed)


In [None]:
# Install dependencies
!pip install pandas

# Import required libraries
import pandas as pd
from google.colab import files

# Load datasets
tower_logs = pd.read_csv('fake_tower_logs_with_locations.csv')
account_device_mapping = pd.read_csv('fake_account_device_mapping.csv')
defaulters = pd.read_csv('fake_defaulters.csv')

# Filter for defaulter accounts
defaulter_accounts = defaulters['account_id'].unique()
defaulter_devices = account_device_mapping[account_device_mapping['account_id'].isin(defaulter_accounts)]

# Merge defaulter devices with tower logs
defaulter_logs = pd.merge(defaulter_devices, tower_logs, on='device_id', how='inner')

# Sort logs to find last-known tower per defaulter
defaulter_logs_sorted = defaulter_logs.sort_values(by=['account_id', 'timestamp'])

# Extract last known location per defaulter (simple heuristic)
last_known_locations = defaulter_logs_sorted.groupby('account_id').last().reset_index()

# Keep only relevant columns
output = last_known_locations[['account_id', 'device_id', 'timestamp', 'tower_id', 'latitude', 'longitude']]

# Save inferred locations to CSV
output.to_csv('defaulter_last_known_locations.csv', index=False)

# Download output file
files.download('defaulter_last_known_locations.csv')

# Display result
print("✅ Last known locations of defaulters:")
print(output.head())




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Last known locations of defaulters:
   account_id  device_id            timestamp tower_id   latitude  longitude
0       20006      10223  2023-01-07 14:06:00  TWR_118  35.767858  90.478852
1       20025      10095  2023-01-07 12:48:00  TWR_141  16.991845  90.366840
2       20027      10161  2023-01-07 18:16:00  TWR_111  13.452090  70.566283
3       20030      10208  2023-01-07 14:36:00  TWR_117  36.247746  75.869122
4       20032      10074  2023-01-07 06:28:00  TWR_129  29.105826  93.729170


In [None]:
# Install required libraries
!pip install pandas numpy tensorflow scikit-learn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files

# Load datasets
tower_logs = pd.read_csv('fake_tower_logs_with_locations.csv')
account_device_mapping = pd.read_csv('fake_account_device_mapping.csv')
defaulters = pd.read_csv('fake_defaulters.csv')

# Merge defaulters and tower logs
defaulter_devices = account_device_mapping[account_device_mapping['account_id'].isin(defaulters['account_id'])]
logs = pd.merge(defaulter_devices, tower_logs, on='device_id')

# Sort by timestamp
logs_sorted = logs.sort_values(by=['account_id', 'timestamp'])

# Encode tower IDs
le = LabelEncoder()
logs_sorted['tower_num'] = le.fit_transform(logs_sorted['tower_id'])

# Prepare sequences per defaulter
sequences = logs_sorted.groupby('account_id')['tower_num'].apply(list)

# Prepare training data
seq_length = 5  # lookback length (number of past tower connections)
X, y = [], []
for seq in sequences:
    if len(seq) > seq_length:
        for i in range(len(seq) - seq_length):
            X.append(seq[i:i+seq_length])
            y.append(seq[i+seq_length])

X = pad_sequences(X, maxlen=seq_length)
y = np.array(y)

# Define LSTM model
model = Sequential([
    Embedding(input_dim=len(le.classes_), output_dim=64, input_length=seq_length),
    LSTM(128, activation='relu'),
    Dense(len(le.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=10, batch_size=32)

# Predict last location for each defaulter
def predict_last_tower(seq):
    seq = pad_sequences([seq[-seq_length:]], maxlen=seq_length)
    pred_num = np.argmax(model.predict(seq))
    return le.inverse_transform([pred_num])[0]

# Generate predictions
results = []
for acc, seq in sequences.items():
    predicted_tower = predict_last_tower(seq)
    tower_info = tower_logs[tower_logs['tower_id'] == predicted_tower].iloc[0]
    results.append({
        'account_id': acc,
        'predicted_tower_id': predicted_tower,
        'latitude': tower_info['latitude'],
        'longitude': tower_info['longitude']
    })

# Save predictions
predictions_df = pd.DataFrame(results)
predictions_df.to_csv('defaulter_predicted_locations.csv', index=False)

# Download results
files.download('defaulter_predicted_locations.csv')

print("✅ Predicted last-known locations using LSTM:")
print(predictions_df.head())

# Save LSTM model
model.save("lstm_tower_predictor.h5")

# Save tower label encoder for decoding predictions later
import pickle
with open("lstm_label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)



Epoch 1/10




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.0228 - loss: 3.9113
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0536 - loss: 3.9039
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0363 - loss: 3.8912
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0355 - loss: 3.8592
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0403 - loss: 3.8027
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0483 - loss: 3.7643
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0876 - loss: 3.7088
Epoch 8/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0956 - loss: 3.5644
Epoch 9/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Predicted last-known locations using LSTM:
   account_id predicted_tower_id   latitude  longitude
0       20006            TWR_112  21.363503  96.117954
1       20025            TWR_124  15.050956  71.468233
2       20027            TWR_111  13.452090  70.566283
3       20030            TWR_140  21.159046  91.648381
4       20032            TWR_104  15.487347  73.762755




GNN method

In [None]:
# Install PyTorch Geometric and dependencies
# NOTE(review): none of these installs pin a version, so results may change as
# new releases are published.
!pip install torch torchvision torchaudio
!pip install torch-geometric
# NOTE(review): the wheel index below is hard-coded to torch 2.6.0 + CUDA 12.4;
# presumably it must match the torch build actually installed - confirm, and
# update the URL when the runtime's torch version changes.
# torch-geometric is listed again here although the previous line already installs it.
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.6.0+cu124.html

# Install other dependencies
!pip install pandas numpy scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_scatter-2.1.2%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m120.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from google.colab import files

print("✅ Libraries imported successfully.")


✅ Libraries imported successfully.


In [None]:
# Load datasets
tower_logs = pd.read_csv('fake_tower_logs_with_locations.csv')
account_device_mapping = pd.read_csv('fake_account_device_mapping.csv')
defaulters = pd.read_csv('fake_defaulters.csv')

# Filter logs for defaulters only.
# The merge yields one copy of a device's log rows per defaulter account mapped
# to that device, so a device shared by several defaulters appears multiple times.
defaulter_devices = account_device_mapping[account_device_mapping['account_id'].isin(defaulters['account_id'])]
logs = pd.merge(defaulter_devices, tower_logs, on='device_id')

# Sort logs chronologically (within each device)
logs_sorted = logs.sort_values(by=['device_id', 'timestamp'])

# Label encode towers; the class index doubles as the graph node index
tower_encoder = LabelEncoder()
logs_sorted['tower_num'] = tower_encoder.fit_transform(logs_sorted['tower_id'])

# Prepare node features (latitude, longitude), one row per tower ordered by node index
tower_features = logs_sorted[['tower_num', 'latitude', 'longitude']].drop_duplicates().sort_values('tower_num')
# Row i of x must describe node i; this fails if a tower has inconsistent coordinates
if len(tower_features) != len(tower_encoder.classes_):
    raise ValueError("Expected exactly one (latitude, longitude) pair per tower.")
x = torch.tensor(tower_features[['latitude', 'longitude']].values, dtype=torch.float)

# Prepare edges (movement between towers).
# Collapse the per-account duplicates first: otherwise every visit of a shared
# device is repeated back-to-back and produces spurious tower -> same-tower edges.
device_paths = logs_sorted.drop_duplicates(subset=['device_id', 'timestamp'])

edge_list = []
for device, group in device_paths.groupby('device_id'):
    towers_visited = group['tower_num'].tolist()
    # consecutive visits (a, b) become a directed edge a -> b
    edge_list.extend(zip(towers_visited[:-1], towers_visited[1:]))

if not edge_list:
    raise ValueError("No tower-to-tower transitions found; cannot build the graph.")

# Shape (2, num_edges): row 0 = source nodes, row 1 = destination nodes
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

# Graph Data object
data = Data(x=x, edge_index=edge_index)

# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    """Two SAGEConv layers with a ReLU in between.

    Args:
        in_channels: size of each input node feature vector.
        hidden_channels: output size of the first SAGEConv layer.
        out_channels: output size of the second SAGEConv layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second layer's raw (un-activated) output for every node."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

# Initialize model.
# Seed torch first so the weight initialisation (and therefore training) is
# repeatable, matching the seeded data-generation cells.
torch.manual_seed(42)

num_nodes = x.shape[0]
# in_channels follows the node-feature width (latitude, longitude);
# one output logit per tower, so out_channels == num_nodes.
model = GraphSAGE(in_channels=x.shape[1], hidden_channels=32, out_channels=num_nodes)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Prepare training data (Predict next tower):
# each edge src -> dst is one training example "from src, the next tower is dst".
train_src, train_dst = edge_index[0], edge_index[1]

# Train the GNN (full-graph forward pass each epoch)
model.train()
for epoch in range(30):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # logits of each edge's source node are scored against its destination node
    loss = criterion(out[train_src], train_dst)
    loss.backward()
    optimizer.step()
    if (epoch+1)%5 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Prediction for Defaulters
# pickle is needed for the encoder dump below and is not imported by the GNN
# import cell; importing it here keeps this cell runnable on a fresh kernel.
import pickle

model.eval()

# The graph is fixed, so one forward pass gives the logits for every tower.
# no_grad: inference only, no autograd bookkeeping needed.
with torch.no_grad():
    out = model(data.x, data.edge_index)
# Most likely next tower (node index) for each current tower
next_tower_nums = out.argmax(dim=1)

# One row of coordinates per tower (first occurrence), indexed by tower id
tower_coords = (
    tower_logs
    .drop_duplicates(subset='tower_id')
    .set_index('tower_id')[['latitude', 'longitude']]
)

results = []
for acc, group in logs_sorted.groupby('account_id'):
    # logs_sorted is ordered by (device_id, timestamp); an account maps to a
    # single device, so the group's last row is its most recent connection.
    last_tower_num = int(group['tower_num'].iloc[-1])
    predicted_next_tower_num = next_tower_nums[last_tower_num].item()
    predicted_tower_id = tower_encoder.inverse_transform([predicted_next_tower_num])[0]

    tower_info = tower_coords.loc[predicted_tower_id]
    results.append({
        'account_id': acc,
        'predicted_tower_id': predicted_tower_id,
        'latitude': tower_info['latitude'],
        'longitude': tower_info['longitude']
    })

# Save predictions
pred_df = pd.DataFrame(results)
pred_df.to_csv('gnn_defaulter_predicted_locations.csv', index=False)
files.download('gnn_defaulter_predicted_locations.csv')

print("✅ GNN predictions completed successfully:")
print(pred_df.head())

# Save GNN model state
torch.save(model.state_dict(), "gnn_tower_predictor.pth")

# Save tower label encoder
with open("gnn_label_encoder.pkl", "wb") as f:
    pickle.dump(tower_encoder, f)


Epoch 5, Loss: 32.1283
Epoch 10, Loss: 18.0013
Epoch 15, Loss: 21.2641
Epoch 20, Loss: 17.1472
Epoch 25, Loss: 11.0439
Epoch 30, Loss: 9.0585


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ GNN predictions completed successfully:
   account_id predicted_tower_id   latitude  longitude
0       20006            TWR_130  23.003812  81.694233
1       20025            TWR_122  25.118911  91.263712
2       20027            TWR_122  25.118911  91.263712
3       20030            TWR_106  16.545624  76.147101
4       20032            TWR_130  23.003812  81.694233
