In [17]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from datetime import datetime
import matplotlib.pyplot as plt


In [18]:
# Load your dataframe
df = pd.read_csv('/content/user_transactions_july2025.csv')  # replace with your file path

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%d-%m-%Y %H:%M")

# Sort by user and time
df = df.sort_values(by=['user_id', 'timestamp'])

# Extract IP prefix (first 3 numbers)
df['ip_prefix'] = df['ip_address'].apply(lambda x: '.'.join(x.split('.')[:3]))

# Extract time features
df['hour'] = df['timestamp'].dt.hour
df['weekday'] = df['timestamp'].dt.weekday

# Optional time bucket
def time_bucket(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

df['time_bucket'] = df['hour'].apply(time_bucket)

In [19]:
# Transitions: current_ip_prefix -> next_ip_prefix
transitions = defaultdict(lambda: defaultdict(int))

# Build transitions
for user_id, user_df in df.groupby('user_id'):
    ip_seq = user_df['ip_prefix'].tolist()
    for i in range(1, len(ip_seq)):
        prev_ip = ip_seq[i-1]
        curr_ip = ip_seq[i]
        transitions[prev_ip][curr_ip] += 1

# Normalize to probabilities
probs = {}
for from_ip, to_dict in transitions.items():
    total = sum(to_dict.values())
    probs[from_ip] = {to_ip: count/total for to_ip, count in to_dict.items()}


In [20]:
def predict_next_ip(current_ip_prefix):
    if current_ip_prefix in probs:
        return max(probs[current_ip_prefix], key=lambda x: probs[current_ip_prefix][x])
    else:
        return None


In [21]:
second_order_transitions = defaultdict(lambda: defaultdict(int))

for user_id, user_df in df.groupby('user_id'):
    ip_seq = user_df['ip_prefix'].tolist()
    for i in range(2, len(ip_seq)):
        prev_pair = (ip_seq[i-2], ip_seq[i-1])
        curr_ip = ip_seq[i]
        second_order_transitions[prev_pair][curr_ip] += 1

# Normalize
second_order_probs = {}
for from_pair, to_dict in second_order_transitions.items():
    total = sum(to_dict.values())
    second_order_probs[from_pair] = {to_ip: count/total for to_ip, count in to_dict.items()}


In [22]:
# Simple clustering example based on user stats
user_stats = df.groupby('user_id').agg({
    'ip_prefix': pd.Series.nunique,
    'amount': 'std',
    'ip_address': 'count'
}).fillna(0)

user_stats.columns = ['unique_ips', 'amount_std', 'num_txns']

# Fit KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
user_stats['cluster'] = kmeans.fit_predict(user_stats)

# Map back to df
df = df.merge(user_stats['cluster'], left_on='user_id', right_index=True)


In [23]:
# Prepare features for ML
df['prev_ip_prefix'] = df.groupby('user_id')['ip_prefix'].shift(1)
df['prev2_ip_prefix'] = df.groupby('user_id')['ip_prefix'].shift(2)

df.dropna(inplace=True)

le = LabelEncoder()
df['ip_prefix_enc'] = le.fit_transform(df['ip_prefix'])
df['prev_ip_prefix_enc'] = le.fit_transform(df['prev_ip_prefix'])
df['prev2_ip_prefix_enc'] = le.fit_transform(df['prev2_ip_prefix'])

X = df[['prev2_ip_prefix_enc', 'prev_ip_prefix_enc', 'hour', 'weekday', 'amount', 'cluster']]
y = df['ip_prefix_enc']

# Train XGBoost
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X, y)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [24]:
def evaluate_predictions(df, model, le):
    X_test = df[['prev2_ip_prefix_enc', 'prev_ip_prefix_enc', 'hour', 'weekday', 'amount', 'cluster']]
    y_true = df['ip_prefix_enc']
    y_pred = model.predict(X_test)

    acc = (y_true == y_pred).mean()
    print(f'XGBoost Hybrid Model Accuracy: {acc:.4f}')

evaluate_predictions(df, model, le)


XGBoost Hybrid Model Accuracy: 0.9985


In [26]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from datetime import datetime

# Load data
df = pd.read_csv('/content/user_transactions_july2025.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%d-%m-%Y %H:%M")
df = df.sort_values(by=['user_id', 'timestamp'])

# Extract IP prefix and time features
df['ip_prefix'] = df['ip_address'].apply(lambda x: '.'.join(x.split('.')[:3]))
df['hour'] = df['timestamp'].dt.hour
df['weekday'] = df['timestamp'].dt.weekday

# Optional time bucket (not used here)
def time_bucket(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

df['time_bucket'] = df['hour'].apply(time_bucket)

# First-order transitions
transitions = defaultdict(lambda: defaultdict(int))
for user_id, user_df in df.groupby('user_id'):
    ip_seq = user_df['ip_prefix'].tolist()
    for i in range(1, len(ip_seq)):
        transitions[ip_seq[i-1]][ip_seq[i]] += 1

probs = {
    from_ip: {to_ip: c / sum(to_dict.values()) for to_ip, c in to_dict.items()}
    for from_ip, to_dict in transitions.items()
}

# Second-order transitions
second_order_transitions = defaultdict(lambda: defaultdict(int))
for user_id, user_df in df.groupby('user_id'):
    ip_seq = user_df['ip_prefix'].tolist()
    for i in range(2, len(ip_seq)):
        second_order_transitions[(ip_seq[i-2], ip_seq[i-1])][ip_seq[i]] += 1

second_order_probs = {
    from_pair: {to_ip: c / sum(to_dict.values()) for to_ip, c in to_dict.items()}
    for from_pair, to_dict in second_order_transitions.items()
}

# User-level clustering
user_stats = df.groupby('user_id').agg({
    'ip_prefix': pd.Series.nunique,
    'amount': 'std',
    'ip_address': 'count'
}).fillna(0)

user_stats.columns = ['unique_ips', 'amount_std', 'num_txns']
kmeans = KMeans(n_clusters=3, random_state=42)
user_stats['cluster'] = kmeans.fit_predict(user_stats)
df = df.merge(user_stats['cluster'], left_on='user_id', right_index=True)

# Add shifted features
df['prev_ip_prefix'] = df.groupby('user_id')['ip_prefix'].shift(1)
df['prev2_ip_prefix'] = df.groupby('user_id')['ip_prefix'].shift(2)
df.dropna(inplace=True)

# Fit LabelEncoder on all IP prefixes
all_prefixes = pd.concat([
    df['ip_prefix'], df['prev_ip_prefix'], df['prev2_ip_prefix']
]).dropna().unique()

le = LabelEncoder()
le.fit(all_prefixes)

# Encode
df['ip_prefix_enc'] = le.transform(df['ip_prefix'])
df['prev_ip_prefix_enc'] = le.transform(df['prev_ip_prefix'])
df['prev2_ip_prefix_enc'] = le.transform(df['prev2_ip_prefix'])

# Add Markov transition probabilities
def get_first_order_prob(row):
    return probs.get(row['prev_ip_prefix'], {}).get(row['ip_prefix'], 0)

def get_second_order_prob(row):
    return second_order_probs.get((row['prev2_ip_prefix'], row['prev_ip_prefix']), {}).get(row['ip_prefix'], 0)

df['first_order_prob'] = df.apply(get_first_order_prob, axis=1)
df['second_order_prob'] = df.apply(get_second_order_prob, axis=1)

# Train XGBoost
X = df[['prev2_ip_prefix_enc', 'prev_ip_prefix_enc', 'hour', 'weekday', 'amount', 'cluster',
        'first_order_prob', 'second_order_prob']]
y = df['ip_prefix_enc']

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X, y)

# ---------- Predict next IP prefix per user ----------
# Get last 2 transactions per user for prediction
latest_txns = df.sort_values(by='timestamp').groupby('user_id').tail(2)
latest = latest_txns.groupby('user_id').agg({
    'prev2_ip_prefix': 'last',
    'prev_ip_prefix': 'last',
    'hour': 'last',
    'weekday': 'last',
    'amount': 'last',
    'cluster': 'last'
}).reset_index()

# Drop rows where any of the required values is NaN
latest.dropna(inplace=True)

# Encode
latest['prev_ip_prefix_enc'] = le.transform(latest['prev_ip_prefix'])
latest['prev2_ip_prefix_enc'] = le.transform(latest['prev2_ip_prefix'])

# Markov probabilities (we don't have true current IP, so we'll predict based on most probable next IP)
def predict_first_order(row):
    return probs.get(row['prev_ip_prefix'], {})

def predict_second_order(row):
    return second_order_probs.get((row['prev2_ip_prefix'], row['prev_ip_prefix']), {})

latest['first_order_prob'] = 0  # Placeholder
latest['second_order_prob'] = 0  # Placeholder

# Construct X for prediction
X_pred = latest[['prev2_ip_prefix_enc', 'prev_ip_prefix_enc', 'hour', 'weekday',
                 'amount', 'cluster', 'first_order_prob', 'second_order_prob']]

# Predict encoded labels
y_pred_enc = model.predict(X_pred)

# Decode
latest['predicted_ip_prefix'] = le.inverse_transform(y_pred_enc)

# Output
final_preds = latest[['user_id', 'predicted_ip_prefix']]
print(final_preds)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


     user_id predicted_ip_prefix
0   user_001            64.1.152
1   user_002          199.245.22
2   user_003           42.83.205
3   user_004           51.165.50
4   user_005          176.133.13
..       ...                 ...
95  user_096          208.154.87
96  user_097           34.215.61
97  user_098           34.215.61
98  user_099           34.215.61
99  user_100           67.65.112

[100 rows x 2 columns]


In [27]:
# Load your actual CSV
ip_city_map = pd.read_csv('/content/indian_cities_with_ip.csv')   # adjust path if needed

# Extract IP prefix from full IP_Address
ip_city_map['ip_prefix'] = ip_city_map['IP_Address'].apply(lambda x: '.'.join(str(x).split('.')[:3]))

# Clean predicted IPs
final_preds['predicted_ip_prefix'] = final_preds['predicted_ip_prefix'].astype(str).str.strip()

# Merge on IP prefix
final_preds_with_city = final_preds.merge(ip_city_map[['City', 'ip_prefix']],
                                          left_on='predicted_ip_prefix',
                                          right_on='ip_prefix',
                                          how='left')

# Final output
final_output = final_preds_with_city[['user_id', 'predicted_ip_prefix', 'City']].rename(columns={'City': 'predicted_city'})
print(final_output)


     user_id predicted_ip_prefix      predicted_city
0   user_001            64.1.152            Bhusawal
1   user_002          199.245.22             Silchar
2   user_003           42.83.205  Thiruvananthapuram
3   user_004           51.165.50            Haridwar
4   user_005          176.133.13            Ambattur
..       ...                 ...                 ...
95  user_096          208.154.87              Indore
96  user_097           34.215.61               Ajmer
97  user_098           34.215.61               Ajmer
98  user_099           34.215.61               Ajmer
99  user_100           67.65.112           Kamarhati

[100 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_preds['predicted_ip_prefix'] = final_preds['predicted_ip_prefix'].astype(str).str.strip()
