In [1]:
import pandas as pd
import numpy as np
import geoip2.database
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
merged_df = pd.read_csv('/Users/sa12/Documents/Repositories/The-CyberChase/DATA/feb5-12_merged.csv')


In [11]:
reader = geoip2.database.Reader('/Users/sa12/Documents/Repositories/The-CyberChase/DATA/GeoLite2-City.mmdb')
# Build location info
locations = []

for ip in merged_df['Src IP']:
    try:
        response = reader.city(ip)
        country = response.country.name or "Unknown Country"
        city = response.city.name or "Unknown City"
        location_str = f"{country}"
    except Exception as e:
        location_str = f"Error: {e}"

    locations.append(location_str)

# Add to DataFrame
merged_df['Location'] = locations

In [18]:
count = merged_df['Location'].value_counts().to_frame()
count.head(20)

Unnamed: 0_level_0,count
Location,Unnamed: 1_level_1
Iran,533014
United States,331994
Hong Kong,190927
China,170692
Germany,87073
Japan,76265
Italy,47170
France,38434
Singapore,35055
United Kingdom,32750


In [19]:
country_counts = merged_df['Location'].value_counts(normalize=True).cumsum()

# Get the shortest list covering 90% of the data
top_countries = country_counts[country_counts <= 0.90].index.tolist()

# Include the next country if we're under 90% exactly
if country_counts.iloc[len(top_countries)] < 0.90:
    top_countries.append(country_counts.index[len(top_countries)])

print(top_countries)

['Iran', 'United States', 'Hong Kong', 'China', 'Germany', 'Japan', 'Italy', 'France', 'Singapore', 'United Kingdom']


In [None]:
data = pd.get_dummies(data, dtype='float')
data

In [None]:
# Features and target
X = merged_df.drop(columns='target')  # or specify feature columns explicitly
y = merged_df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
model.fit(X_train_scaled, y_train)

In [None]:
y_pred = model.predict(X_test_scaled)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))