In [3]:
import pandas as pd


In [7]:
# Read the CSV at ../data/synthetic_ipdrs.csv into the working DataFrame.
ipdr_csv_path = '../data/synthetic_ipdrs.csv'
df = pd.read_csv(ipdr_csv_path)

In [8]:
# Print the frame's schema summary: column names, non-null counts, dtypes
# and approximate memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   SrcIP       2000 non-null   object
 1   DstIP       2000 non-null   object
 2   SrcPort     2000 non-null   int64 
 3   DstPort     2000 non-null   int64 
 4   Timestamp   2000 non-null   object
 5   Protocol    2000 non-null   object
 6   DataVolume  2000 non-null   int64 
 7   FraudType   1304 non-null   object
dtypes: int64(3), object(5)
memory usage: 125.1+ KB


In [9]:
# Parse the string-typed 'Timestamp' column into datetime values.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

# Re-print the schema so the new dtype of 'Timestamp' can be checked.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   SrcIP       2000 non-null   object        
 1   DstIP       2000 non-null   object        
 2   SrcPort     2000 non-null   int64         
 3   DstPort     2000 non-null   int64         
 4   Timestamp   2000 non-null   datetime64[ns]
 5   Protocol    2000 non-null   object        
 6   DataVolume  2000 non-null   int64         
 7   FraudType   1304 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 125.1+ KB


In [10]:
# Preview the first rows of the frame (head() defaults to five rows).
df.head()


Unnamed: 0,SrcIP,DstIP,SrcPort,DstPort,Timestamp,Protocol,DataVolume,FraudType
0,192.168.63.192,10.0.201.1,5385,443,2023-11-12 08:18:22,UDP,8305754,DDoS
1,192.168.141.215,10.0.101.136,4365,53,2023-09-01 22:26:05,UDP,9987103,
2,192.168.243.223,10.0.198.160,8851,443,2023-03-12 08:01:06,TCP,6062294,
3,192.168.21.35,10.0.142.55,36969,22,2023-04-15 05:48:09,TCP,1608877,Espionage
4,192.168.216.104,10.0.20.30,1669,80,2023-05-12 02:42:53,UDP,4767985,DDoS


In [11]:
# Normalise the label column first: rows with no FraudType are treated as
# ordinary traffic and labelled 'Normal'.
df['FraudType'] = df['FraudType'].fillna('Normal')

# Binary target: 1 for any labelled fraud type, 0 for 'Normal'.
# The flag is derived from the filled labels instead of from notna(), because
# once the nulls have been filled notna() is True everywhere — re-running this
# cell would otherwise silently mark every row as fraud.
df['is_fraud'] = df['FraudType'].ne('Normal').astype(int)

# Verify the new columns and their distribution
print("Fraud vs. Normal count:")
print(df['is_fraud'].value_counts())

print("\nFraud types breakdown:")
print(df['FraudType'].value_counts())

Fraud vs. Normal count:
is_fraud
1    1304
0     696
Name: count, dtype: int64

Fraud types breakdown:
FraudType
Normal       696
DDoS         667
Espionage    637
Name: count, dtype: int64


In [12]:
# Add the hour component of each record's timestamp as a new feature column.
# (On a larger dataset the weekday could be added the same way through
# the .dt.dayofweek accessor.)
df = df.assign(hour_of_day=lambda frame: frame['Timestamp'].dt.hour)

# Spot-check the derived column next to its source timestamps.
timestamp_preview = df.loc[:, ['Timestamp', 'hour_of_day']].head()
print(timestamp_preview)

            Timestamp  hour_of_day
0 2023-11-12 08:18:22            8
1 2023-09-01 22:26:05           22
2 2023-03-12 08:01:06            8
3 2023-04-15 05:48:09            5
4 2023-05-12 02:42:53            2


In [13]:
# Per-source-IP aggregates, one row per distinct SrcIP:
#   connection_count - number of records (non-null DstIP values) from that IP
#   avg_data_volume  - mean DataVolume over that IP's records
#   port_diversity   - number of distinct destination ports that IP contacts
ip_feature_cols = ['connection_count', 'avg_data_volume', 'port_diversity']
ip_features = (
    df.groupby('SrcIP')
    .agg(
        connection_count=('DstIP', 'count'),
        avg_data_volume=('DataVolume', 'mean'),
        port_diversity=('DstPort', 'nunique'),
    )
    .reset_index()
)

# Attach the aggregates to every record. Copies left behind by a previous run
# of this cell are dropped first; without that, a second merge would produce
# *_x / *_y suffixed duplicates and the column selection below would raise a
# KeyError. validate= makes pandas check that ip_features has one row per SrcIP.
df = df.drop(columns=ip_feature_cols, errors='ignore').merge(
    ip_features, on='SrcIP', how='left', validate='many_to_one'
)

# NOTE(review): these aggregates are computed over the full dataset, i.e.
# before the train/test split performed later in the notebook. Consider
# deriving them from the training rows only to avoid leaking test information.

print(df[['SrcIP'] + ip_feature_cols].head())

             SrcIP  connection_count  avg_data_volume  port_diversity
0   192.168.63.192                 1        8305754.0               1
1  192.168.141.215                 1        9987103.0               1
2  192.168.243.223                 1        6062294.0               1
3    192.168.21.35                 1        1608877.0               1
4  192.168.216.104                 1        4767985.0               1


In [14]:
# One-hot encode 'Protocol' into proto_<value> indicator columns.
# get_dummies removes the source column, so the call is guarded on its
# presence: running this cell a second time would otherwise raise a KeyError
# because 'Protocol' no longer exists in the frame.
if 'Protocol' in df.columns:
    df = pd.get_dummies(df, columns=['Protocol'], prefix='proto')

print(df.head())

             SrcIP         DstIP  SrcPort  DstPort           Timestamp  \
0   192.168.63.192    10.0.201.1     5385      443 2023-11-12 08:18:22   
1  192.168.141.215  10.0.101.136     4365       53 2023-09-01 22:26:05   
2  192.168.243.223  10.0.198.160     8851      443 2023-03-12 08:01:06   
3    192.168.21.35   10.0.142.55    36969       22 2023-04-15 05:48:09   
4  192.168.216.104    10.0.20.30     1669       80 2023-05-12 02:42:53   

   DataVolume  FraudType  is_fraud  hour_of_day  connection_count  \
0     8305754       DDoS         1            8                 1   
1     9987103     Normal         0           22                 1   
2     6062294     Normal         0            8                 1   
3     1608877  Espionage         1            5                 1   
4     4767985       DDoS         1            2                 1   

   avg_data_volume  port_diversity  proto_TCP  proto_UDP  
0        8305754.0               1      False       True  
1        9987103.0    

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 1. Define your features (X) and target (y)
# Drop the IP identifiers, the raw timestamp and both label columns so that
# only model inputs remain in X.
X = df.drop(columns=['SrcIP', 'DstIP', 'Timestamp', 'FraudType', 'is_fraud'])
y = df['is_fraud']

# 2. Split data into training and testing sets (70/30, stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 3. Scale the numerical features
# The scaler is fitted ONLY on the real training rows (no test rows, no
# synthetic rows). Scaling is done BEFORE SMOTE because SMOTE chooses
# neighbours by distance: on raw features the large-magnitude columns
# (e.g. DataVolume) would dominate the neighbour search.
scaler = StandardScaler()
X_train_std = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = scaler.transform(X_test)  # Use the same scaler to transform the test data

# 4. Apply SMOTE to balance the training data
# Note: The 'Normal' class is the minority, so we will up-sample it.
# Only the training split is resampled; the test split stays untouched.
print("Before SMOTE:", y_train.value_counts())
smote = SMOTE(random_state=42)
X_train_std_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)
print("After SMOTE:", y_train_resampled.value_counts())

# Names kept for any later cell that relies on them:
#   X_train_scaled    - balanced, standardised training matrix (numpy array)
#   X_train_resampled - the same balanced rows mapped back to original units
X_train_scaled = X_train_std_resampled.to_numpy()
X_train_resampled = pd.DataFrame(
    scaler.inverse_transform(X_train_scaled), columns=X_train.columns
)

Before SMOTE: is_fraud
1    913
0    487
Name: count, dtype: int64
After SMOTE: is_fraud
1    913
0    913
Name: count, dtype: int64
