In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,  RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate, Flatten

In [2]:
# Load the data
df = pd.read_excel(r"E:\CDAC 2024\Cdac Project\Typo_dataset_new.xlsx")
df_cp = df.copy()


In [3]:
# Function to extract IP features
def extract_ip_features(ip_list):
    if pd.isna(ip_list) or ip_list == ['!ServFail']:
        return [0, 0, 0, 0, 0, 0]
    else:
        ip_str = ip_list[0]
        if '.' in ip_str:
            octets = list(map(int, ip_str.split('.')))
            return octets + [1, 0]  # [octets..., is_ipv4, is_ipv6]
        elif ':' in ip_str:
            return [0, 0, 0, 0, 0, 1]  # [octets..., is_ipv4, is_ipv6]

# Apply the function to the 'typoip' column
df_cp['typoip'] = df_cp['typoip'].apply(lambda x: eval(x) if isinstance(x, str) else x)
df_cp[['ip1', 'ip2', 'ip3', 'ip4', 'is_ipv4', 'is_ipv6']] = pd.DataFrame(df_cp['typoip'].apply(extract_ip_features).tolist(), index=df_cp.index)


In [4]:
# Function to extract lexical features
def extract_lexical_features(domain):
    features = {
        'length': len(domain),
        'num_digits': sum(c.isdigit() for c in domain),
        'num_hyphens': domain.count('-'),
        'num_subdomains': domain.count('.'),
        'num_vowels': sum(c in 'aeiou' for c in domain),
        'num_consonants': sum(c in 'bcdfghjklmnpqrstvwxyz' for c in domain),
        'digit_to_length_ratio': sum(c.isdigit() for c in domain) / len(domain),
        'hyphen_to_length_ratio': domain.count('-') / len(domain)
    }
    return pd.Series(features)

# Apply the function to the 'typodomain' column
lexical_features = df_cp['typodomain'].apply(extract_lexical_features)
df_cp = pd.concat([df_cp, lexical_features], axis=1)

In [5]:
# Encode 'typotype' labels
label_encoder = LabelEncoder()
df_cp['typotype'] = label_encoder.fit_transform(df_cp['typotype'])


In [6]:
# Tokenize the domain names for LSTM input
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_cp['typodomain'])
sequences = tokenizer.texts_to_sequences(df_cp['typodomain'])
max_sequence_length = max(len(seq) for seq in sequences)
X_sequences = pad_sequences(sequences, maxlen=max_sequence_length)


In [7]:
# Define features (X) and target (y)
X_other_features = df_cp.drop(columns=['typodomain', 'typotype', 'typoip', 'typoip6'])
y = df_cp['typotype']

In [8]:
# Standardize the other features
scaler = StandardScaler()
X_other_features_scaled = scaler.fit_transform(X_other_features)

In [9]:
# Split data into training and testing sets
X_seq_train, X_seq_test, X_other_train, X_other_test, y_train, y_test = train_test_split(
    X_sequences, X_other_features_scaled, y, test_size=0.2, random_state=42
)

In [10]:
# Define the LSTM model
input_seq = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_sequence_length)(input_seq)
lstm_layer = LSTM(128)(embedding_layer)
lstm_output = Flatten()(lstm_layer)



In [11]:
# Define the dense layers for the other features
input_other = Input(shape=(X_other_features_scaled.shape[1],))
dense_layer = Dense(64, activation='relu')(input_other)

In [12]:
# Concatenate the outputs of LSTM and dense layers
concat_layer = concatenate([lstm_output, dense_layer])
dense_layer_2 = Dense(64, activation='relu')(concat_layer)
output_layer = Dense(len(np.unique(y)), activation='softmax')(dense_layer_2)


In [13]:
# Combine LSTM and dense model
combined_model = Model(inputs=[input_seq, input_other], outputs=output_layer)
combined_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [14]:
# Train the model
combined_model.fit([X_seq_train, X_other_train], y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 36ms/step - accuracy: 0.2171 - loss: 2.2245 - val_accuracy: 0.3566 - val_loss: 1.8670
Epoch 2/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 30ms/step - accuracy: 0.4429 - loss: 1.6232 - val_accuracy: 0.5471 - val_loss: 1.3824
Epoch 3/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 59ms/step - accuracy: 0.7918 - loss: 0.6623 - val_accuracy: 0.5756 - val_loss: 1.4745
Epoch 4/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 43ms/step - accuracy: 0.9217 - loss: 0.2641 - val_accuracy: 0.5438 - val_loss: 1.9501
Epoch 5/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 51ms/step - accuracy: 0.9664 - loss: 0.1254 - val_accuracy: 0.5211 - val_loss: 2.4054
Epoch 6/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.9798 - loss: 0.0757 - val_accuracy: 0.5322 - val_loss: 2.3938
Epoch 7/10
[1m30

<keras.src.callbacks.history.History at 0x18e591ec3d0>

In [15]:
# Extract the LSTM features
lstm_features_model = Model(inputs=combined_model.input[0], outputs=combined_model.get_layer('flatten').output)
X_train_lstm_features = lstm_features_model.predict(X_seq_train)
X_test_lstm_features = lstm_features_model.predict(X_seq_test)

[1m379/379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


In [16]:
# Combine LSTM features with other features
X_train_combined = np.concatenate([X_train_lstm_features, X_other_train], axis=1)
X_test_combined = np.concatenate([X_test_lstm_features, X_other_test], axis=1)


In [17]:
# Initialize the XGBClassifier
xgb_clf = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    use_label_encoder=False
)

In [18]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

In [19]:
# Initialize RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=100,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [20]:
# Fit RandomizedSearchCV
random_search.fit(X_train_combined, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


Parameters: { "use_label_encoder" } are not used.



In [21]:
# Get the best model
best_model = random_search.best_estimator_

In [22]:
# Make predictions with the best model
preds = best_model.predict(X_test_combined)


In [23]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, preds)
print(f'Best parameters: {random_search.best_params_}')
print(f'Accuracy: {accuracy}')

Best parameters: {'subsample': 0.9, 'n_estimators': 150, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
Accuracy: 0.6271074380165289
