In [217]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder

In [218]:
df = pd.read_csv('train.csv')


In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140404 entries, 0 to 140403
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          140404 non-null  int64  
 1   FILENAME                    82872 non-null   object 
 2   URL                         96917 non-null   object 
 3   URLLength                   79765 non-null   float64
 4   Domain                      70207 non-null   object 
 5   DomainLength                94085 non-null   float64
 6   IsDomainIP                  98274 non-null   float64
 7   TLD                         95005 non-null   object 
 8   CharContinuationRate        92362 non-null   float64
 9   TLDLegitimateProb           87531 non-null   float64
 10  URLCharProb                 88333 non-null   float64
 11  TLDLength                   92673 non-null   float64
 12  NoOfSubDomain               96344 non-null   float64
 13  HasObfuscation

In [220]:
from sklearn.model_selection import train_test_split
train_set, val_set = train_test_split(df, stratify=df['label'], test_size=0.2, random_state=42)

In [221]:
from sklearn.impute import SimpleImputer

att = ["IsDomainIP", "HasObfuscation", "IsHTTPS", "HasTitle", "HasFavicon", "Robots", "IsResponsive", 
       "HasDescription", "HasExternalFormSubmit", "HasSocialNet", "HasSubmitButton", "HasHiddenFields", 
       "HasPasswordField", "Pay", "Bank", "Crypto", "HasCopyrightInfo"]

# Inisialisasi imputer untuk kategori dan numerik
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_imputer = SimpleImputer(strategy='mean')

# Mendapatkan daftar kolom numerik dan kategorikal
numeric_cols = train_set.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = train_set.select_dtypes(include=['object']).columns.tolist()

# Mendapatkan kolom numerik yang tidak ada dalam "att"
numeric_cols_beneran = [col for col in numeric_cols if col not in att]

# Imputasi untuk kolom-kolom dalam "att" (kategori)
train_set[att] = categorical_imputer.fit_transform(train_set[att])
val_set[att] = categorical_imputer.transform(val_set[att])

# Imputasi untuk kolom numerik yang tidak ada dalam "att"
train_set[numeric_cols_beneran] = numeric_imputer.fit_transform(train_set[numeric_cols_beneran])
val_set[numeric_cols_beneran] = numeric_imputer.transform(val_set[numeric_cols_beneran])

# Imputasi untuk kolom kategori umum
train_set[categorical_cols] = categorical_imputer.fit_transform(train_set[categorical_cols])
val_set[categorical_cols] = categorical_imputer.transform(val_set[categorical_cols])


In [222]:
train_set = train_set.drop(columns=train_set.select_dtypes(include='object').columns)
val_set = val_set.drop(columns=val_set.select_dtypes(include='object').columns)

In [223]:
def calculate_total_resources(X):
    total = X['NoOfImage'] + X['NoOfCSS'] + X['NoOfJS'] + X['NoOfSelfRef'] + X['NoOfEmptyRef'] + X['NoOfExternalRef']       
    X['resources'] = total
    return X
def feature_securing(data):
    # Pastikan kolom 'IsHTTPS' dan 'Robots' berupa integer
    data['IsHTTPS'] = data['IsHTTPS'].astype(int)
    data['Robots'] = data['Robots'].astype(int)

    # Tentukan risiko berdasarkan kondisi
    conditions = [
        (data['IsHTTPS'] == 1) & (data['Robots'] == 1),  
        (data['IsHTTPS'] == 1) & (data['Robots'] == 0), 
        (data['IsHTTPS'] == 0) & (data['Robots'] == 1),  
        (data['IsHTTPS'] == 0) & (data['Robots'] == 0), 
    ]
    risks = [3, 2, 1, 0]

    # Tambahkan kolom 'isHighRisk' berdasarkan kondisi
    data['isHighRisk'] = np.select(conditions, risks)

    return data
    
 
    
    return data
def create_phishing_features(data):
    Phishing_Score = (
        data['HasExternalFormSubmit'] + 
        data['HasPasswordField'] + 
        data['Bank'] + 
        data['Crypto']
    )
    Non_Phishing_Score = (
        data['IsResponsive'] + 
        data['HasDescription'] + 
        data['HasSocialNet'] + 
        data['HasCopyrightInfo']
    )
    data['Phising'] = Phishing_Score

    return data
import pandas as pd

def calculate_weighted_score(data: pd.DataFrame, domain_weight: float = 0.6, url_weight: float = 0.4) -> pd.DataFrame:
    data['WeightedScore'] = (
        data['DomainTitleMatchScore'] + 
        data['URLTitleMatchScore']
    )
    
    return data

def create_NoNphishing_features(data):
    Non_Phishing_Score = (
        data['IsResponsive'] + 
        data['HasDescription'] + 
        data['HasSocialNet'] + 
        data['HasCopyrightInfo']
    )
    data['Non_Phising'] = Non_Phishing_Score

    return data
def apply_feature_engineering(X: pd.DataFrame) -> pd.DataFrame:
    X = calculate_total_resources(X)
    X = feature_securing(X)
    X = create_NoNphishing_features(X)
    return X

In [224]:
train_set = apply_feature_engineering(train_set)
val_set = apply_feature_engineering(val_set)

In [225]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

def auto_transform_features(X_train: pd.DataFrame, X_test: pd.DataFrame, skew_threshold: float = 1.0, target_column: str = 'label') -> (pd.DataFrame, pd.DataFrame):
    transformer_dict = {}  # Store transformers for each column
    
    for col in X_train.columns:
        if col != target_column and np.issubdtype(X_train[col].dtype, np.number):  # Apply only to numeric columns
            skewness = X_train[col].skew()
            if abs(skewness) > skew_threshold:
                if skewness > 0:  
                    X_train[col] = np.log1p(X_train[col]) 
                    X_test[col] = np.log1p(X_test[col]) 
                else: 
                    transformer = PowerTransformer(method='yeo-johnson')
                    X_train[col] = transformer.fit_transform(X_train[col].values.reshape(-1, 1)).flatten()
                    X_test[col] = transformer.transform(X_test[col].values.reshape(-1, 1)).flatten()
                    transformer_dict[col] = transformer  
    
    return X_train, X_test

train_set, val_set = auto_transform_features(train_set.copy(), val_set.copy())


In [226]:
# import numpy as np
# import pandas as pd
# from sklearn.preprocessing import PowerTransformer

# def auto_transform_features(X: pd.DataFrame, skew_threshold: float = 1.0, target_column: str = 'label') -> pd.DataFrame:
#     transformer = PowerTransformer(method='yeo-johnson')  
#     for col in X.columns:
#         if col != target_column and np.issubdtype(X[col].dtype, np.number): 
#             skewness = X[col].skew()
#             if abs(skewness) > skew_threshold:
#                 if skewness > 0:  
#                     X[col] = np.log1p(X[col]) 
#                 else: 
#                     reshaped_data = X[col].values.reshape(-1, 1)  
#                     X[col] = transformer.fit_transform(reshaped_data).flatten()  

#     return X
# train_set = auto_transform_features(train_set.copy())
# val_set = auto_transform_features_test(val_set.copy())

In [227]:
# import numpy as np

# def feature_log_transformation(X: pd.DataFrame) -> pd.DataFrame:
#     columns_to_transform = [
#         'IsDomainIP', 'URLCharProb', 'NoOfLettersInURL', 'NoOfDegitsInURL',
#         'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL',
#         'LargestLineLength', 'NoOfImage', 'NoOfCSS', 'NoOfJS',
#         'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef',
#         'URLLength', 'NoOfiFrame', 'NoOfObfuscatedChar',
#         'HasObfuscation', 'ObfuscationRatio'
#     ]
#     X[columns_to_transform] = X[columns_to_transform].clip(lower=0.0001)
#     for col in columns_to_transform:
#         X[col] = np.log1p(X[col])
#     return X

# train_set = feature_log_transformation(train_set.copy())
# val_set = feature_log_transformation(val_set.copy())


In [228]:
# #Ini yang menyesuaikan korelasi jadi kayak ada dasarnta
# train_set=train_set.drop(columns=['id', 'IsDomainIP', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfAmpersandInURL', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'HasExternalFormSubmit', 'HasPasswordField', 'Bank', 'Crypto','TLDLegitimateProb','DomainLength','NoOfEqualsInURL','NoOfQMarkInURL'])
# val_set=val_set.drop(columns=['id', 'IsDomainIP', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfAmpersandInURL', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'HasExternalFormSubmit', 'HasPasswordField', 'Bank', 'Crypto','TLDLegitimateProb','DomainLength','NoOfEqualsInURL','NoOfQMarkInURL'])


In [229]:
# # ini kayak mcuman liat2 persebaran  doang tapi sebenarnya kayak ga terlqalu kuat dasatnya

# train_set=train_set.drop(columns=['id','DomainLength','NoOfOtherSpecialCharsInURL','IsHTTPS','HasSubmitButton','HasHiddenFields','Pay'])
# val_set=val_set.drop(columns=['id','DomainLength','NoOfOtherSpecialCharsInURL','IsHTTPS','HasSubmitButton','HasHiddenFields','Pay'])


In [230]:
train_set=train_set.drop(columns=['id'])
val_set=val_set.drop(columns=['id'])


In [231]:
from sklearn.preprocessing import RobustScaler

# Fungsi untuk memeriksa persentase outlier dengan metode IQR
def check_outliers_iqr(df, col):
    col_Q1 = df[col].quantile(0.25)
    col_Q3 = df[col].quantile(0.75)
    col_IQR = col_Q3 - col_Q1
    col_RLB = col_Q1 - 1.5 * col_IQR
    col_RUB = col_Q3 + 1.5 * col_IQR
    outliers = len(df.loc[(df[col] > col_RUB) | (df[col] < col_RLB)])
    return outliers / len(df) * 100
attributes = []
for col in train_set.columns:
    if col != 'label': 
        outlier_percent = check_outliers_iqr(train_set, col)
        if outlier_percent > 5:
            attributes.append(col)
scaler = RobustScaler()
train_set[attributes] = scaler.fit_transform(train_set[attributes])
val_set[attributes] = scaler.transform(val_set[attributes])


In [232]:
# from sklearn.preprocessing import RobustScaler

# attributes = [
#     "URLLength", 
#     "TLDLength", 
#     "NoOfSubDomain", 
#     "NoOfObfuscatedChar", 
#     "LineOfCode", 
#     "DomainTitleMatchScore"
# ]


# # Buat scaler
# scaler = RobustScaler()
# # 
# # Fit dan transform pada data latih (training data)
# train_set[attributes] = scaler.fit_transform(train_set[attributes])

# # Transform hanya pada data uji (test data) menggunakan scaler yang sama
# val_set[attributes] = scaler.transform(val_set[attributes])


In [233]:
from sklearn.model_selection import train_test_split
X = train_set.drop(columns=['label'])
y = train_set['label']
X_test = val_set.drop(columns=['label'])
y_test = val_set['label']

In [191]:
X.head()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,...,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,resources,isHighRisk,Non_Phising
135080,-1.405199,0.0,0.0,-0.496327,0.522907,0.0,0.0,0.0,0.0,0.001731,...,1.084222,0.0,0.0,-2.086373,0.0,0.693147,0.0,0.150113,2,1.122275
97659,1.771505,0.0,0.0,0.916481,0.277364,-0.422143,0.0,0.0,0.0,0.0,...,-1.917982,0.0,-1.315018,0.0,0.016258,0.0,-1.470155,-0.218036,2,-0.186199
11488,0.0,0.0,0.0,0.916481,0.522907,1.488703,0.0,0.0,0.000368,0.001731,...,1.084222,0.0,0.0,1.94354,0.437933,1.562316,0.833565,0.908615,0,-0.557918
67815,0.0,1.870184,0.000433,-2.000532,0.079963,-0.4579,1.0,5.234087,0.000368,0.001731,...,-1.917982,-4.351337,-4.095825,-7.657875,-6.125708,1.562316,-5.214652,-6.235463,0,-2.011867
97148,-1.405199,-0.675408,0.0,-0.496327,0.522907,2.689586,1.0,0.0,0.000368,0.001731,...,1.084222,0.0,0.0,0.0,1.053563,1.562316,-1.113571,0.729245,0,1.021951


In [192]:
import pandas as pd
import numpy as np
# ini ngecek doang korelasi numerik dengan label trs yg hrsnyta di drop yg mana, tapi ketika di  dropmalah makin kecil...
def analyze_correlations(df, label_col, threshold=0.1):
    """
    Analyzes correlation of numerical features with the target label in a dataframe.
    
    Parameters:
        df (pd.DataFrame): Dataframe containing features and the label.
        label_col (str): Name of the column to be used as the label.
        threshold (float): Minimum absolute correlation value to retain a feature.
    
    Returns:
        pd.DataFrame: Summary of Pearson and Spearman correlations.
        list: Features to drop based on the threshold.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_features = numeric_cols.drop(label_col)
    pearson_corr = df[numeric_features].corrwith(df[label_col], method='pearson')
    spearman_corr = df[numeric_features].corrwith(df[label_col], method='spearman')
    correlation_summary = pd.DataFrame({
        'Feature': numeric_features,
        'Pearson': pearson_corr.values,
        'Spearman': spearman_corr.values
    }).set_index('Feature')

    correlation_summary['Drop'] = (correlation_summary['Pearson'].abs() < threshold) & \
                                   (correlation_summary['Spearman'].abs() < threshold)
    
    features_to_drop = correlation_summary[correlation_summary['Drop']].index.tolist()

    return correlation_summary, features_to_drop

# Example usage
# Assume `df` is a pandas DataFrame and `label` is the name of the label column
correlation_summary, features_to_drop = analyze_correlations(train_set, label_col='label', threshold=0.1)

print("Correlation Summary:")
print(correlation_summary)
print("\nFeatures to drop:")
print(features_to_drop)

Correlation Summary:
                             Pearson  Spearman   Drop
Feature                                              
URLLength                  -0.292266 -0.136526  False
DomainLength               -0.129887 -0.081645  False
IsDomainIP                 -0.061085 -0.005752   True
CharContinuationRate        0.216259  0.202332  False
TLDLegitimateProb           0.043519  0.060050   True
URLCharProb                 0.219950  0.175163  False
TLDLength                  -0.026208 -0.007737   True
NoOfSubDomain               0.058692 -0.010255   True
HasObfuscation             -0.049024  0.001917   True
NoOfObfuscatedChar         -0.039572 -0.000925   True
ObfuscationRatio           -0.041356  0.003902   True
NoOfLettersInURL           -0.235213 -0.148334  False
LetterRatioInURL           -0.169583 -0.115800  False
NoOfDegitsInURL            -0.399088 -0.194545  False
DegitRatioInURL            -0.389310 -0.207638  False
NoOfEqualsInURL            -0.147282 -0.020201  False
NoOfQMa

In [160]:
from sklearn.preprocessing import PowerTransformer


In [234]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
knn = KNeighborsClassifier(n_neighbors=3)
pca = PCA(n_components=21)
X_train_reduced = pca.fit_transform(X)
X_test_reduced = pca.transform(X_test)
knn.fit(X_train_reduced, y)
knn_pred = knn.predict(X_test_reduced)

In [235]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, cohen_kappa_score, balanced_accuracy_score, classification_report,roc_auc_score, roc_curve
print(f'Accuracy: {accuracy_score(y_test, knn_pred)}')
print(f'F1 Score: {f1_score(y_test, knn_pred, average="weighted")}')
print(f'Recall Score: {recall_score(y_test, knn_pred, average="weighted")}')
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, knn_pred)}")
print(f"ROC-AUC: {roc_auc_score(y_test, knn_pred)}")
print("KNeighbors Classifier")
print(classification_report(y_test, knn_pred))
print()

Accuracy: 0.9909903493465333
F1 Score: 0.9907517702694255
Recall Score: 0.9909903493465333
Balanced Accuracy: 0.942687012507782
ROC-AUC: 0.9426870125077819
KNeighbors Classifier
              precision    recall  f1-score   support

         0.0       0.99      0.89      0.94      2111
         1.0       0.99      1.00      1.00     25970

    accuracy                           0.99     28081
   macro avg       0.99      0.94      0.97     28081
weighted avg       0.99      0.99      0.99     28081




In [236]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, cohen_kappa_score, balanced_accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN,SMOTETomek
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
X = train_set.drop(columns=['label'])
y = train_set['label']
X_test = val_set.drop(columns=['label'])
y_test = val_set['label']
pca = PCA(n_components=21)
X_train_reduced = pca.fit_transform(X)
X_test_reduced = pca.transform(X_test)
smote_enn = SMOTETomek(random_state=42)
X_smote, y_smote = smote_enn.fit_resample(X_train_reduced, y)
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(X_smote, y_smote)
knn_pred = knn.predict(X_test_reduced)
print(f'Accuracy: {accuracy_score(y_test, knn_pred)}')
print(f'F1 Score: {f1_score(y_test, knn_pred, average="weighted")}')
print(f'Recall Score: {recall_score(y_test, knn_pred, average="weighted")}')
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, knn_pred)}")
print("KNeighbors Classifier")
print(classification_report(y_test, knn_pred))

Accuracy: 0.9907054592072931
F1 Score: 0.9907798525993772
Recall Score: 0.9907054592072931
Balanced Accuracy: 0.9747380235220211
KNeighbors Classifier
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      2111
         1.0       1.00      0.99      0.99     25970

    accuracy                           0.99     28081
   macro avg       0.96      0.97      0.97     28081
weighted avg       0.99      0.99      0.99     28081



In [254]:
# Elemen Frekuensi dari data dalam sebuah Atribut
for c in train_set.columns:
    print(f"Value counts for {c}:")
    print(train_set[c].value_counts())
    print("\n")

Value counts for URLLength:
URLLength
 0.000000     48518
-1.000000      5109
-0.610095      4990
-1.405199      4987
-0.234371      4711
              ...  
 19.438803        1
 20.026268        1
 21.700100        1
 27.004030        1
 20.078054        1
Name: count, Length: 199, dtype: int64


Value counts for IsDomainIP:
IsDomainIP
0.000000    78554
0.000433    33735
0.693147       34
Name: count, dtype: int64


Value counts for CharContinuationRate:
CharContinuationRate
-1.000000    55314
 0.000000    38358
 2.656568     1439
 3.533498      928
 1.836254      880
             ...  
 5.960744        1
 6.448945        1
 4.810417        1
 5.972518        1
 1.415539        1
Name: count, Length: 433, dtype: int64


Value counts for TLDLegitimateProb:
TLDLegitimateProb
0.277364    42296
0.522907    35246
0.079963     8048
0.028555     2952
0.038420     2045
            ...  
0.000053        1
0.000040        1
0.000004        1
0.000024        1
0.000042        1
Name: count, Leng

In [195]:
def check_outliers_iqr(df,col):
    col_Q1 = df[col].quantile(0.25)
    col_Q3 = df[col].quantile(0.75)
    col_IQR = col_Q3 - col_Q1
    col_RLB = col_Q1 - 1.5 * col_IQR
    col_RUB = col_Q3 + 1.5 * col_IQR
    outliers = len(df.loc[(df[col]>col_RUB) | (df[col]<col_RLB)])
    return outliers/len(df)*100
for i in train_set.columns:
    print(f'Outliers for {i}: {check_outliers_iqr(train_set,i)}%')
    print(f'Data skew for {i}: {train_set[i].skew()}')
    print("-"*50)
    print(f'Outliers for {i}: {check_outliers_iqr(val_set,i)}%')
    print(f'Data skew for {i}: {val_set[i].skew()}')
    print("-"*50)

Outliers for URLLength: 16.884342476607642%
Data skew for URLLength: 3.203197035097181
--------------------------------------------------
Outliers for URLLength: 17.057797086998328%
Data skew for URLLength: 3.7179684650120364
--------------------------------------------------
Outliers for DomainLength: 8.186212975080794%
Data skew for DomainLength: 0.09656407637068214
--------------------------------------------------
Outliers for DomainLength: 8.112246714860582%
Data skew for DomainLength: 0.0481379064403337
--------------------------------------------------
Outliers for IsDomainIP: 0.03026984678115791%
Data skew for IsDomainIP: 57.42846038610174
--------------------------------------------------
Outliers for IsDomainIP: 0.04985577436700973%
Data skew for IsDomainIP: 44.743886642286895
--------------------------------------------------
Outliers for CharContinuationRate: 0.0%
Data skew for CharContinuationRate: -0.5844332124382617
--------------------------------------------------
Outl

In [256]:
print(train_set.describe())

           URLLength     IsDomainIP  CharContinuationRate  TLDLegitimateProb  \
count  112323.000000  112323.000000         112323.000000      112323.000000   
mean       -0.233137       0.000340             -0.077854           0.277364   
std         1.872300       0.012057              1.356453           0.195896   
min        -6.682672       0.000000             -1.000000           0.000000   
25%        -1.000000       0.000000             -1.000000           0.079963   
50%         0.000000       0.000000              0.000000           0.277364   
75%         0.000000       0.000433              0.000000           0.522907   
max        49.571165       0.693147              7.810210           0.522907   

         URLCharProb      TLDLength  NoOfSubDomain  HasObfuscation  \
count  112323.000000  112323.000000  112323.000000   112323.000000   
mean       -0.002971      -0.094953       0.844135        0.000308   
std         1.914276       1.652984       1.941196        0.009699   

In [257]:
# Probabilitas untuk IsHTTPs terhadap label
prob_ishttps = train_set.groupby(['IsHTTPS', 'label']).size() / train_set.groupby(['label']).size()
print(prob_ishttps)

# Probabilitas untuk robots terhadap label
prob_robots = train_set.groupby(['Robots', 'label']).size() / train_set.groupby(['label']).size()
print(prob_robots)


KeyError: 'IsHTTPS'

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import entropy

# Memuat dataset (ganti 'your_dataset.csv' dengan file Anda)
data = pd.read_csv('your_dataset.csv')

# 1. Entropy of the URL (URLEntropy)
def calculate_entropy(text):
    if not isinstance(text, str) or len(text) == 0:
        return 0
    char_count = np.array([text.count(char) for char in set(text)])
    probabilities = char_count / len(text)
    return entropy(probabilities, base=2)

data['URLEntropy'] = data['URL'].apply(calculate_entropy)

# 2. Proportion of Numeric Characters in Domain (DomainDigitRatio)
def digit_ratio(domain):
    if not isinstance(domain, str) or len(domain) == 0:
        return 0
    return sum(char.isdigit() for char in domain) / len(domain)

data['DomainDigitRatio'] = data['Domain'].apply(digit_ratio)

# 3. Proportion of Special Characters in Domain (DomainSpecialCharRatio)
special_chars = set("-_.@")
def special_char_ratio(domain):
    if not isinstance(domain, str) or len(domain) == 0:
        return 0
    return sum(char in special_chars for char in domain) / len(domain)

data['DomainSpecialCharRatio'] = data['Domain'].apply(special_char_ratio)

# 4. Length of Subdomain (SubdomainLength)
def subdomain_length(url):
    if not isinstance(url, str) or len(url) == 0:
        return 0
    parts = url.split('.')
    if len(parts) > 2:
        return len(parts[-3])  # Asumsikan subdomain adalah bagian ketiga dari belakang
    return 0

data['SubdomainLength'] = data['URL'].apply(subdomain_length)

# 5. Suspicious Keyword Presence (HasSuspiciousKeyword)
suspicious_keywords = ['free', 'login', 'secure', 'verify', 'update']
def contains_suspicious_keyword(url):
    if not isinstance(url, str):
        return 0
    for keyword in suspicious_keywords:
        if keyword in url.lower():
            return 1
    return 0

data['HasSuspiciousKeyword'] = data['URL'].apply(contains_suspicious_keyword)

# 6. Redirection Depth (RedirectionDepth)
def count_redirections(url):
    if not isinstance(url, str):
        return 0
    return url.count('http') - 1  # Asumsikan "http" menunjukkan pengalihan

data['RedirectionDepth'] = data['URL'].apply(count_redirections)

# 7. Proportion of Uppercase Characters in URL (URLUppercaseRatio)
def uppercase_ratio(url):
    if not isinstance(url, str) or len(url) == 0:
        return 0
    return sum(char.isupper() for char in url) / len(url)

data['URLUppercaseRatio'] = data['URL'].apply(uppercase_ratio)

# 8. Similarity to Known Domains (DomainSimilarity)
known_domains = ['google.com', 'facebook.com', 'twitter.com']
def domain_similarity(domain):
    if not isinstance(domain, str):
        return 0
    similarities = [cosine_similarity(
        CountVectorizer().fit_transform([domain, known_domain]).toarray()
    )[0, 1] for known_domain in known_domains]
    return max(similarities)

data['DomainSimilarity'] = data['Domain'].apply(domain_similarity)

# 9. Frequency of Obfuscated Characters (ObfuscatedCharCount)
obfuscated_chars = set('%&?')
def count_obfuscated_chars(url):
    if not isinstance(url, str):
        return 0
    return sum(char in obfuscated_chars for char in url)

data['ObfuscatedCharCount'] = data['URL'].apply(count_obfuscated_chars)

# 10. URL Depth

def url_depth(url):
    if not isinstance(url, str):
        return 0
    return url.count('/') - 2  # Asumsikan "//" untuk protocol diabaikan

data['URLDepth'] = data['URL'].apply(url_depth)

# Simpan dataset dengan fitur baru
data.to_csv('enhanced_dataset.csv', index=False)

print("Feature engineering selesai! Dataset telah disimpan ke 'enhanced_dataset.csv'.")
