In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the URL feature dataset; the path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('urldataset.csv')

# Preview the first rows to sanity-check that the columns parsed as expected.
df.head()

Unnamed: 0,index,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [3]:
# Drop the 'index' column so it is not carried along as a feature.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`, a
# second run would otherwise raise KeyError once 'index' is already gone.
df = df.drop(columns=['index'], errors='ignore')

In [4]:
# Build a per-column data-quality report: missing values, dtype, distinct
# values, value frequencies, and entries outside the expected encoding.

# Every column is checked against this set of expected encoded values.
valid_values = {1, -1, 0}
row_count = len(df)

report_data = []
for column in df.columns:
    series = df[column]
    column_report = {}

    # Missing values, as an absolute count and as a share of all rows.
    missing_values = series.isnull().sum()
    column_report['Missing Values'] = missing_values
    column_report['Missing Percentage'] = (missing_values / row_count) * 100

    column_report['Data Type'] = series.dtype

    # Distinct values, listed in order of first appearance.
    unique_values = series.unique()
    column_report['Unique Values'] = ', '.join(map(str, unique_values))
    column_report['Unique Count'] = len(unique_values)

    # Distinct values that fall outside the expected encoding. Sorted so the
    # rendered string does not depend on set iteration order.
    invalid_values = set(unique_values) - valid_values
    column_report['Invalid Values'] = ', '.join(sorted(map(str, invalid_values)))

    # Frequency of each value, most common first.
    value_counts = series.value_counts()
    column_report['Value Counts'] = ', '.join([f"{k}: {v}" for k, v in value_counts.items()])

    # Count offending rows directly from the boolean mask instead of
    # materialising a filtered copy of the whole frame just to take its length.
    column_report['Invalid Rows'] = int((~series.isin(valid_values)).sum())

    report_data.append(column_report)

# One row per column of df, labelled by column name, for rich display.
report_df = pd.DataFrame(report_data, index=df.columns)
report_df

Unnamed: 0,Missing Values,Missing Percentage,Data Type,Unique Values,Unique Count,Invalid Values,Value Counts,Invalid Rows
having_IPhaving_IP_Address,0,0.0,int64,"-1, 1",2,,"1: 7262, -1: 3793",0
URLURL_Length,0,0.0,int64,"1, 0, -1",3,,"-1: 8960, 1: 1960, 0: 135",0
Shortining_Service,0,0.0,int64,"1, -1",2,,"1: 9611, -1: 1444",0
having_At_Symbol,0,0.0,int64,"1, -1",2,,"1: 9400, -1: 1655",0
double_slash_redirecting,0,0.0,int64,"-1, 1",2,,"1: 9626, -1: 1429",0
Prefix_Suffix,0,0.0,int64,"-1, 1",2,,"-1: 9590, 1: 1465",0
having_Sub_Domain,0,0.0,int64,"-1, 0, 1",3,,"1: 4070, 0: 3622, -1: 3363",0
SSLfinal_State,0,0.0,int64,"-1, 1, 0",3,,"1: 6331, -1: 3557, 0: 1167",0
Domain_registeration_length,0,0.0,int64,"-1, 1",2,,"-1: 7389, 1: 3666",0
Favicon,0,0.0,int64,"1, -1",2,,"1: 9002, -1: 2053",0


In [10]:
# Separate the target label from the feature matrix.
y = df['Result']
X = df.drop(columns=['Result'])

# Show how many rows belong to each class of the target.
print(y.value_counts())

Result
 1    6157
-1    4898
Name: count, dtype: int64


In [14]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate a Random Forest with 3-fold cross-validation.
# Shuffling with a fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Seed the forest too: without random_state the reported metrics change from
# run to run. 42 matches the seed used for the fold splitter.
model = RandomForestClassifier(random_state=42)

# Per-fold scores; one entry is appended per fold.
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, hence .iloc rather than .loc.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training folds and predict the held-out fold.
    model.fit(X_train, y_train)
    y_pred_rf = model.predict(X_test)

    # Weighted averaging weights each class's score by its support;
    # zero_division=0 scores an undefined metric as 0 instead of warning.
    accuracies.append(accuracy_score(y_test, y_pred_rf))
    precisions.append(precision_score(y_test, y_pred_rf, average='weighted', zero_division=0))
    recalls.append(recall_score(y_test, y_pred_rf, average='weighted', zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred_rf, average='weighted', zero_division=0))

# Report the mean of each metric across the folds.
print(f"Average Accuracy: {sum(accuracies)/len(accuracies):.4f}")
print(f"Average Precision: {sum(precisions)/len(precisions):.4f}")
print(f"Average Recall: {sum(recalls)/len(recalls):.4f}")
print(f"Average F1 Score: {sum(f1_scores)/len(f1_scores):.4f}")


Average Accuracy: 0.9681
Average Precision: 0.9682
Average Recall: 0.9681
Average F1 Score: 0.9680


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np

# Shallow decision tree (depth 3, Gini impurity) with a fixed seed.
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

# Score all four metrics in a single 5-fold cross-validation pass, so the tree
# is fitted once per fold rather than once per fold for every metric.
cv_results = cross_validate(
    clf,
    X,
    y,
    cv=5,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
)

# With several scorers, cross_validate stores each metric's per-fold array
# under the key 'test_<scorer name>'.
accuracy_scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision_weighted']
recall_scores = cv_results['test_recall_weighted']
f1_scores = cv_results['test_f1_weighted']

# Accuracy
print("Accuracy for each fold:", accuracy_scores)
print("Average Accuracy:", np.mean(accuracy_scores))

# Precision
print("\nPrecision for each fold:", precision_scores)
print("Average Precision:", np.mean(precision_scores))

# Recall
print("\nRecall for each fold:", recall_scores)
print("Average Recall:", np.mean(recall_scores))

# F1 Score
print("\nF1 Score for each fold:", f1_scores)
print("Average F1 Score:", np.mean(f1_scores))


Accuracy for each fold: [0.88647671 0.90456807 0.92446857 0.90321122 0.91677974]
Average Accuracy: 0.9071008593396653

Precision for each fold: [0.89738076 0.91144494 0.92462976 0.90350124 0.9174092 ]
Average Precision: 0.9108731790073744

Recall for each fold: [0.88647671 0.90456807 0.92446857 0.90321122 0.91677974]
Average Recall: 0.9071008593396653

F1 Score for each fold: [0.88433944 0.9032967  0.92451693 0.90329592 0.91690134]
Average F1 Score: 0.9064700687018066


In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import numpy as np

# 3-nearest-neighbours classifier evaluated on five shuffled, seeded folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)

# One score per fold is collected in each list.
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, test_index in kf.split(X):
    # Positional slicing of the current fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Fit on the training part, then predict the held-out part.
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    # Accuracy takes no averaging arguments; the other three share the same
    # weighted-average settings.
    accuracies.append(accuracy_score(y_test, y_pred))
    for fold_scores, scorer in (
        (precisions, precision_score),
        (recalls, recall_score),
        (f1_scores, f1_score),
    ):
        fold_scores.append(scorer(y_test, y_pred, average='weighted', zero_division=0))

# Per-fold values for every metric
print("Accuracy for each fold:", accuracies)
print("Precision for each fold:", precisions)
print("Recall for each fold:", recalls)
print("F1 Score for each fold:", f1_scores)

# Mean of every metric across the folds
print("\nAverage Accuracy:", np.mean(accuracies))
print("Average Precision:", np.mean(precisions))
print("Average Recall:", np.mean(recalls))
print("Average F1 Score:", np.mean(f1_scores))


Accuracy for each fold: [0.9457259158751696, 0.9525101763907734, 0.9534147444595206, 0.945273631840796, 0.9448213478064225]
Precision for each fold: [0.9457779303527692, 0.9524854886916426, 0.9534708425414538, 0.9452874090879486, 0.9448445776320572]
Recall for each fold: [0.9457259158751696, 0.9525101763907734, 0.9534147444595206, 0.945273631840796, 0.9448213478064225]
F1 Score for each fold: [0.9456379736061019, 0.9524880950076432, 0.9533715216476365, 0.9452238533274775, 0.9448298516056691]

Average Accuracy: 0.9483491632745364
Average Precision: 0.9483732496611743
Average Recall: 0.9483491632745364
Average F1 Score: 0.9483102590389058
