In [38]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_curve, 
    roc_auc_score,
    precision_recall_curve,
    average_precision_score
)

In [None]:
# Path of the CSV file, relative to the notebook's working directory.
url = './sample_data/data.csv'

# Parse the file into a DataFrame.
url_csv = pd.read_csv(filepath_or_buffer=url)

# Display the last ten rows as the cell output.
url_csv.tail(n=10)

In [None]:
# Convert the loaded table to a 2-D NumPy array (rows x columns).
url_df = np.array(pd.DataFrame(url_csv))

# Column 0 is used as the URL text and column 1 as its label.
# NOTE(review): the column order is assumed from this positional indexing — confirm against the CSV header.
urls = list(url_df[:, 0])
label = list(url_df[:, 1])

In [46]:
def sanitization(web):
    """Split a URL into a de-duplicated list of lower-case tokens.

    The URL is split on '/', each piece is split on '-', and each of those
    fragments is split on '.'.  Both the '-'-separated fragments (which may
    still contain dots, e.g. 'example.com') and their '.'-separated parts
    are kept.  The literal token 'com' is dropped.

    Args:
        web: The URL to tokenize.  It is coerced with ``str()`` before
            lower-casing, so non-string values are tokenized by their
            string form instead of raising ``AttributeError``.

    Returns:
        list: Unique tokens in order of first appearance.  Empty strings
        produced by adjacent separators (such as '//') or a trailing '/'
        are kept, so the set of tokens is the same as before; only the
        ordering is now deterministic.
    """
    text = str(web).lower()

    # A dict is used as an insertion-ordered set: duplicates collapse and the
    # resulting order does not depend on string hashing.
    seen = {}
    for segment in text.split('/'):
        dash_parts = segment.split('-')
        # Keep the '-'-separated fragments themselves ...
        seen.update(dict.fromkeys(dash_parts))
        # ... and additionally every '.'-separated part of each fragment.
        for fragment in dash_parts:
            seen.update(dict.fromkeys(fragment.split('.')))

    # 'com' is dropped from the token list.
    seen.pop('com', None)
    return list(seen)


In [None]:
# --- Train/test split and feature extraction ---------------------------------
# Split the raw URLs *before* fitting TF-IDF so the vocabulary and IDF weights
# are learned from the training portion only; fitting on the full corpus would
# leak test-set statistics into the features used for evaluation.
urls_train, urls_test, y_train, y_test = train_test_split(
    urls, label, test_size=0.2, random_state=42
)

# token_pattern=None: the default regex is not used when a custom tokenizer is
# supplied, and scikit-learn warns about the unused pattern otherwise.
vectorizer = TfidfVectorizer(tokenizer=sanitization, token_pattern=None)
x_train = vectorizer.fit_transform(urls_train)
x_test = vectorizer.transform(urls_test)

# Full-corpus matrix, kept under its original name for any later cell that
# still reads `x`; it is now produced by the train-fitted vectorizer.
x = vectorizer.transform(urls)

# --- Model training ----------------------------------------------------------
lgr = LogisticRegression(solver='lbfgs', max_iter=1000)
lgr.fit(x_train, y_train)
score = lgr.score(x_test, y_test)
print("Accuracy: {0:.2f} %".format(100 * score))

# --- Predictions for evaluation ----------------------------------------------
y_pred = lgr.predict(x_test)
# predict_proba columns follow lgr.classes_, so column 1 is the probability of
# lgr.classes_[1]. That class is passed explicitly as pos_label below so the
# curve functions agree with the scores (and also work for string labels).
positive_class = lgr.classes_[1]
y_pred_proba = lgr.predict_proba(x_test)[:, 1]

# Display names, applied in lgr.classes_ order.
# NOTE(review): this assumes the first class in lgr.classes_ is the benign one
# and the second is the malicious one — confirm against the dataset's labels.
class_names = ['Good URL', 'Bad URL']

# 1. Classification Report (Precision, Recall, F1-Score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            labels=lgr.classes_, target_names=class_names))

# 2. Confusion Matrix (rows = actual, columns = predicted, in lgr.classes_ order)
cm = confusion_matrix(y_test, y_pred, labels=lgr.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# 3. ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=positive_class)
roc_auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: where TPR equals FPR.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# 4. Precision-Recall Curve
# (This section and the next were previously disabled inside a string literal
# that contained a stray token; they are restored here with that removed.)
precision, recall, _ = precision_recall_curve(
    y_test, y_pred_proba, pos_label=positive_class
)
average_precision = average_precision_score(
    y_test, y_pred_proba, pos_label=positive_class
)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2,
         label=f'Precision-Recall curve (AP = {average_precision:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="upper right")
plt.show()

# 5. Top 15 features with the largest positive coefficients, i.e. the tokens
# that push a prediction most strongly toward lgr.classes_[1].
feature_names = vectorizer.get_feature_names_out()
coefs = lgr.coef_[0]
top_bad_features = sorted(
    zip(feature_names, coefs), key=lambda pair: pair[1], reverse=True
)[:15]

plt.figure(figsize=(10, 6))
plt.barh([name for name, _ in top_bad_features],
         [weight for _, weight in top_bad_features],
         color='red')
plt.xlabel('Coefficient Value')
plt.title(f'Top 15 Important Features for {class_names[1]}s')
plt.gca().invert_yaxis()  # highest coefficient at top
plt.tight_layout()
plt.show()

# Alias of the fitted vectorizer, kept under its original name.
vectorizer_save = vectorizer