In [1]:
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree

# Load Data
def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Args:
    file_path (str): Location of the CSV file; passed straight to pd.read_csv.

    Returns:
    pd.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(file_path)

# Data Preprocessing and Feature Extraction
def preprocess_and_extract_features(data, top_n_terms):
    """
    Turn a DataFrame of URLs into the feature matrix used for modelling.

    No class balancing is performed here; the frame is used as given.

    Args:
    data (pd.DataFrame): The input dataset containing 'url' and 'status' columns.
    top_n_terms (int): How many of the most frequent URL terms form the vocabulary.

    Returns:
    pd.DataFrame: The feature matrix produced by create_feature_matrix.
    """
    # The most frequent URL terms become the fixed vocabulary.
    vocabulary = top_terms(data, top_n_terms)

    # One space-separated "document" per URL, in row order.
    documents = list(map(to_txt, data.url))

    # binary=True records presence/absence of each vocabulary term, not its count.
    term_vectorizer = CountVectorizer(vocabulary=vocabulary, binary=True)
    doc_term = term_vectorizer.fit_transform(documents)

    # Combine the term indicators with the per-URL numeric features.
    return create_feature_matrix(data, doc_term, vocabulary, documents)

def top_terms(data, top_n_terms):
    """
    Extract the top 'n' terms based on their frequency in the dataset.

    Terms come from tokenize_url applied to every value of the 'url' column;
    empty tokens are ignored. Ties keep first-seen order because the sort is stable.

    Args:
    data (pd.DataFrame): The dataset; only its 'url' column is read.
    top_n_terms (int): The number of top terms to extract.

    Returns:
    list: The most frequent terms, most frequent first.
    """
    # Read the column by name instead of unpacking data.values, so extra columns
    # (or a different column order) cannot break or silently corrupt the counts.
    term_frequency = Counter(
        word
        for url in data['url']
        for word in tokenize_url(url)
        if word != ''
    )

    ranked = sorted(term_frequency.items(), key=lambda item: item[1], reverse=True)
    return [term for term, _ in ranked[:top_n_terms]]

def tokenize_url(url):
    """
    Split a URL into terms, treating both '/' and '.' as separators.

    Adjacent or trailing separators yield empty-string terms.

    Args:
    url (str): The URL to tokenize.

    Returns:
    list: The terms in their original order.
    """
    return [term for segment in url.split('/') for term in segment.split('.')]


def to_txt(text) -> str:
    """Return `text` with every '.' and '/' replaced by a single space."""
    separators_to_space = str.maketrans({'.': ' ', '/': ' '})
    return text.translate(separators_to_space)

def num_digits(text) -> int:
    """Return how many digit characters (regex `\\d`) occur in `text`."""
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return len(re.findall(r'\d', text))

def num_dots(text) -> int:
    """Return how many literal '.' characters occur in `text`."""
    # Raw string: '\.' in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return len(re.findall(r'\.', text))

def num_bar(text) -> int:
    """Return how many '/' characters occur in `text`."""
    slash_count = 0
    for _ in re.finditer('/', text):
        slash_count += 1
    return slash_count




def create_feature_matrix(df_balanced,docTermMatrix, VOC, CORPUS):
    """
    Assemble the feature matrix: one column per vocabulary term plus four
    numeric columns ('dots', 'bar', 'len', 'digits').

    Args:
    df_balanced (pd.DataFrame): The dataset; its 'url' column supplies the raw URLs.
    docTermMatrix (scipy sparse matrix): Document-term matrix with one row per URL
        and one column per entry of VOC.
    VOC (list): Vocabulary terms, used as the names of the term columns.
    CORPUS (list): One text per URL, in the same row order as df_balanced.

    Returns:
    pd.DataFrame: The feature matrix, indexed 0..n-1.
    """
    # .toarray() is the documented way to densify a sparse matrix; the `.A`
    # shorthand is a deprecated alias. create_feature in this file already
    # uses .toarray().
    matrix = pd.DataFrame(docTermMatrix.toarray(), columns=VOC)

    # Lists are assigned positionally, so row order (not the index of
    # df_balanced) is what lines these columns up with the term columns.
    matrix['dots'] = [num_dots(text) for text in df_balanced.url]
    matrix['bar'] = [num_bar(text) for text in df_balanced.url]
    # 'len' and 'digits' are measured on the CORPUS texts, not on the raw URLs.
    matrix['len'] = [len(text) for text in CORPUS]
    matrix['digits'] = [num_digits(text) for text in CORPUS]

    return matrix

def create_feature(data, vocabulary):
    """
    Create a binary term-presence feature matrix based on the provided vocabulary.

    The input DataFrame is left untouched.

    Args:
    data (pd.DataFrame): The dataset with 'url' column.
    vocabulary (list): The list of terms to use as features.

    Returns:
    pd.DataFrame: One row per URL and one 0/1 column per vocabulary term.
    """
    # Tokenize each URL and re-join its terms with spaces. The result is kept in
    # a local Series rather than written back to data['url'], so the caller's
    # frame is not overwritten as a side effect.
    documents = data['url'].apply(lambda url: ' '.join(tokenize_url(url)))

    # binary=True represents the presence or absence of terms, not their counts
    vectorizer = CountVectorizer(vocabulary=vocabulary, binary=True)

    # With a fixed vocabulary no fitting is needed before transform
    feature_matrix = vectorizer.transform(documents)

    # Convert the sparse result to a DataFrame for better readability
    feature_matrix_df = pd.DataFrame(data=feature_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    return feature_matrix_df

# Model Training and Evaluation
def train_and_evaluate_model(feature_matrix, labels, test_size=0.2, random_state=42):
    """
    Train a Random Forest classifier and evaluate its performance on a held-out split.

    Args:
    feature_matrix (pd.DataFrame): The feature matrix.
    labels (pd.Series): The target labels (statuses).
    test_size (float): Fraction of rows held out for evaluation (default 0.2).
    random_state (int): Seed for both the split and the forest (default 42).

    Returns:
    RandomForestClassifier: The trained Random Forest classifier.
    float: The accuracy of the model on the held-out rows.
    np.ndarray: The confusion matrix on the held-out rows.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, labels, test_size=test_size, random_state=random_state
    )

    # Seed the forest as well: without it the split is reproducible but the
    # trained model (and therefore the reported metrics) changes on every run.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out rows only
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    return clf, accuracy, confusion


# Classification of a Single URL
def classify_url(url, model, feature_names):
    """
    Classify a single URL as phishing or clean.

    The URL is featurized the same way as the training data: binary presence of
    each vocabulary term plus the numeric 'dots', 'bar', 'len' and 'digits'
    columns computed by create_feature_matrix.

    NOTE(review): the mapping below treats class 1 as 'phishing'. The sample rows
    displayed at the end of this notebook show phishing-looking hosts with
    status 0 - confirm the dataset's label convention.

    Args:
    url (str): The URL to classify.
    model (RandomForestClassifier): The trained Random Forest classifier.
    feature_names (list): Column names of the training feature matrix, in order.

    Returns:
    str: The predicted class ('phishing' or 'clean').
    """
    # These four columns are computed from the URL, not looked up as words.
    # Passing them to CountVectorizer as vocabulary (as was done before) left
    # them at 0 for virtually every URL, so predictions ignored those features.
    engineered = ('dots', 'bar', 'len', 'digits')
    columns = list(feature_names)
    vocab = [name for name in columns if name not in engineered]

    # Same text form and vectorizer settings as in training
    corpus = [to_txt(url)]
    doc_term = CountVectorizer(binary=True, vocabulary=vocab).transform(corpus)

    # Pass the raw URL (dots and slashes intact) so 'dots' and 'bar' can be counted
    single_url_features = create_feature_matrix(
        pd.DataFrame({'url': [url]}), doc_term, vocab, corpus
    )

    # Present the columns in exactly the order the model was trained on
    single_url_features = single_url_features[columns]

    # Predict the class
    prediction = model.predict(single_url_features)

    # Map the numeric class to 'phishing' or 'clean'
    return 'phishing' if prediction[0] == 1 else 'clean'

# Main Function
def main(data_path=r'C:\Users\Wolfred\Documents\Data Analysis Boot Camp\bootcamp_project4\project\project\new_data_urls.csv', top_n_terms=10):
    """
    Run the whole pipeline: load the data, balance the two classes, build
    features, train and evaluate the model, then classify URLs typed by the
    user until 'exit' is entered.

    Args:
    data_path (str): CSV file to load; defaults to the path originally hard-coded here.
    top_n_terms (int): Number of most frequent URL terms used as vocabulary features.
    """
    # Load the dataset
    dataset = load_data(data_path)

    # Undersample whichever class is larger. Assuming status 1 is always the
    # majority would make .sample() raise if the class sizes were the other way round.
    status_one = dataset[dataset['status'] == 1]
    status_zero = dataset[dataset['status'] == 0]
    if len(status_one) >= len(status_zero):
        df_maj, df_min = status_one, status_zero
    else:
        df_maj, df_min = status_zero, status_one
    df_maj_sampled = df_maj.sample(len(df_min), random_state=42)

    # Fresh 0..n-1 index so positional feature columns line up with the labels
    df_balanced = pd.concat([df_maj_sampled, df_min]).reset_index(drop=True)

    # Preprocess data and extract features
    feature_matrix = preprocess_and_extract_features(df_balanced, top_n_terms)

    # Train and evaluate the Random Forest model
    model, accuracy, confusion = train_and_evaluate_model(feature_matrix, df_balanced['status'])

    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(confusion)

    # Interactive classification
    while True:
        try:
            url_to_classify = input("Enter a URL to classify (or 'exit' to quit): ").strip()
        except EOFError:
            # stdin was closed: end the session instead of raising
            break
        if url_to_classify.lower() == 'exit':
            break
        if not url_to_classify:
            # Nothing typed; ask again rather than classifying an empty string
            continue

        classification = classify_url(url_to_classify, model, feature_matrix.columns)
        print(f"Classification for '{url_to_classify}': {classification}")



In [2]:
# Run the pipeline defined in the first cell: prints the accuracy and the
# confusion matrix, then prompts for URLs to classify until 'exit' is entered.
main()

Accuracy: 0.8884444247529953
Confusion Matrix:
[[69176  9828]
 [ 7797 71192]]


In [3]:
# Main Function
def main():
    """
    Load the URL dataset and return a class-balanced copy of it.

    NOTE(review): this definition replaces the full-pipeline `main` from the
    first cell once this cell runs; consider giving it a distinct name.

    Returns:
    pd.DataFrame: Equal numbers of status-0 and status-1 rows, indexed 0..n-1.
    """
    # Load the dataset
    dataset = load_data(r'C:\Users\Wolfred\Documents\Data Analysis Boot Camp\bootcamp_project4\project\project\new_data_urls.csv')

    # Undersample whichever class is larger. Assuming status 1 is always the
    # majority would make .sample() raise if the class sizes were the other way round.
    status_one = dataset[dataset['status'] == 1]
    status_zero = dataset[dataset['status'] == 0]
    if len(status_one) >= len(status_zero):
        df_maj, df_min = status_one, status_zero
    else:
        df_maj, df_min = status_zero, status_one
    df_maj_sampled = df_maj.sample(len(df_min), random_state=42)
    df_balanced = pd.concat([df_maj_sampled, df_min]).reset_index(drop=True)

    # Previously the balanced frame was built and then discarded
    return df_balanced


# Show the raw dataset as this cell's output; the recorded output below lists
# only the first and last rows with an ellipsis row in between.
load_data(r'C:\Users\Wolfred\Documents\Data Analysis Boot Camp\bootcamp_project4\project\project\new_data_urls.csv')

Unnamed: 0,url,status
0,0000111servicehelpdesk.godaddysites.com,0
1,000011accesswebform.godaddysites.com,0
2,00003.online,0
3,0009servicedeskowa.godaddysites.com,0
4,000n38p.wcomhost.com,0
...,...,...
822005,zzufg.com,0
822006,zzu.li,0
822007,zzz.co.uk,0
822008,zzzoolight.co.za,0
