In [1]:
import os

In [2]:
# Show the kernel's working directory before any directory change is made.
%pwd

'c:\\Users\\USER\\Documents\\JupyterNB\\ineuron\\Python Advanced\\Phishing_detection\\notebooks'

In [3]:
os.chdir('../')
%pwd

'c:\\Users\\USER\\Documents\\JupyterNB\\ineuron\\Python Advanced\\Phishing_detection'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable settings for the data-processing stage.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Directory that holds this stage's outputs.
    root_dir: Path
    # Location of the file the processed train/test data is written to.
    final_data_file: Path

In [5]:
from src.phishingdetection.constants import *
from src.phishingdetection.utils.common_functionality import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path to the main config YAML.
            params_filepath: Path to the params YAML.
        """
        self.config, self.params = read_yaml(config_filepath), read_yaml(params_filepath)

        # The loaded config is accessed by attribute; artifacts_root is created up front.
        create_directories([self.config.artifacts_root])

    def get_data_processing_config(self) -> DataProcessingConfig:
        """Create the stage directory and return the data-processing settings.

        Returns:
            A DataProcessingConfig filled from the ``data_processing`` section
            of the loaded config.
        """
        section = self.config.data_processing
        stage_root = section.root_dir

        create_directories([stage_root])

        return DataProcessingConfig(
            root_dir=stage_root,
            final_data_file=section.final_data_file,
        )

In [7]:
from src.phishingdetection.utils.common_functionality import get_size, save_object, load_object, featureExtraction
from src.phishingdetection import logger
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [8]:
class DataProcessing:
    """Cleans the raw URL lists, extracts features and saves a scaled train/test split.

    Args:
        config: Settings for this stage; ``final_data_file`` is where the
            processed data is written.
    """

    # Column names applied to the rows returned by featureExtraction(url, label).
    # Presumably this order matches that helper's output — verify against its definition.
    FEATURE_NAMES = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection',
                     'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 'Label']

    def __init__(self, config: DataProcessingConfig):
        self.config = config

    @staticmethod
    def _clean_and_sample(frame, url_column, n_samples):
        """Drop null rows and duplicates, then return a reproducible random sample."""
        # Rows are only dropped when the URL column itself contains nulls.
        if frame[url_column].isnull().any():
            frame = frame.dropna(axis=0)

        frame = frame.drop_duplicates()

        # Fixed seed keeps the sample identical across runs.
        return frame.sample(n=n_samples, random_state=12).reset_index(drop=True)

    def _extract_features(self, urls, label, source_name):
        """Run featureExtraction over every URL and return the rows as a DataFrame."""
        try:
            rows = [featureExtraction(url, label) for url in urls]
            return pd.DataFrame(rows, columns=self.FEATURE_NAMES)
        except Exception:
            # Log with traceback and re-raise: swallowing the error here would only
            # surface later as an unrelated NameError on the missing frame.
            logger.exception(f'Feature extraction failed for {source_name} URLs')
            raise

    def data_cleaning(self, phishing_data_path, legit_data_path, n_samples=100):
        """Load both URL lists, clean them and sample the same number of rows from each.

        Args:
            phishing_data_path: CSV of phishing URLs; must contain a ``url`` column.
            legit_data_path: CSV of legitimate URLs; must have exactly one column,
                which is renamed to ``URLs``.
            n_samples: Rows to sample from each cleaned list (default 100).

        Returns:
            Tuple ``(phishing_df, legit_df)`` with fresh 0-based indexes.

        Raises:
            ValueError: If a cleaned list has fewer than ``n_samples`` rows
                (raised by ``DataFrame.sample``).
        """
        phishing_raw = pd.read_csv(phishing_data_path)
        phishing_df = self._clean_and_sample(phishing_raw, 'url', n_samples)

        # NOTE(review): read_csv treats the first line as a header, so the first row
        # of this file is consumed as the column name — confirm the file has a header.
        legit_raw = pd.read_csv(legit_data_path)
        legit_raw.columns = ['URLs']
        legit_df = self._clean_and_sample(legit_raw, 'URLs', n_samples)

        logger.info('Data cleaning done')

        return phishing_df, legit_df

    def feature_extraction(self, phishing_df, legit_df):
        """Extract features for every URL, split, scale and save the result.

        Nothing is computed when ``final_data_file`` already exists; the existing
        file is left untouched and its size is logged instead.

        Args:
            phishing_df: Frame with a ``url`` column; every row is labelled 1.
            legit_df: Frame with a ``URLs`` column; every row is labelled 0.

        Raises:
            Exception: Any error raised during feature extraction is logged
                and re-raised.
        """
        # Check first so an existing output does not cost a full extraction run.
        if os.path.exists(self.config.final_data_file):
            logger.info(f"File already exists of size: {get_size(Path(self.config.final_data_file))}")
            return

        legitimate = self._extract_features(legit_df['URLs'], 0, 'legitimate')
        phishing = self._extract_features(phishing_df['url'], 1, 'phishing')

        urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)

        # 'Domain' is dropped from the model inputs; 'Label' is the target.
        X = urldata.drop(['Domain', 'Label'], axis=1)
        y = urldata['Label']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only, then apply it to the test split.
        # NOTE(review): the fitted scaler is not part of the saved dict; anything that
        # scores new URLs would need it — confirm whether it should be persisted.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        final_data = {
            'X_train': X_train_scaled,
            'y_train': y_train,
            'X_test': X_test_scaled,
            'y_test': y_test
        }

        save_object(self.config.final_data_file, final_data)
        logger.info('Final data is saved!')


        

        





In [9]:
# Run the data-processing stage end to end: load config, clean, extract, save.
try:
    config = ConfigurationManager()

    data_processing_config = config.get_data_processing_config()
    data_processing = DataProcessing(config=data_processing_config)

    # Raw input paths come from the data_ingestion section of the loaded config.
    phishing_df, legit_df = data_processing.data_cleaning(
        config.config.data_ingestion.phishing_data_file,
        config.config.data_ingestion.legit_data_file,
    )
    data_processing.feature_extraction(phishing_df, legit_df)

except Exception:
    # Record the failure with its traceback, then re-raise it unchanged.
    logger.exception('Data processing stage failed')
    raise

[2023-07-20 19:29:26,837: INFO: common_functionality: yaml file: config\config.yaml loaded successfully]
[2023-07-20 19:29:26,847: INFO: common_functionality: yaml file: params.yaml loaded successfully]
[2023-07-20 19:29:26,848: INFO: common_functionality: created directory at: artifacts]
[2023-07-20 19:29:26,865: INFO: common_functionality: created directory at: artifacts/data_processing]
[2023-07-20 19:29:27,280: INFO: 110456258: Data cleaning done]
[2023-07-20 19:36:18,766: INFO: 110456258: Final data is saved!]
