In [1]:
# Relative path prefix of the dataset directory; the CSV file name is appended to it when the data is loaded.
base_url = "..//dataset//"

# Loading libraries

In [2]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None removes the column limit).
# Called through the documented top-level pd.set_option rather than the
# accidental pd.pandas self-reference.
pd.set_option('display.max_columns', None)

# Ingesting the data

In [3]:
# Load the full phishing dataset from the dataset directory.
df = pd.read_csv(f"{base_url}phishing_full.csv")

In [4]:
# Predictor columns kept from the raw dataset.
top_features = [
    'directory_length',
    'time_domain_activation',
    'length_url',
    'file_length',
    'qty_slash_url',
    'qty_plus_directory',
    'domain_length',
    'qty_vowels_domain',
    'qty_asterisk_directory',
    'qty_hyphen_directory',
    'qty_dot_domain',
    'qty_underline_directory',
    'qty_percent_directory',
    'qty_dot_url',
    'qty_hyphen_url',
    'qty_hyphen_file',
    'qty_hyphen_domain',
    'params_length',
    'qty_underline_url',
    'qty_tld_url',
    'qty_plus_params',
    'qty_percent_url',
    'qty_equal_params',
    'qty_dot_params',
    'qty_percent_params',
    'qty_underline_params',
]
# The target label goes last so it is retained when the frame is subset by this list.
top_features += ['phishing']

In [5]:
# Keep only the selected feature columns plus the target, in the listed order.
df = df.loc[:, top_features]

# Schema Validation

### Creating the schema

In [6]:
# Inspect the dtype of every retained column as a raw array.
df.dtypes.values

array([dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64')], dtype=object)

In [7]:
# Convert each column dtype to its string name (e.g. 'int64') so it can be serialised.
data_types = list(map(str, df.dtypes.values))

In [8]:
# Display the dtype names collected as plain strings.
data_types

['int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64',
 'int64']

In [9]:
# Column labels of the trimmed frame; paired with the dtype strings to build the schema mapping.
columns = df.columns

In [10]:
# Map each column name to the string name of its dtype.
current_schema = {
    column_name: dtype_name
    for column_name, dtype_name in zip(columns, data_types)
}

In [11]:
# Display the column-name -> dtype-name mapping.
current_schema

{'directory_length': 'int64',
 'time_domain_activation': 'int64',
 'length_url': 'int64',
 'file_length': 'int64',
 'qty_slash_url': 'int64',
 'qty_plus_directory': 'int64',
 'domain_length': 'int64',
 'qty_vowels_domain': 'int64',
 'qty_asterisk_directory': 'int64',
 'qty_hyphen_directory': 'int64',
 'qty_dot_domain': 'int64',
 'qty_underline_directory': 'int64',
 'qty_percent_directory': 'int64',
 'qty_dot_url': 'int64',
 'qty_hyphen_url': 'int64',
 'qty_hyphen_file': 'int64',
 'qty_hyphen_domain': 'int64',
 'params_length': 'int64',
 'qty_underline_url': 'int64',
 'qty_tld_url': 'int64',
 'qty_plus_params': 'int64',
 'qty_percent_url': 'int64',
 'qty_equal_params': 'int64',
 'qty_dot_params': 'int64',
 'qty_percent_params': 'int64',
 'qty_underline_params': 'int64',
 'phishing': 'int64'}

#### Dumping to YAML so that it can be used in the schema

In [12]:
import os
import yaml

In [13]:
# Move the working directory one level up so that the project package and the
# relative paths used later in this notebook resolve.
# Guarded so that re-running this cell does not climb a further level each time:
# once the package directory is visible from the working directory, nothing happens.
# NOTE(review): assumes the phishing_domain_detection package directory sits
# directly in the target directory, as the relative report path used later implies.
if not os.path.isdir("phishing_domain_detection"):
    os.chdir("..")

In [14]:
# Persist the column -> dtype mapping as YAML in the current working directory.
with open(r"column_types.yaml", mode='w') as schema_out:
    yaml.safe_dump(current_schema, stream=schema_out)

## Building configuration for Data Validation

In [15]:
from phishing_domain_detection.constants import *


In [16]:
from phishing_domain_detection.config.configuration import Configuration

In [17]:
# Instantiate the project's Configuration object; the stage-specific configs below are requested from it.
config = Configuration()

In [18]:
# Ask the configuration object for the data-validation settings.
data_validation_config = config.get_data_validation_config()

In [19]:
# Display the data-validation config (schema file path plus the report file paths, per the output below).
data_validation_config

DataValidationConfig(schema_file_path='c:\\Users\\rachi\\Desktop\\inueuron\\PERSONAL PROJECTS\\Phishing-Domain-Detection-using-Machine-Learning\\config\\schema.yaml', report_file_path='c:\\Users\\rachi\\Desktop\\inueuron\\PERSONAL PROJECTS\\Phishing-Domain-Detection-using-Machine-Learning\\phishing_domain_detection\\artifacts\\data_validation\\2022-08-17_16-34-46\\report.json', report_page_file_path='c:\\Users\\rachi\\Desktop\\inueuron\\PERSONAL PROJECTS\\Phishing-Domain-Detection-using-Machine-Learning\\phishing_domain_detection\\artifacts\\data_validation\\2022-08-17_16-34-46\\report.html')

## Reading the schema file

In [20]:
from phishing_domain_detection.util.util import *

In [21]:
# Load the schema file referenced by the validation config.
# read_yaml_file is presumably supplied by the wildcard import from
# phishing_domain_detection.util.util — TODO confirm.
desired_schema = read_yaml_file(data_validation_config.schema_file_path)

In [22]:
# Display the loaded schema.
desired_schema

{'columns': {'directory_length': 'int64',
  'domain_length': 'int64',
  'file_length': 'int64',
  'length_url': 'int64',
  'params_length': 'int64',
  'qty_asterisk_directory': 'int64',
  'qty_dot_domain': 'int64',
  'qty_dot_params': 'int64',
  'qty_dot_url': 'int64',
  'qty_equal_params': 'int64',
  'qty_hyphen_directory': 'int64',
  'qty_hyphen_domain': 'int64',
  'qty_hyphen_file': 'int64',
  'qty_hyphen_url': 'int64',
  'qty_percent_directory': 'int64',
  'qty_percent_params': 'int64',
  'qty_percent_url': 'int64',
  'qty_plus_directory': 'int64',
  'qty_plus_params': 'int64',
  'qty_slash_url': 'int64',
  'qty_tld_url': 'int64',
  'qty_underline_directory': 'int64',
  'qty_underline_params': 'int64',
  'qty_underline_url': 'int64',
  'qty_vowels_domain': 'int64',
  'time_domain_activation': 'int64'},
 'target_column': 'phishing'}

In [23]:
# List the top-level keys of the schema.
desired_schema.keys()

dict_keys(['columns', 'target_column'])

In [24]:
from phishing_domain_detection.entity.config_entity import  DataValidationConfig, DataIngestionConfig

In [25]:
from phishing_domain_detection.component.data_ingestion import DataIngestion
from phishing_domain_detection.config.configuration import  Configuration

In [26]:
# Create a Configuration instance again, rebinding the name `config` used earlier.
# NOTE(review): presumably a fresh instance is wanted for the ingestion run —
# confirm; otherwise the earlier instance could be reused.
config = Configuration()

In [27]:
# Ask the configuration object for the data-ingestion settings.
di_config = config.get_data_ingestion_config()

In [28]:
# Build the ingestion component from its config.
di = DataIngestion(di_config)

In [29]:
# Run the ingestion step and keep the artifact it returns; its train_file_path is used below.
data_ingestion_artifact = di.initiate_data_ingestion()

In [30]:
# Display the path of the ingested training file.
data_ingestion_artifact.train_file_path

'c:\\Users\\rachi\\Desktop\\inueuron\\PERSONAL PROJECTS\\Phishing-Domain-Detection-using-Machine-Learning\\phishing_domain_detection\\artifacts\\data_ingestion\\2022-08-17_16-34-46\\ingested_data\\train\\phishing_full'


# Ingesting Training and Testing Files and comparing with our custom schema

In [31]:
import pandas as pd

In [32]:
# Read the ingested training file into a DataFrame.
ingested_train_df = pd.read_csv(data_ingestion_artifact.train_file_path)

In [33]:
# Display the ingested training frame (the middle rows are elided as "..." in the output below).
ingested_train_df

Unnamed: 0,directory_length,time_domain_activation,length_url,file_length,qty_slash_url,qty_plus_directory,domain_length,qty_vowels_domain,qty_asterisk_directory,qty_hyphen_directory,qty_dot_domain,qty_underline_directory,qty_percent_directory,qty_dot_url,qty_hyphen_url,qty_hyphen_file,qty_hyphen_domain,params_length,qty_underline_url,qty_tld_url,qty_plus_params,qty_percent_url,qty_equal_params,qty_dot_params,qty_percent_params,qty_underline_params,phishing
0,-1,6659,26,-1,0,-1,26,7,-1,-1,2,-1,-1,2,0,-1,0,-1,0,1,-1,0,-1,-1,-1,-1,0
1,1,7078,19,0,1,0,18,5,0,0,2,0,0,2,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,0
2,19,1233,32,0,4,0,13,6,0,0,1,0,0,1,1,0,1,-1,0,1,-1,0,-1,-1,-1,-1,1
3,1,3519,14,0,1,0,13,4,0,0,1,0,0,1,1,0,1,-1,0,1,-1,0,-1,-1,-1,-1,0
4,1,3460,27,0,1,0,26,8,0,0,3,0,0,3,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70912,1,-1,15,0,1,0,14,5,0,0,2,0,0,2,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,0
70913,-1,6767,15,-1,0,-1,15,4,-1,-1,2,-1,-1,2,0,-1,0,-1,0,1,-1,0,-1,-1,-1,-1,0
70914,56,88,77,0,6,0,21,6,0,3,1,0,0,2,3,0,0,-1,0,1,-1,0,-1,-1,-1,-1,1
70915,28,1914,48,9,5,0,20,6,0,0,1,0,0,2,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,1


## Checking the number of columns

In [34]:
# The ingested frame should have one column per schema feature plus one for the target.
ingested_train_df.shape[1] == 1 + len(desired_schema['columns'])

True

## Checking if the schemas are the same

In [35]:
# Expected column names: every schema feature together with the target column.
desired_schema_with_target = set(desired_schema['columns']) | {desired_schema['target_column']}
# Order-insensitive comparison against the columns actually present in the ingested file.
desired_schema_with_target == set(ingested_train_df.columns)

True

In [36]:
from phishing_domain_detection.component.data_validation import DataValidation

In [39]:
# Assemble the validation component from the ingestion artifact plus freshly
# requested validation and training-pipeline configs.
dv = DataValidation(
    data_ingestion_artifact=data_ingestion_artifact,
    data_validation_config=config.get_data_validation_config(),
    training_pipeline_config=config.get_training_pipeline_config(),
)

In [40]:
# Call the component's train/test file existence check.
# NOTE(review): the name suggests it verifies that both files exist — confirm in DataValidation.
dv.does_train_test_file_exists()

In [41]:
# Run the component's schema validation.
dv.validate_schema()

In [42]:
# Second call to the same schema validation.
# NOTE(review): looks redundant with the previous cell — confirm whether this re-run is intentional.
dv.validate_schema()

# Data Drift

### Oldest Ingested Data

In [43]:
from phishing_domain_detection.constants import *

In [44]:
# Display the constant naming the data-ingestion artifact directory (presumably brought in
# by the wildcard import from phishing_domain_detection.constants — TODO confirm).
DATA_INGESTION_ARTIFACT_DIR_KEY

'data_ingestion'

In [45]:
# Ask the configuration object for the training-pipeline settings; its artifact_dir is used next.
training_pipeline_config = config.get_training_pipeline_config()

In [46]:
# Data-ingestion sub-directory of the pipeline's artifact directory.
data_ingestion_dir = os.path.join(training_pipeline_config.artifact_dir, DATA_INGESTION_ARTIFACT_DIR_KEY)

In [47]:
# Display the resolved ingestion directory.
data_ingestion_dir

'c:\\Users\\rachi\\Desktop\\inueuron\\PERSONAL PROJECTS\\Phishing-Domain-Detection-using-Machine-Learning\\phishing_domain_detection\\artifacts\\data_ingestion'

In [48]:
# List the entries of the ingestion directory; the cells below parse each name as a timestamp.
ingestions = os.listdir(data_ingestion_dir)

In [49]:
import datetime

In [50]:
# Parse the first listed entry as a timestamp to check the naming format.
# os.listdir returns entries in arbitrary order, so this is not necessarily the oldest run.
datetime.datetime.strptime(ingestions[0],'%Y-%m-%d_%H-%M-%S')

datetime.datetime(2022, 8, 15, 1, 6, 17)

In [51]:
# Parse every entry name in the ingestion directory into a datetime.
all_times = [
    datetime.datetime.strptime(run_dir_name, '%Y-%m-%d_%H-%M-%S')
    for run_dir_name in ingestions
]

In [52]:
# The earliest parsed timestamp identifies the oldest ingestion run.
oldest_time_stamp = min(all_times)

In [53]:
# Display the oldest timestamp.
oldest_time_stamp

datetime.datetime(2022, 8, 15, 1, 6, 17)

In [54]:
# Render the oldest timestamp back into the directory-name format.
t_s = oldest_time_stamp.strftime('%Y-%m-%d_%H-%M-%S')

In [55]:
# Display the formatted timestamp string.
t_s

'2022-08-15_01-06-17'

In [56]:
# Display the current training file path, whose timestamp component is replaced in the next step.
data_ingestion_artifact.train_file_path

'c:\\Users\\rachi\\Desktop\\inueuron\\PERSONAL PROJECTS\\Phishing-Domain-Detection-using-Machine-Learning\\phishing_domain_detection\\artifacts\\data_ingestion\\2022-08-17_16-34-46\\ingested_data\\train\\phishing_full'

In [57]:
import re

In [58]:
# Swap the timestamp segment of the current training file path for the oldest
# run's timestamp. The pattern is a raw string so that \d reaches the regex
# engine directly instead of depending on Python leaving an unrecognised
# string escape untouched (which emits a warning on current Python versions).
oldest_train_file_path = re.sub(
    pattern=r"\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}",
    repl=t_s,
    string=data_ingestion_artifact.train_file_path,
)

##### Testing retrieval of the oldest training file

In [60]:
# Call the component method for the oldest training file; per the output below, the result renders as a table.
dv.get_oldest_train_file()

Unnamed: 0,directory_length,time_domain_activation,length_url,file_length,qty_slash_url,qty_plus_directory,domain_length,qty_vowels_domain,qty_asterisk_directory,qty_hyphen_directory,qty_dot_domain,qty_underline_directory,qty_percent_directory,qty_dot_url,qty_hyphen_url,qty_hyphen_file,qty_hyphen_domain,params_length,qty_underline_url,qty_tld_url,qty_plus_params,qty_percent_url,qty_equal_params,qty_dot_params,qty_percent_params,qty_underline_params,phishing
0,-1,6659,26,-1,0,-1,26,7,-1,-1,2,-1,-1,2,0,-1,0,-1,0,1,-1,0,-1,-1,-1,-1,0
1,1,7078,19,0,1,0,18,5,0,0,2,0,0,2,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,0
2,19,1233,32,0,4,0,13,6,0,0,1,0,0,1,1,0,1,-1,0,1,-1,0,-1,-1,-1,-1,1
3,1,3519,14,0,1,0,13,4,0,0,1,0,0,1,1,0,1,-1,0,1,-1,0,-1,-1,-1,-1,0
4,1,3460,27,0,1,0,26,8,0,0,3,0,0,3,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70912,1,-1,15,0,1,0,14,5,0,0,2,0,0,2,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,0
70913,-1,6767,15,-1,0,-1,15,4,-1,-1,2,-1,-1,2,0,-1,0,-1,0,1,-1,0,-1,-1,-1,-1,0
70914,56,88,77,0,6,0,21,6,0,3,1,0,0,2,3,0,0,-1,0,1,-1,0,-1,-1,-1,-1,1
70915,28,1914,48,9,5,0,20,6,0,0,1,0,0,2,0,0,0,-1,0,1,-1,0,-1,-1,-1,-1,1


# Checking whether drift is present using the report.json file

In [71]:
# Location of a saved drift report, relative to the current working directory.
# Built with os.path.join so the separators match the host OS instead of being
# hard-coded Windows backslashes; on Windows the resulting string is unchanged.
path = os.path.join(
    "phishing_domain_detection",
    "artifacts",
    "data_validation",
    "2022-08-17_17-17-44",
    "report.json",
)

In [72]:
import json

In [76]:
# Parse the drift report; the context manager closes the file handle once the
# JSON has been read instead of leaving it open for the life of the kernel.
with open(path) as report_file:
    drift_report = json.load(report_file)

In [78]:
# Inspect the top-level keys of the drift report.
drift_report.keys()

dict_keys(['data_drift', 'timestamp'])

In [84]:
# Read the dataset-level drift flag from the nested report structure.
drift_report['data_drift']['data']['metrics']['dataset_drift']

False