Cleaning and Checking Dataset

In [4]:
# Imports used by this cell: stdlib first, then the Colab helper
import os
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below
mount_point = '/content/gdrive'
drive.mount(mount_point)

# Switch the working directory to the MyDrive folder under the mount point,
# so relative paths used afterwards resolve there
drive_workdir = '/content/gdrive/MyDrive'
os.chdir(drive_workdir)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
import pandas as pd

# Load the CSV
df = pd.read_csv("/content/Phishing_Legitimate_full 3.csv")

# 1. Basic info
print("Dataset shape:", df.shape)
print("\nColumn details:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# 2. Missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# 3. Duplicate rows
print("\nDuplicate rows count:", df.duplicated().sum())

# 4. Sample of first few rows
print("\nFirst 5 rows:")
print(df.head())

# 5. Unique values for each column (to spot weird categories)
for col in df.columns:
    print(f"\nUnique values in {col}:")
    print(df[col].unique())

# 6. Check numeric columns for unexpected strings or mixed types.
# A column that contains stray strings is not selected as numeric and would
# silently vanish from the describe() summary below, so list every
# non-numeric column explicitly before summarising the numeric ones.
numeric_cols = df.select_dtypes(include=['number']).columns
non_numeric_cols = df.select_dtypes(exclude=['number']).columns
if len(non_numeric_cols) > 0:
    print("\nNon-numeric columns (possible stray strings / mixed types):")
    print(df[non_numeric_cols].dtypes)

print("\nNumeric column summary:")
print(df[numeric_cols].describe())


Dataset shape: (10000, 50)

Column details:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  10000 non-null  int64  
 1   NumDots                             10000 non-null  int64  
 2   SubdomainLevel                      10000 non-null  int64  
 3   PathLevel                           10000 non-null  int64  
 4   UrlLength                           10000 non-null  int64  
 5   NumDash                             10000 non-null  int64  
 6   NumDashInHostname                   10000 non-null  int64  
 7   AtSymbol                            10000 non-null  int64  
 8   TildeSymbol                         10000 non-null  int64  
 9   NumUnderscore                       10000 non-null  int64  
 10  NumPercent                          10000 non-null  int64  
 11

In [6]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/Phishing_Legitimate_full 3.csv")

# Shape of dataset
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

# Column names
print("\nColumn Names:")
print(df.columns.tolist())

# Data types and null values
print("\nInfo:")
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would add a spurious "None" line to the output.
df.info()

# Missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Sample rows
print("\nSample Data:")
print(df.head())

# Class distribution.
# The column listing printed by this cell shows the target as 'CLASS_LABEL',
# not 'Label', so checking only for 'Label' skipped this section entirely.
# 'Label' is still tried first so files that use that name behave as before.
target_col = next(
    (name for name in ('Label', 'CLASS_LABEL') if name in df.columns),
    None,
)
if target_col is not None:
    print("\nClass distribution:")
    print(df[target_col].value_counts())


Rows: 10000
Columns: 50

Column Names:
['id', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash', 'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress', 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname', 'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath', 'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks', 'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms', 'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction', 'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch', 'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow', 'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT', 'PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL']

Info:
<class 'pandas.core.frame.DataFram

Class Balancing

In [11]:
import pandas as pd

# Read the phishing dataset from the Colab workspace
df = pd.read_csv("/content/Phishing_Legitimate_full 3.csv")

# Report how many rows carry each CLASS_LABEL value before any balancing
print("📊 Original CLASS_LABEL distribution:")
label_counts = df['CLASS_LABEL'].value_counts()
print(label_counts)


📊 Original CLASS_LABEL distribution:
CLASS_LABEL
1    5000
0    5000
Name: count, dtype: int64
