# Feature Generation

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import re
from spam_lists import SPAMHAUS_DBL
import spf
import checkdmarc

### Read file

In [2]:
# Read the preprocessed dataset. dtype='unicode' asks pandas to load the
# columns as text instead of inferring a dtype per column, which the original
# author notes avoids low-memory issues.
df = pd.read_csv("preprocessed.csv",dtype='unicode')

## Add new features to the dataset

###  check if the subject contains special characters

In [15]:
def if_special(x):
    """Return 1 if ``x`` contains at least one special character, else 0.

    The characters treated as special are exactly those listed in
    ``special_characters`` below.
    """
    special_characters = "]!@#$%^&*()-+?_=,<>/["
    found = any(ch in special_characters for ch in x)
    return 1 if found else 0

In [16]:
# Apply if_special to every value of the 'Subject' column and store the
# result in a new column.
df['special_characters_exists_subject'] = df['Subject'].apply(if_special)

### count number of words in subject

In [17]:
def count_num_words(x):
    """Return the number of whitespace-separated words in ``x``.

    ``str.split()`` with no argument splits on runs of whitespace, so
    repeated, leading or trailing spaces do not produce empty "words", and
    an empty or blank string counts as 0 words rather than 1.
    """
    return len(x.split())

In [18]:
# Store the result of count_num_words for every value of the 'Subject' column.
df['number_of_words_subject'] = df['Subject'].apply(count_num_words)

### calculate total time

from hopper import Hopper


### count number of capitalized words in subject

In [19]:
def count_num_cap_words(x):
    """Count the tokens of ``x`` (split on single spaces) that are upper-case.

    A token counts when ``str.isupper()`` is true for it.
    """
    tokens = x.split(" ")
    return sum(1 for token in tokens if token.isupper())

In [20]:
# Store the result of count_num_cap_words for every value of the 'Subject' column.
df['number_of_capitalized_words_subject'] = df['Subject'].apply(count_num_cap_words)

### count number of capitalized characters in subject

In [21]:
def count_num_cap_char(x):
    """Return how many characters of ``x`` are upper-case (``str.isupper()``)."""
    return sum(1 for ch in x if ch.isupper())

In [22]:
# Store the result of count_num_cap_char for every value of the 'Subject' column.
df['number_of_capitalized_characters_subject'] = df['Subject'].apply(count_num_cap_char)

### count number of digits in subject

In [23]:
def count_digit(x):
    """Return how many characters of ``x`` are digits (``str.isdigit()``)."""
    digits = [ch for ch in x if ch.isdigit()]
    return len(digits)

In [24]:
# Store the result of count_digit for every value of the 'Subject' column.
df['number_of_digits_subject'] = df['Subject'].apply(count_digit)

### count number of characters in subject

In [25]:
def count_num_char(x):
    """Return how many characters of ``x`` are alphabetic.

    Only characters for which ``str.isalpha()`` is true are counted, so
    digits, whitespace and punctuation are excluded.
    """
    return sum(1 for ch in x if ch.isalpha())

In [26]:
# Store the result of count_num_char for every value of the 'Subject' column.
df['number_of_characters_subject'] = df['Subject'].apply(count_num_char)

###  Total number of spaces in Subject 

In [27]:
def count_space(x):
    """Return how many characters of ``x`` are whitespace (``str.isspace()``)."""
    whitespace = [ch for ch in x if ch.isspace()]
    return len(whitespace)

In [28]:
# Store the result of count_space for every value of the 'Subject' column.
df['number_of_spaces_subject'] = df['Subject'].apply(count_space)

### Total number of special characters in Subject 

In [29]:
def count_special(x):
    """Return how many characters of ``x`` appear in ``special_characters``."""
    special_characters = "]!@#$%^&*()-+?_=,<>/["
    total = 0
    for ch in x:
        if ch in special_characters:
            total += 1
    return total

In [30]:
# Store the result of count_special for every value of the 'Subject' column.
df['number_of_special_characters_subject'] = df['Subject'].apply(count_special)

### Number of single quotes in Subject 

In [31]:
def singleQuote(x):
    """Return half the number of single-quote characters in ``x``.

    The result is a float (true division), e.g. one apostrophe yields 0.5.
    NOTE(review): the halving presumably treats quotes as opening/closing
    pairs -- confirm that this is the intended feature.
    """
    quote_total = sum(1 for ch in x if "'" in ch)
    return quote_total / 2

In [32]:
# Store the result of singleQuote for every value of the 'Subject' column.
df['number_of_single_Quotes_subject'] = df['Subject'].apply(singleQuote)

### Number of semi-colons in Subject 

In [33]:
def count_num_semiColon(x):
    """Return the number of semicolon characters found while iterating ``x``."""
    hits = [ch for ch in x if ';' in ch]
    return len(hits)


In [34]:
# Store the result of count_num_semiColon for every value of the 'Subject' column.
df['number_of_semiColon_subject'] = df['Subject'].apply(count_num_semiColon)

### Ratio upper case  / lower case in subject

In [35]:
def ratio_upperCase_lowerCae(x):
    """Return the ratio of upper-case words to all other words in ``x``.

    A word is "upper-case" when ``str.isupper()`` is true for it; every other
    word (lower-case, mixed-case, digits-only, ...) goes into the denominator.
    Words are split on runs of whitespace so that repeated, leading or
    trailing spaces do not add empty tokens to the denominator.

    Returns:
        float: ``upper / other``. When there are no "other" words the ratio
        is undefined; this returns ``float("inf")`` if at least one
        upper-case word exists and ``0.0`` if ``x`` has no words at all,
        instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    upper_total = sum(1 for word in words if word.isupper())
    other_total = len(words) - upper_total

    if other_total == 0:
        # Guard the division: an all-caps subject has no denominator.
        return float("inf") if upper_total else 0.0

    return upper_total / other_total

In [36]:
# Store the result of ratio_upperCase_lowerCae for every value of the 'Subject' column.
df['ratio_of_uppercase/lowercase_words'] = df['Subject'].apply(ratio_upperCase_lowerCae)

### Total number of uppercase words 

In [37]:
def upperCase(x):
    """Return the number of upper-case tokens in ``x``.

    ``x`` is split on single spaces and a token counts when
    ``str.isupper()`` is true for it.
    """
    upper_tokens = [word for word in x.split(" ") if word.isupper()]
    return len(upper_tokens)


In [38]:
# Store the result of upperCase for every value of the 'Subject' column.
df['Total_number_of_upperCase'] = df['Subject'].apply(upperCase)

### Max. word length (number of characters in the longest word in Subject)

In [39]:
def MaxWordLength(str):
    """Return the length of the longest run of non-space characters in ``str``.

    Only the plain space character ``' '`` separates words; an empty input
    yields 0.
    """
    longest = 0
    run = 0
    for ch in str:
        if ch == ' ':
            # A space closes the current word; keep the best length so far.
            longest = max(longest, run)
            run = 0
        else:
            run += 1

    # The final word is not followed by a space, so compare once more.
    return max(longest, run)

In [40]:
# Store the result of MaxWordLength for every value of the 'Subject' column.
df['Max_word_length_in_subject'] = df['Subject'].apply(MaxWordLength)

## Check SPF Valid

In [6]:
# Pull the first "local@host"-shaped token out of the 'From' header.
df['new_email'] = df['From'].str.extract(r'([\w\.-]+@[\w\.-]+)')
# Keep what follows the '@' as the sender domain. Values are passed through
# str() first; presumably rows without an extracted address end up with a
# missing domain -- verify before relying on it.
df['domain'] = df['new_email'].apply(str).str.split('@').str[1]


In [42]:
# Per-domain cache of lookup results, so each domain is queried only once.
stored_spf = {}


def check_spf_valid(domain):
    """Return 1 if a DNS policy record can be fetched for ``domain``, else 0.

    NOTE(review): despite the name, the lookup below calls
    ``checkdmarc.get_dmarc_record`` (a DMARC query), not an SPF query --
    confirm which record the ``spf_valid`` feature is meant to reflect.

    Args:
        domain: Sender domain. Non-string values (e.g. NaN for a missing
            domain) and blank / ``'nan'`` strings are treated as "no domain".

    Returns:
        int: 1 when the lookup succeeds, 0 when it raises or when there is no
        usable domain. Results are memoised in ``stored_spf``.
    """
    # Missing domains must not reach the resolver or pollute the cache.
    if not isinstance(domain, str) or domain.strip() in ('', 'nan'):
        return 0

    cached = stored_spf.get(domain)
    if cached is not None:
        return cached

    try:
        checkdmarc.get_dmarc_record(domain, nameservers=["1.1.1.1"])
    except Exception:
        # Any lookup/validation failure counts as "not valid". Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop a long run.
        result = 0
    else:
        result = 1

    stored_spf[domain] = result
    return result

In [None]:
# Only the 'domain' column is needed, so apply the check to that Series
# directly instead of iterating whole rows with DataFrame.apply(axis=1).
df['spf_valid'] = df['domain'].apply(check_spf_valid)

## Checking Black list

In [6]:
# Per-domain cache of blacklist results, so each domain is queried only once.
stored_val = {}


def check_blackListed(domain):
    """Return 1 if ``domain`` is listed in ``SPAMHAUS_DBL``, else 0.

    Args:
        domain: Sender domain. Non-string values (e.g. NaN for a missing
            domain) and blank / ``'nan'`` strings are treated as "no domain".

    Returns:
        int: 1 when the domain is listed, 0 when it is not, when there is no
        usable domain, or when the lookup raises. Successful lookups are
        memoised in ``stored_val``; failed lookups are not cached, so a later
        call retries them.
    """
    # Missing domains must not reach the lookup or pollute the cache.
    if not isinstance(domain, str) or domain.strip() in ('', 'nan'):
        return 0

    cached = stored_val.get(domain)
    if cached is not None:
        return cached

    try:
        listed = domain in SPAMHAUS_DBL
    except Exception:
        # Best effort: treat a failed lookup as "not listed". Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop a long run.
        return 0

    result = 1 if listed else 0
    stored_val[domain] = result
    return result
   

In [None]:
# Only the 'domain' column is needed, so apply the check to that Series
# directly instead of iterating whole rows with DataFrame.apply(axis=1).
df['blackListed'] = df['domain'].apply(check_blackListed)

victorytulsa.org
False
samba.org
False
auckland.ac.nz
False
plumcopper.com
False
cm.nu
False


## Validating Date

In [None]:
def validate_date():
    """Parse the 'Date' column and flag rows whose date lies in the past.

    Works on the module-level ``df`` in place:
      * 'Date' is overwritten with each value minus its last two characters;
      * 'new_date' receives the parsed datetimes (values that fail to parse
        are coerced rather than raising);
      * 'validate_date' is 1 where 'new_date' is earlier than the current
        local time and 0 otherwise.

    NOTE(review): because 'Date' is overwritten, calling this again trims two
    more characters from every value.
    """
    trimmed = df['Date'].str[:-2]
    df['Date'] = trimmed

    # Convert to datetime before comparing against "now".
    parsed = pd.to_datetime(trimmed, errors="coerce")
    df['new_date'] = parsed

    in_the_past = parsed < datetime.now()
    df['validate_date'] = np.where(in_the_past, 1, 0)


validate_date()


## Length of Subject

In [None]:
def find_length_sub():
    """Add a 'Subject_length' column with the length of each 'Subject' value.

    Operates in place on the module-level ``df``.
    """
    subject_lengths = df['Subject'].str.len()
    df['Subject_length'] = subject_lengths


find_length_sub()

### write file

In [None]:
# Write the DataFrame, including the columns added above, to CSV; index=False
# leaves the row index out of the file.
df.to_csv('data_with_features.csv',index=False)

In [None]:
# Bare expression: shows the resulting DataFrame as the notebook cell's output.
df