# Feature Generation

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import pydnsbl
import re
from spam_lists import SPAMHAUS_DBL
import validators
import spf
import socket

### Read file

In [2]:
#reading the .csv file and declare datatype to avoid low memory issues
df = pd.read_csv("preprocessed.csv",dtype='unicode')

## Add new features to the dataset

###  check if the subject contains sepcial characters

In [3]:
def if_special(x):
    special_characters = "]!@#$%^&*()-+?_=,<>/["
    for c in x :
        if c in special_characters:
            return 1
    return 0

In [4]:
df['special_characters_exists_subject'] = df['Subject'].apply(if_special)

### count number of words in subject

In [5]:
def count_num_words(x):
    w = x.split(" ")
    return len(w)

In [6]:
df['number_of_words_subject'] = df['Subject'].apply(count_num_words)

### calculate total time

from hopper import Hopper


### count number of capitalized words in subject

In [7]:
def count_num_cap_words(x):
    w = x.split(" ")
    count = 0
    for i in w:
        if i.isupper():
            count+=1
    return count

In [8]:
df['number_of_capitalized_words_subject'] = df['Subject'].apply(count_num_cap_words)

### count number of capitalized characters in subject

In [9]:
def count_num_cap_char(x):
    count = 0
    for i in x:
        if i.isupper():
            count+=1
    return count

In [10]:
df['number_of_capitalized_characters_subject'] = df['Subject'].apply(count_num_cap_char)

### count number of digits in subject

In [11]:
def count_digit(x):
    count=0
    for i in x:
        if i.isdigit():
            count+=1
    return count

In [12]:
df['number_of_digits_subject'] = df['Subject'].apply(count_digit)

### count number of characters in subject

In [13]:
def count_num_char(x):
    count=0
    for i in x:
        if i.isalpha():
            count+=1
    return count

In [14]:
df['number_of_characters_subject'] = df['Subject'].apply(count_num_char)

###  Total number of spaces in Subject 

In [15]:
def count_space(x):
    count=0
    for i in x:
        if i.isspace():
            count+=1
    return count

In [16]:
df['number_of_spaces_subject'] = df['Subject'].apply(count_space)

### Total number of special characters in Subject 

In [17]:
def count_special(x):
    special_characters = "]!@#$%^&*()-+?_=,<>/["
    return len([c for c in x if c in special_characters])

In [18]:
df['number_of_special_characters_subject'] = df['Subject'].apply(count_special)

### Number of single quotes in Subject 

In [19]:
def singleQuote(x):
    count = 0
    for res in x:
        if "'" in res:
            count+=1
    save = count/2
    return save

In [20]:
df['number_of_single_Quotes_subject'] = df['Subject'].apply(singleQuote)

### Number of semi-colons in Subject 

In [21]:
def count_num_semiColon(x):
    count = 0
    for i in x:
        if ';' in i:
            count+=1
    return count


In [22]:
df['number_of_semiColon_subject'] = df['Subject'].apply(count_num_semiColon)

### Ratio upper case  / lower case in subject

In [23]:
def ratio_upperCase_lowerCae(x):

    countUpp =0
    countLow =0

    save = x.split(" ")
    for i in save:
        if i.isupper():
            countUpp+=1
        else:
            countLow+=1

    ratio = countUpp/countLow

    return ratio

In [24]:
df['ratio_of_uppercase/lowercase_words'] = df['Subject'].apply(ratio_upperCase_lowerCae)

### Total number of uppercase words 

In [25]:
def upperCase(x):
    count = 0
    save = x.split(" ")
    for i in save:
        if i.isupper():
            count+=1
    return count


In [26]:
df['Total_number_of_upperCase'] = df['Subject'].apply(upperCase)

### Max. Word length (number of characters on the longest word in Subject) 

In [27]:
def MaxWordLength(str): 
    strLen = len(str) 
    save = 0; currentLength = 0
      
    for i in range(0, strLen): 
        if (str[i] != ' '): 
            currentLength += 1
        else: 
            save = max(save, currentLength) 
            currentLength = 0

    return max(save, currentLength) 

In [28]:
df['Max_word_length_in_subject'] = df['Subject'].apply(MaxWordLength)

## Check SPF Valid

In [36]:
df['new_email'] = df['From'].str.extract(r'([\w\.-]+@[\w\.-]+)')
df['domain'] = df['new_email'].apply(str).str.split('@').str[1]

In [None]:
#trying to speed up the code
stored_ip = dict()
list_domains = df['domain'].astype(str).values.tolist()
j=0
for i in list_domains:
    j = j+1
    print(j)
    if(stored_ip.get(i)==None and i != "nan" and i != "" and i!= " "):
        try:
            stored_ip[i] = socket.gethostbyname(i)
        except:
            stored_ip[i] = "unknown"
            continue

In [None]:
import json

#writing dict to file
#with open('dictionary.txt', 'w') as file:
     #file.write(json.dumps(stored_ip))
#reading dict from file
with open('dictionary.txt') as f: 
    data = f.read()
saved_ip = json.loads(data) 
f.close()

In [None]:
def spf_validate(new_email,domain):
    if(saved_ip.get(domain) != None and saved_ip.get(domain)!="unknown"):
        try:
            i=saved_ip.get(domain)
            s=new_email
            h=domain
            temp_check = spf.check2(i,s,h)
            if(temp_check[0]== 'pass'):
                print(temp_check)
                return 1
            else:
                return 0
        except:
            return 0
    else:
        return 0
df['spf_valid'] = df.apply(lambda row: spf_validate(row['new_email'], row['domain']), axis=1)

## Checking Black list

In [None]:
def check_blackListed(domain):
    print(domain)
    try:
        if(domain == 'nan' or domain == "" or domain ==" "):
            return 0
        else:
            if(domain in SPAMHAUS_DBL):
                return 1
            else:
                return 0
    except:
        return 0

In [None]:
df['blackListed'] = df.apply(lambda row: check_blackListed(row['domain']),axis=1)

## Validating Date

In [29]:
def validate_date():
    df['Date'] = df['Date'].str[:-2]
    #validating date after converting it to datetime
    df['new_date'] = pd.to_datetime(df['Date'],errors="coerce")
    df['validate_date'] = np.where(df['new_date']< datetime.now(), True, False)
validate_date()


## Length of Subject

In [30]:
def find_length_sub():
    df['Subject_length']  = df['Subject'].str.len()
find_length_sub()

In [32]:
df

Unnamed: 0,Return-Path,Message-ID,From,Reply-To,To,Submitting Host,Subject,Date,X-Mailer,MIME-Version,...,number_of_spaces_subject,number_of_special_characters_subject,number_of_single_Quotes_subject,number_of_semiColon_subject,ratio_of_uppercase/lowercase_words,Total_number_of_upperCase,Max_word_length_in_subject,new_date,validate_date,Subject_length
0,<Phoebesiemensclaudia@victorytulsa.org>\r,unknown,"""Gilda Isaac"" <Phoebesiemensclaudia@victorytu...",unknown,theorize@plg.uwaterloo.ca\r,riffle,alan liverpool alkane\r,"Sun, 17 Jun 2007",unknown,0,...,4,0,0.0,0,0.000000,0,9,2007-06-17,True,23
1,<samba-cvs-bounces+ktwarwic=speedy.uwaterloo....,unknown,jpeach@samba.org\r,samba-technical@lists.samba.org\r,samba-cvs@samba.org\r,dp.samba.org (localhost [127.0.0.1]),svn commit,2007-06-13,unknown,0,...,2,0,0.0,0,0.000000,0,6,2007-06-13,True,11
2,<r-help-bounces@stat.math.ethz.ch>\r,<C2779203.E6%r.turner@auckland.ac.nz>\r,Rolf Turner <r.turner@auckland.ac.nz>\r,unknown,Rhelp <r-help@stat.math.ethz.ch>\r,130.216.104.237 ([130.216.104.237]) by,[R] quartz() on MAC OSX\r,"Mon, 21 May 2007",unknown,0,...,6,4,0.0,0,1.000000,3,8,2007-05-21,True,25
3,<qepcw9y7j@plumcopper.com>\r,<462809ee-qepcw9y7j@plumcopper.com>\r,"""David"" <qepcw9y7j@plumcopper.com>\r",unknown,"""Subscriber"" <producttestpanel@speedy.uwaterl...",plumcopper.com (plumcopper.com [208.66.235.190]),Final Attempt,"Thu, 19 Apr 2007",unknown,1.0\r,...,2,0,0.0,0,0.000000,0,7,2007-04-19,True,14
4,<speakup-bounces@braille.uwo.ca>\r,unknown,Shane <shane-keyword-speakup.aca783@cm.nu>\r,"""Speakup is a screen review system for Linux.""\r","""Speakup is a screen review system for Linux....",shane by continuum.cm.nu with local (Exim 4.63),Re,"Wed, 09 May 2007",unknown,0,...,1,0,0.0,0,0.000000,0,2,2007-05-09,True,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62389,<samba-technical-bounces+ktwarwic=speedy.uwat...,unknown,Luke Howard <lukeh@padl.com>\r,lukeh@padl.com\r,jerry@samba.org\r,au.padl.com (localhost.padl.com [127.0.0.1]),Re,"Fri, 20 Apr 2007",unknown,1.0\r,...,1,0,0.0,0,0.000000,0,2,2007-04-20,True,3
62390,<space_shuttle-jump-ktwarwic=speedy.uwaterloo...,<CE7E4CE628C79245A57591B3A970BB4A07E2A41C@NYC...,"""CBS Space News"" <space_shuttle-ktwarwic=spee...",unknown,ktwarwic@speedy.uwaterloo.ca\r,cbsig.com (web145.bc.cbsig.net [65.244.48.145]),4/11 1145a Update,"Wed, 11 Apr 2007",unknown,1.0\r,...,3,1,0.0,0,0.000000,0,6,2007-04-11,True,18
62391,<LoansAWBC@controldraw.net>\r,unknown,"""A.W.B.C_Loans"" <LoansAWBC@controldraw.net>\r",unknown,<producttestpanel@speedy.uwaterloo.ca>\r,controldraw.net (r01h05.chitay-zdes.ru [89.208...,FINANCE certificate enclosed\r,"Wed, 18 Apr 2007",unknown,0,...,4,0,0.0,0,0.333333,1,11,2007-04-18,True,30
62392,<samba-technical-bounces+ktwarwic=speedy.uwat...,unknown,Michael B Allen <mba2000@ioplex.com>\r,unknown,"""yang mikey"" <mikeyredmoon@gmail.com>\r",quark.foo.net (c-69-142-196-170.hsd1.nj.comcas...,Re,"Sun, 10 Jun 2007",Sylpheed 2.4.0 (GTK+ 2.10.4; i686-pc-linux-gn...,0,...,1,0,0.0,0,0.000000,0,2,2007-06-10,True,3


### write file

In [33]:
df.to_csv('data_with_features.csv',index=False)

In [34]:
df

Unnamed: 0,Return-Path,Message-ID,From,Reply-To,To,Submitting Host,Subject,Date,X-Mailer,MIME-Version,...,number_of_spaces_subject,number_of_special_characters_subject,number_of_single_Quotes_subject,number_of_semiColon_subject,ratio_of_uppercase/lowercase_words,Total_number_of_upperCase,Max_word_length_in_subject,new_date,validate_date,Subject_length
0,<Phoebesiemensclaudia@victorytulsa.org>\r,unknown,"""Gilda Isaac"" <Phoebesiemensclaudia@victorytu...",unknown,theorize@plg.uwaterloo.ca\r,riffle,alan liverpool alkane\r,"Sun, 17 Jun 2007",unknown,0,...,4,0,0.0,0,0.000000,0,9,2007-06-17,True,23
1,<samba-cvs-bounces+ktwarwic=speedy.uwaterloo....,unknown,jpeach@samba.org\r,samba-technical@lists.samba.org\r,samba-cvs@samba.org\r,dp.samba.org (localhost [127.0.0.1]),svn commit,2007-06-13,unknown,0,...,2,0,0.0,0,0.000000,0,6,2007-06-13,True,11
2,<r-help-bounces@stat.math.ethz.ch>\r,<C2779203.E6%r.turner@auckland.ac.nz>\r,Rolf Turner <r.turner@auckland.ac.nz>\r,unknown,Rhelp <r-help@stat.math.ethz.ch>\r,130.216.104.237 ([130.216.104.237]) by,[R] quartz() on MAC OSX\r,"Mon, 21 May 2007",unknown,0,...,6,4,0.0,0,1.000000,3,8,2007-05-21,True,25
3,<qepcw9y7j@plumcopper.com>\r,<462809ee-qepcw9y7j@plumcopper.com>\r,"""David"" <qepcw9y7j@plumcopper.com>\r",unknown,"""Subscriber"" <producttestpanel@speedy.uwaterl...",plumcopper.com (plumcopper.com [208.66.235.190]),Final Attempt,"Thu, 19 Apr 2007",unknown,1.0\r,...,2,0,0.0,0,0.000000,0,7,2007-04-19,True,14
4,<speakup-bounces@braille.uwo.ca>\r,unknown,Shane <shane-keyword-speakup.aca783@cm.nu>\r,"""Speakup is a screen review system for Linux.""\r","""Speakup is a screen review system for Linux....",shane by continuum.cm.nu with local (Exim 4.63),Re,"Wed, 09 May 2007",unknown,0,...,1,0,0.0,0,0.000000,0,2,2007-05-09,True,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62389,<samba-technical-bounces+ktwarwic=speedy.uwat...,unknown,Luke Howard <lukeh@padl.com>\r,lukeh@padl.com\r,jerry@samba.org\r,au.padl.com (localhost.padl.com [127.0.0.1]),Re,"Fri, 20 Apr 2007",unknown,1.0\r,...,1,0,0.0,0,0.000000,0,2,2007-04-20,True,3
62390,<space_shuttle-jump-ktwarwic=speedy.uwaterloo...,<CE7E4CE628C79245A57591B3A970BB4A07E2A41C@NYC...,"""CBS Space News"" <space_shuttle-ktwarwic=spee...",unknown,ktwarwic@speedy.uwaterloo.ca\r,cbsig.com (web145.bc.cbsig.net [65.244.48.145]),4/11 1145a Update,"Wed, 11 Apr 2007",unknown,1.0\r,...,3,1,0.0,0,0.000000,0,6,2007-04-11,True,18
62391,<LoansAWBC@controldraw.net>\r,unknown,"""A.W.B.C_Loans"" <LoansAWBC@controldraw.net>\r",unknown,<producttestpanel@speedy.uwaterloo.ca>\r,controldraw.net (r01h05.chitay-zdes.ru [89.208...,FINANCE certificate enclosed\r,"Wed, 18 Apr 2007",unknown,0,...,4,0,0.0,0,0.333333,1,11,2007-04-18,True,30
62392,<samba-technical-bounces+ktwarwic=speedy.uwat...,unknown,Michael B Allen <mba2000@ioplex.com>\r,unknown,"""yang mikey"" <mikeyredmoon@gmail.com>\r",quark.foo.net (c-69-142-196-170.hsd1.nj.comcas...,Re,"Sun, 10 Jun 2007",Sylpheed 2.4.0 (GTK+ 2.10.4; i686-pc-linux-gn...,0,...,1,0,0.0,0,0.000000,0,2,2007-06-10,True,3
