In [32]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

IMPORTING ALL THE REQUIRED LIBRARIES

In [33]:
# Per-algorithm accumulators, one slot for each of the five classifiers
# evaluated later in the notebook. Each name gets its own list object so
# that updating one metric never touches another.
average_overall_error_rate = [0] * 5
average_false_positive_rate = [0] * 5
average_false_negative_rate = [0] * 5

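WE CREATE THE 'average_overall_error_rate', 'average_false_positive_rate' AND 'average_false_negative_rate' LISTS (ONE ENTRY PER CLASSIFICATION ALGORITHM) SO AS TO KEEP TRACK OF THE PERFORMANCE OF EACH CLASSIFICATION ALGORITHM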

In [34]:
# Read the CSV from the working directory, then split it column-wise:
# every column except the last becomes the feature matrix X and the
# last column becomes the label vector Y (both as NumPy arrays).
dataset = pd.read_csv('spambase_data.csv')
feature_columns, label_column = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X, Y = feature_columns.values, label_column.values

IMPORTING THE DATASET
AND STORING ALL THE INDEPENDENT VARIABLES IN 'X' (This contains the 57 features of all 4601 mails, which we will use to predict whether a mail is spam or not)
AND STORING THE DEPENDENT VARIABLE IN 'Y' (This contains the answer in terms of 1 or 0, denoting whether the mail was spam or not, for all the 4601 mails)

In [35]:
# Replace every NaN in X with the mean of its column. The column means
# are learned from the full X here, i.e. before any train/test splitting
# done further down the notebook.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

THE 'SimpleImputer' CLASS REPLACES ALL THE MISSING VALUES IN THE DATASET WITH THE MEAN OF ALL THE VALUES IN THAT COLUMN. (This gives us a rough estimate of what the value could be and also keeps missing values from preventing the classification model from deriving important relationships between the features and the output)
THE IMPUTER IS THEN FITTED AND APPLIED ON OUR INDEPENDENT VARIABLES, AND THE UPDATED VALUES ARE STORED BACK IN 'X'

In [36]:
# Names of the classifiers to compare, in evaluation order. The position
# of a name in this list is also its index into the per-algorithm
# metric lists.
Classification_Algorithms = [
    'Logistic_Regression',
    'K_Nearest_Neighbors',
    'Support_Vector_Machine',
    'Naive_Bayes',
    'Random_Forest',
]

I STORE ALL THE ALGORITHMS I CHOOSE TO APPLY ON THE DATASET IN A LIST



In [37]:
# Index of the algorithm currently being evaluated; used to address the
# per-algorithm metric lists.
i = 0

# 5-fold splitter. Shuffling with a fixed seed makes the fold assignment
# reproducible across runs.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

I CALL THE KFOLD FUNCTION SO THAT LATER I CAN USE IT TO SPLIT OUR DATASET INTO 5 FOLDS
THE 'i' VARIABLE IS INITIALIZED TO KEEP TRACK OF WHICH CLASSIFICATION ALGORITHM IS BEING USED IN AN UPCOMING FOR LOOP

In [38]:
# Evaluate every algorithm in Classification_Algorithms with k-fold
# cross-validation and store its mean overall error rate, false positive
# rate and false negative rate in the three `average_*` lists.
#
# The cell is safe to re-run on its own: the algorithm index comes from
# enumerate() and each algorithm's averages are overwritten rather than
# accumulated on top of whatever a previous run left in the lists.

# Map each algorithm name to a zero-argument factory so that only the
# classifier being evaluated is constructed. An unknown name raises
# KeyError instead of silently reusing the previous classifier.
classifier_factories = {
    'Logistic_Regression': lambda: LogisticRegression(random_state=1),
    'K_Nearest_Neighbors': lambda: KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    'Support_Vector_Machine': lambda: SVC(kernel='rbf', random_state=1),
    'Naive_Bayes': lambda: GaussianNB(),
    'Random_Forest': lambda: RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1),
}

# Take the number of folds from the splitter itself so the averages stay
# correct if n_splits is ever changed.
n_folds = kfold.get_n_splits()

for i, ca in enumerate(Classification_Algorithms):
    print('\n')
    classifier = classifier_factories[ca]()

    # Running sums over the folds for this algorithm only.
    error_total = 0.0
    fpr_total = 0.0
    fnr_total = 0.0

    # `counter` is the 1-based number of the fold currently held out as the test set.
    for counter, (train_index, test_index) in enumerate(kfold.split(X), start=1):
        print('\n')

        # Split the dataset for this fold.
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Feature scaling: statistics are learned on the training part only
        # and then applied unchanged to the test part.
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        # Train the classifier and predict the held-out fold.
        classifier.fit(X_train, Y_train)
        y_pred = classifier.predict(X_test)

        # labels=[0, 1] pins the matrix to 2x2 even if a fold happens to
        # contain a single class, so the four-way unpack cannot fail.
        # Assumes the target column is encoded as 0 (not spam) / 1 (spam).
        tn, fp, fn, tp = confusion_matrix(Y_test, y_pred, labels=[0, 1]).ravel()

        false_positive_rate = fp / (fp + tn)
        false_negative_rate = fn / (fn + tp)
        overall_error_rate = (fp + fn) / (tn + fp + fn + tp)

        print('Algorithm: ' + ca
              + ' | Fold no: ' + str(counter)
              + ' | false_positive_rate: ' + str(false_positive_rate)
              + ' | false_negative_rate: ' + str(false_negative_rate)
              + ' | overall_error_rate: ' + str(overall_error_rate))

        error_total += overall_error_rate
        fpr_total += false_positive_rate
        fnr_total += false_negative_rate

    # Mean over the folds, written (not added) into this algorithm's slot.
    average_overall_error_rate[i] = error_total / n_folds
    average_false_positive_rate[i] = fpr_total / n_folds
    average_false_negative_rate[i] = fnr_total / n_folds






Algorithm: Logistic_Regression | Fold no: 1 | false_positive_rate: 0.040780141843971635 | false_negative_rate: 0.12324929971988796 | overall_error_rate: 0.0727470141150923


Algorithm: Logistic_Regression | Fold no: 2 | false_positive_rate: 0.0299625468164794 | false_negative_rate: 0.15025906735751296 | overall_error_rate: 0.08043478260869565


Algorithm: Logistic_Regression | Fold no: 3 | false_positive_rate: 0.06398537477148081 | false_negative_rate: 0.12332439678284182 | overall_error_rate: 0.08804347826086957


Algorithm: Logistic_Regression | Fold no: 4 | false_positive_rate: 0.05226480836236934 | false_negative_rate: 0.11271676300578035 | overall_error_rate: 0.075


Algorithm: Logistic_Regression | Fold no: 5 | false_positive_rate: 0.050966608084358524 | false_negative_rate: 0.06267806267806268 | overall_error_rate: 0.05543478260869565




Algorithm: K_Nearest_Neighbors | Fold no: 1 | false_positive_rate: 0.05673758865248227 | false_negative_rate: 0.1400560224089636 | overall

I RUN A LOOP FOR EACH ALGORITHM WE WISH TO APPLY ON OUR DATASET.
THE COUNTER VARIABLE IS USED TO KEEP TRACK OF WHICH FOLD IS CURRENTLY THE TEST SET. FOR EXAMPLE: IF COUNTER = 1, THEN FOLD 1 IS THE TEST SET WHILE THE REST OF THE FOLDS ARE IN THE TRAINING SET.
I USE IF STATEMENT TO SEE WHICH CLASSIFICATION ALGORITHM IS TO BE PERFORMED CURRENTLY ON THE DATASET AND ACCORDINGLY I CALL THE ASSOCIATED FUNCTION.
THEN TO DIVIDE OUR 'X' AND 'Y' VARIABLE INTO TRAINING AND TESTING SET FOR EACH FOLD I UTILISE THE KFOLD SPLIT FUNCTION .
THE COUNTER IS INCREMENTED IN EACH LOOP OF KFOLD SPLIT TO KEEP TRACK OF WHICH ROUND OF FOLD IS CURRENTLY GOING ON.
THEN WE USE THE OUTPUT OF THE KFOLD SPLIT FUNCTION, TO DIVIDE OUR X AND Y VARIABLE INTO TRAINING AND TESTING SET 
(THE OUTPUT OF THE FUNCTION IS 2 VARIABLES, ONE CORRESPONDING TO ALL THE INDEXES IN THE X AND Y VARIABLE THAT ARE PART OF THE TRAINING SET, THE OTHER CORRESPONDING TO ALL THE INDEXES IN THE X AND Y VARIABLE THAT ARE PART OF THE TESTING SET)
THEN I APPLY FEATURE SCALING ON THE TRAINING PART OF THE X VARIABLE SO AS TO CHANGE THE RANGE OF THE VALUES STORED IN IT, WHICH IN TURN HELPS THE MODEL TRAIN BETTER
AFTER THIS I USE THE SAME SCALING TECHNIQUE ON THE TEST PART OF THE X VARIABLE SO THAT THE MODEL CAN USE ITS PREVIOUS KNOWLEDGE PROPERLY WHILE PREDICTING OUTPUT FOR THESE VALUES.
NOW I TRAIN THE CLASSIFIER BASED ON THE TRAINING PORTION OF THE X AND Y VARIABLE.
AFTER WHICH I USE THE TRAINED MODEL TO PREDICT THE OUTPUT FOR THE TEST PORTION OF THE X VARIABLES
I USE THIS PREDICTION TO DEVELOP A CONFUSION MATRIX ALONG WITH THE ACTUAL OUTPUT OF THE TEST(I.E, THE TEST PORTION OF THE Y VARIABLES)
THEN I USE THE CONFUSION MATRIX TO RETRIEVE THE TRUE POSITIVE, FALSE POSITIVE, TRUE NEGATIVE AND FALSE NEGATIVE COUNTS FOR OUR PREDICTION.
I USE THESE TO CALCULATE THE FALSE_POSITIVE_RATE, FALSE_NEGATIVE_RATE AND OVERALL_ERROR_RATE OF OUR ALGORITHM FOR A PARTICULAR FOLD AND THEN PRINT THOSE RESULTS IN A TABULAR FASHION.
I ALSO STORE THESE VALUES IN OUR LISTS THAT WE HAD INITIALIZED EARLIER TO KEEP TRACK OF ALL THE ALGORITHMS' OVERALL PERFORMANCE.
AFTER EXITING THE KFOLD LOOP I DIVIDE THE LIST VALUES BY 5 SO AS TO GET THE AVERAGE PERFORMANCE OF EACH ALGORITHM ACROSS 5 FOLDS.

In [39]:
print('\n')

# Pick the algorithm with the smallest mean overall error rate; on a tie
# the earliest entry in the list wins.
min_index = min(range(len(average_overall_error_rate)), key=average_overall_error_rate.__getitem__)
min_value = average_overall_error_rate[min_index]

# Assemble the one-line summary from its four labelled parts.
best_summary = ' | '.join([
    'Best Classification Algorithm: ' + Classification_Algorithms[min_index],
    'Average Overall Error Rate: ' + str(average_overall_error_rate[min_index]),
    'Average False Positive Rate: ' + str(average_false_positive_rate[min_index]),
    'Average False Negative Rate: ' + str(average_false_negative_rate[min_index]),
])
print(best_summary)




Best Classification Algorithm: Random_Forest | Average Overall Error Rate: 0.04433885663031676 | Average False Positive Rate: 0.027362256473534914 | Average False Negative Rate: 0.07019545280323523


I DECIDED TO CHOOSE THE ALGORITHM THAT HAD THE LOWEST AVERAGE OVERALL ERROR.
I USED THE MIN FUNCTION TO FIND THE LOWEST VALUE IN THE 'average_overall_error_rate' LIST
THEN I USED THIS VALUE TO FIND ITS INDEX POSITION IN THE LIST.
USING THE INDEX POSITION I PRINT THE BEST PERFORMING ALGORITHM AND ITS OVERALL RESULTS