In [1]:
import pandas as pd
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



In [2]:
def mean_score(scoring):
    """Average every entry of a score mapping.

    Parameters
    ----------
    scoring : mapping
        Maps a name to an object exposing ``.mean()``; in this notebook it is
        the dict returned by ``cross_validate`` (one array per timing/metric).

    Returns
    -------
    dict
        Same keys, in the same order, each mapped to the mean of its values.
    """
    averaged = {}
    for metric_name, fold_values in scoring.items():
        averaged[metric_name] = fold_values.mean()
    return averaged

In [3]:
# dataset.csv is expected one level above the current working directory,
# so resolve that parent folder to an absolute path first.
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, os.pardir))

# Build the full path to the CSV and load it into a DataFrame.
filename = os.path.join(parent_dir, "dataset.csv")
df = pd.read_csv(filename)

In [4]:
# Display the column labels of the DataFrame loaded from dataset.csv.
df.columns 

Index(['Current Domain Age', 'Matching Domain Name', 'Length of Domain',
       'Result'],
      dtype='object')

In [5]:
# Collect the distinct values found in each column so that unexpected
# entries stand out before any cleaning is done.
(
    unique_Domain_Age,
    unique_Domain_Name,
    unique_Length_Domain,
    unique_R,
) = (
    df[column_name].unique()
    for column_name in (
        'Current Domain Age',
        'Matching Domain Name',
        'Length of Domain',
        'Result',
    )
)

# One array per line, in column order.
print(
    unique_Domain_Age,
    unique_Domain_Name,
    unique_Length_Domain,
    unique_R,
    sep="\n",
)

['1' '-1' '0' 'Current Domain Age']
['1' '-1' '0' 'Matching Domain Name']
['1' '-1' '0' 'Length of Domain']
['1' 'Result' '-1']


In [6]:
# The only codes accepted in any column.
validValues = {-1, 0, 1}

# Coerce every column to numeric in a single pass. Cells that are not numbers
# (the unique-value output above shows column names repeated inside the data)
# become NaN instead of raising.
numeric_df = df.apply(pd.to_numeric, errors='coerce')

# A row is kept only when *every* column holds a valid code. NaN never matches,
# so the coerced junk rows are dropped by the same mask.
rows_ok = numeric_df.isin(list(validValues)).all(axis=1)

# Filter once and cast to int. Building the result from a separate frame avoids
# assigning into a boolean-filtered slice of `df` inside a loop, which is what
# triggers pandas' SettingWithCopyWarning. The original index is preserved.
df = numeric_df.loc[rows_ok].astype(int)

In [7]:
# Print the DataFrame after cleaning: the previous cell keeps only rows whose
# values are all in {-1, 0, 1} and casts every column to int.
print(df)

      Current Domain Age  Matching Domain Name  Length of Domain  Result
0                      1                     1                 1       1
1                      1                    -1                 1       1
2                      1                    -1                 1       1
3                     -1                    -1                -1       1
4                      1                    -1                 1       1
...                  ...                   ...               ...     ...
5702                   1                    -1                 1      -1
5703                   1                    -1                 1      -1
5704                   0                     0                 0      -1
5705                   1                    -1                 1      -1
5706                   1                    -1                 1      -1

[5706 rows x 4 columns]


In [8]:
# (rows, columns) of the DataFrame at this point in the notebook.
df.shape

(5706, 4)

In [9]:
# Fixed seed so the shuffled row order is the same on every
# Restart & Run All. The cross-validation cells pass an integer `cv`, so the
# folds are cut from this row order and an unseeded shuffle would make every
# reported score change between runs.
RANDOM_STATE = 42
df = sklearn.utils.shuffle(df, random_state=RANDOM_STATE)

# Feature matrix: every column except the label.
X = df.drop("Result", axis=1).values

# Standardize each feature column.
# NOTE(review): scale() is computed on the full dataset before
# cross-validation, so held-out folds contribute to the scaling statistics.
# Consider moving scaling into a Pipeline that is fit per fold.
X = preprocessing.scale(X)

# Label vector taken from the 'Result' column.
y = df['Result'].values
df.head()

Unnamed: 0,Current Domain Age,Matching Domain Name,Length of Domain,Result
2154,1,-1,1,1
2254,1,1,1,1
324,1,-1,1,1
2886,1,-1,1,1
3424,1,-1,1,-1


In [10]:
# Metrics requested from cross_validate: each result label maps to the
# scorer string of the same name.
scoring = {
    metric: metric
    for metric in ('accuracy', 'recall', 'precision', 'f1')
}

# Number of cross-validation folds used by every model cell.
fold_count = 10

## SVM-Linear

In [11]:
# Support-vector classifier with a linear kernel, evaluated by
# cross-validation over X / y with the shared scoring dict and fold count.
linear_clf = svm.SVC(kernel='linear')
cross_val_scores = cross_validate(
    linear_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Collapse each per-fold array (timings and test metrics) to its mean.
linear_svc_clf_score = mean_score(cross_val_scores)

# Report in a fixed order: (label shown, key in the averaged results).
print("\n".join(
    f"{label} = {linear_svc_clf_score[key]}"
    for label, key in (
        ("fit time", "fit_time"),
        ("score time", "score_time"),
        ("accuracy", "test_accuracy"),
        ("recall", "test_recall"),
        ("precision", "test_precision"),
        ("f1", "test_f1"),
    )
))

fit time = 2.6678094148635862
score time = 0.19724135398864745
accuracy = 0.640901158324884
recall = 0.834804284972941
precision = 0.6458453040798076
f1 = 0.7280842246555534


## SVM-Polynomial

In [12]:
# Support-vector classifier with a polynomial kernel, evaluated by
# cross-validation over X / y with the shared scoring dict and fold count.
poly_clf = svm.SVC(kernel='poly')
cross_val_scores = cross_validate(
    poly_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Collapse each per-fold array (timings and test metrics) to its mean.
poly_svc_clf_score = mean_score(cross_val_scores)

# Report in a fixed order: (label shown, key in the averaged results).
print("\n".join(
    f"{label} = {poly_svc_clf_score[key]}"
    for label, key in (
        ("fit time", "fit_time"),
        ("score time", "score_time"),
        ("accuracy", "test_accuracy"),
        ("recall", "test_recall"),
        ("precision", "test_precision"),
        ("f1", "test_f1"),
    )
))

fit time = 3.0520703554153443
score time = 0.2002882719039917
accuracy = 0.6265305558115956
recall = 0.9720207205871452
precision = 0.6109828568533294
f1 = 0.749881599879628


## SVM-RBF

In [13]:
# Support-vector classifier with an RBF kernel, evaluated by
# cross-validation over X / y with the shared scoring dict and fold count.
rbf_clf = svm.SVC(kernel='rbf')
cross_val_scores = cross_validate(
    rbf_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Collapse each per-fold array (timings and test metrics) to its mean.
rbf_svc_clf_score = mean_score(cross_val_scores)

# Report in a fixed order: (label shown, key in the averaged results).
print("\n".join(
    f"{label} = {rbf_svc_clf_score[key]}"
    for label, key in (
        ("fit time", "fit_time"),
        ("score time", "score_time"),
        ("accuracy", "test_accuracy"),
        ("recall", "test_recall"),
        ("precision", "test_precision"),
        ("f1", "test_f1"),
    )
))


fit time = 2.7478711128234865
score time = 0.41350958347320554
accuracy = 0.6559735152241373
recall = 0.8840870338794573
precision = 0.647668702100772
f1 = 0.7475460749869663


## SVM-Sigmoid

In [14]:
# Support-vector classifier with a sigmoid kernel, evaluated by
# cross-validation over X / y with the shared scoring dict and fold count.
sigmoid_clf = svm.SVC(kernel='sigmoid')
cross_val_scores = cross_validate(
    sigmoid_clf,
    X,
    y,
    cv=fold_count,
    scoring=scoring,
)

# Collapse each per-fold array (timings and test metrics) to its mean.
sigmoid_svc_clf_score = mean_score(cross_val_scores)

# Report in a fixed order: (label shown, key in the averaged results).
print("\n".join(
    f"{label} = {sigmoid_svc_clf_score[key]}"
    for label, key in (
        ("fit time", "fit_time"),
        ("score time", "score_time"),
        ("accuracy", "test_accuracy"),
        ("recall", "test_recall"),
        ("precision", "test_precision"),
        ("f1", "test_f1"),
    )
))

fit time = 4.965294027328492
score time = 0.4147202491760254
accuracy = 0.5923587427412664
recall = 0.9224247535028542
precision = 0.5941953160911494
f1 = 0.722750028533593
