In [67]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

In [68]:
datapath="F:/college/bca_college/sixth sem/project/code/backend/url_dataset.csv"
df=pd.read_csv(datapath)
df.head()

Unnamed: 0,URL,Domain,Subdomain,TLD,Path,Length,NumSpecialChars,Label
0,http://www.linux.ie/mailman/listinfo/ilug,linux,www,ie,/mailman/listinfo/ilug,41,8,spam
1,http://www.outsrc-em.com,outsrc-em,www,com,,24,6,spam
2,http://xent.com/mailman/listinfo/fork,xent,,com,/mailman/listinfo/fork,37,7,spam
3,www.freeedgar.com,freeedgar,www,com,,17,2,spam
4,http://members.tripod.de/mani20/index1.html,tripod,members,de,/mani20/index1.html,43,8,spam


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30501 entries, 0 to 30500
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   URL              30501 non-null  object
 1   Domain           30501 non-null  object
 2   Subdomain        27908 non-null  object
 3   TLD              28973 non-null  object
 4   Path             25756 non-null  object
 5   Length           30501 non-null  int64 
 6   NumSpecialChars  30501 non-null  int64 
 7   Label            30501 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.9+ MB


In [70]:
df.describe()

Unnamed: 0,Length,NumSpecialChars
count,30501.0,30501.0
mean,46.659126,9.279302
std,23.961886,4.990836
min,9.0,1.0
25%,27.0,7.0
50%,45.0,9.0
75%,60.0,10.0
max,637.0,217.0


In [71]:
print(df.duplicated().sum())
df=df.drop_duplicates()
df.duplicated().sum()

15829


np.int64(0)

In [72]:
df.isna().sum()

URL                   0
Domain                0
Subdomain          1523
TLD                 870
Path               2949
Length                0
NumSpecialChars       0
Label                 0
dtype: int64

In [73]:
df["Subdomain"] = df["Subdomain"].fillna("")
df["TLD"] = df["TLD"].fillna("")
df["Path"] = df["Path"].fillna("")

In [74]:
x=df.drop('Label',axis=1)
y=np.array([1 if lbl=="spam" else 0 for lbl in df["Label"]])

In [75]:
vec_URL=TfidfVectorizer()
vec_dom=TfidfVectorizer()
vec_subDom=TfidfVectorizer()
vec_TLD=TfidfVectorizer()
vec_path=TfidfVectorizer()

In [76]:
x_URL=vec_URL.fit_transform(x["URL"])
x_dom=vec_dom.fit_transform(x["Domain"])
x_subDom=vec_subDom.fit_transform(x["Subdomain"])
x_TLD=vec_TLD.fit_transform(x["TLD"])
x_path=vec_path.fit_transform(x["Path"])

In [77]:
numeric_features = x[['Length', 'NumSpecialChars']].values
scaler=StandardScaler()
x_numeric = scaler.fit_transform(numeric_features)

x = hstack([x_URL, x_dom, x_subDom, x_TLD,x_path, x_numeric])


In [78]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((11737, 36993), (2935, 36993), (11737,), (2935,))

In [79]:
lr_clf=LogisticRegression(max_iter=148)
lr_clf.fit(x_train,y_train)
y_pred=lr_clf.predict(x_test)
print('Logistic Regression')
print('\tPrecision:',precision_score(y_test,y_pred))
print('\tRecall:',recall_score(y_test,y_pred))
print('\tF1:',f1_score(y_test,y_pred))

Logistic Regression
	Precision: 0.9391304347826087
	Recall: 0.8517350157728707
	F1: 0.8933002481389578


In [80]:
dt_clf=DecisionTreeClassifier()
dt_clf.fit(x_train,y_train)
y_pred_dt=dt_clf.predict(x_test)
print('Decision Tree')
print('\tPrecision:',precision_score(y_test,y_pred_dt))
print('\tRecall:',recall_score(y_test,y_pred_dt))
print('\tF1:',f1_score(y_test,y_pred_dt))

Decision Tree
	Precision: 0.8722689075630252
	Recall: 0.8186119873817035
	F1: 0.8445890968266884


In [81]:
rf_clf=RandomForestClassifier()
rf_clf.fit(x_train,y_train)
y_pred_rf=rf_clf.predict(x_test)
print('Random Forest')
print('\tPrecision:',precision_score(y_test,y_pred_rf))
print('\tRecall:',recall_score(y_test,y_pred_rf))
print('\tF1:',f1_score(y_test,y_pred_rf))

Random Forest
	Precision: 0.9508506616257089
	Recall: 0.7933753943217665
	F1: 0.8650042992261393


In [82]:
import joblib

joblib.dump(lr_clf,'url_model.pkl')
joblib.dump(vec_subDom,'url_subDomain_vectorizer.pkl')
joblib.dump(vec_TLD,'url_TLD_vectorizer.pkl')
joblib.dump(vec_URL,'URL_vectorizer.pkl')
joblib.dump(vec_path,'url_path_vectorizer.pkl')
joblib.dump(vec_dom,'url_domain_vectorizer.pkl')
joblib.dump(scaler,'url_scaler.pkl')

['url_scaler.pkl']