# Url Inspector using various classification and NLP

## Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import re

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from imblearn.over_sampling import SMOTE

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [8]:
import pickle

In [9]:
from sklearn.preprocessing import StandardScaler

## Import Data

In [10]:
df=pd.read_csv('dataset.csv')

In [11]:
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [12]:
df.shape

(420464, 2)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420464 non-null  object
 1   label   420464 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


In [14]:
df.isnull().sum()

url      0
label    0
dtype: int64

## Changing Values of Class 

In [15]:
def change_class(i):
    if i=='good':
        return 0
    else:
        return 1

In [16]:
df['label']=df['label'].apply(change_class)

In [17]:
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,1
1,espdesign.com.au,1
2,iamagameaddict.com,1
3,kalantzis.net,1
4,slightlyoffcenter.net,1


In [18]:
df['label'].value_counts()

0    344821
1     75643
Name: label, dtype: int64

## Tokenizing The Url

In [19]:
def tokenizer(i):
    p=''
    x=re.split('[/-]',i)
    if '' in x:
        x.remove('')
    for j in x:
        if j.find('.')>=0:
            y=j.split('.')
            if 'com' in y:
                y.remove('com')
            x=x+y
    return x

## Using Vectorizer

In [20]:
cf=CountVectorizer(tokenizer=tokenizer)

In [21]:
tf=TfidfVectorizer(tokenizer=tokenizer)

In [22]:
with open('vector','wb') as f:
    pickle.dump(tf,f)

In [23]:
x1=cf.fit_transform(df['url'])

In [24]:
x2=tf.fit_transform(df['url'])

In [25]:
y=df['label']
y

0         1
1         1
2         1
3         1
4         1
         ..
420459    1
420460    1
420461    1
420462    1
420463    1
Name: label, Length: 420464, dtype: int64

## As it is unbalanced Data set trying oversampling

In [26]:
sm=SMOTE(random_state=42)

In [27]:
x1_new,y1_new=sm.fit_resample(x1,y)

In [28]:
x2_new,y2_new=sm.fit_resample(x2,y)

In [29]:
cv=ShuffleSplit(n_splits=10,test_size=0.3,random_state=42)

## Trying different ML algorithms to check accuracy

### Count Vectorizer  without oversampling 

In [30]:
x=cross_val_score(LogisticRegression(max_iter=500),x1,y,cv=cv,scoring='accuracy')
print(x)

[0.97020771 0.97074679 0.97072301 0.97046139 0.96957349 0.97084192
 0.97061202 0.97055652 0.9703504  0.9707785 ]


In [31]:
print(cross_val_score(MultinomialNB(),x1,y,cv=cv,scoring='accuracy'))

[0.92057238 0.91998573 0.92058824 0.92042968 0.92146821 0.9205486
 0.92096876 0.92019978 0.92134929 0.92061995]


### Count_vectorizer with oversampling 

In [32]:
print(cross_val_score(LogisticRegression(max_iter=500),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.94082932 0.94099365 0.9399448  0.94038464 0.94039431 0.93951463
 0.94046198 0.94009464 0.93996897 0.94042814]


In [33]:
print(cross_val_score(MultinomialNB(),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.86360583 0.8639055  0.94012847 0.94075198 0.94024931 0.86259081
 0.94024931 0.9399303  0.86378466 0.86328682]


### Tfidf Vectorizer without oversampling

In [34]:
print(cross_val_score(MultinomialNB(),x2,y,cv=cv,scoring='accuracy'))

[0.96208181 0.96286666 0.96285873 0.9626209  0.96159822 0.96239099
 0.96216902 0.96170921 0.96191533 0.96251784]


In [35]:
print(cross_val_score(LogisticRegression(max_iter=500),x2,y,cv=cv,scoring='accuracy'))

[0.95726177 0.95837165 0.95777707 0.95796734 0.9574996  0.95786428
 0.95824481 0.95791184 0.95721421 0.95794355]


### Tfidf Vectorizer with oversampling

In [36]:
print(cross_val_score(MultinomialNB(),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.98504058 0.98537408 0.98452823 0.98517591 0.98488591 0.98449923
 0.98483274 0.98500674 0.98520008 0.98497774]


In [37]:
print(cross_val_score(LogisticRegression(max_iter=500),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.98124151 0.98099017 0.98032316 0.98041983 0.98049717 0.9806035
 0.9804295  0.980821   0.98018299 0.98056   ]


## Creating and saving the best model

In [38]:
nb=MultinomialNB()

In [39]:
x_train,x_test,y_train,y_test=train_test_split(x2_new,y2_new,test_size=0.3,random_state=42)

In [40]:
nb.fit(x_train,y_train)

In [41]:
prediction=nb.predict(x_test)

In [42]:
nb.score(x_train,y_train)

0.9965945035618924

In [43]:
confusion_matrix(prediction,y_test)

array([[100491,    320],
       [  2775, 103307]])

In [44]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    100811
           1       1.00      0.97      0.99    106082

    accuracy                           0.99    206893
   macro avg       0.99      0.99      0.99    206893
weighted avg       0.99      0.99      0.99    206893



### Model is 99% accurate