In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from time import time
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AZAD\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
data = pd.read_csv('website_classification.csv')

In [17]:
data.head()

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [18]:
missing = data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)

In [19]:
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [20]:
data.head()

Unnamed: 0,website_url,cleaned_website_text,Category
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [21]:
#Data Labeling
lbl = set(data['Category'].values)
lbl = dict(enumerate(lbl,1))
lbl = dict(zip(lbl.values(),lbl.keys()))

In [22]:
label = list(map(lbl.get, list(data['Category'].values)))
data['label'] = pd.Series(label).values
data = data.loc[:, ['website_url','label']]

In [40]:
#Feature Engineering

class FEATURE_ENGINEERING(object):
    def __init__(self, data):
        self.data = data

    def extract(self):
        self.features = []
        feature_text = list(self.data['website_url'].values)

        for t in feature_text:
            if type(t) != str:
                t = t.decode('utf-8').encode('ascii', 'ignore')
            t = re.sub(r'[^a-zA-Z]',r' ',t)

            del_words = ['www','http','com','co','uk','org',
                            'https', 'html', 'ca', 'ee', 'htm',
                            'net', 'edu', 'index', 'asp', 'au', 'nz',
                            'txt', 'php', 'de', 'cgi', 'jp', 'hub',
                            'us', 'fr', 'webs']

            stop_words = set(stopwords.words("english"))
            stop_words.update(del_words)

            text = (i.strip() for i in t.split())
            text = [t for t in text if t not in stop_words]
            text = " ".join(text)

            self.features.append(text)

        return self.features

In [41]:
#Categorical to numerical

class PREPROCESS(object):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def process(self):
        vectorizer = TfidfVectorizer()
        features_train_transformed = vectorizer.fit_transform(self.X)

        selector = SelectPercentile(f_classif, percentile=25)
        selector.fit(features_train_transformed, self.Y)
        features_train = selector.transform(features_train_transformed).toarray()

        return features_train

In [58]:
#Prediction Model

class Classify(object):
    def __init__(self, X, Y):
        self.X = X
        self.Y = np.asarray(Y)
        self.final_accuracy = []
        self.NFOLDS = 10
        self.kf = KFold(n_splits=self.NFOLDS, shuffle=True)

    def Support_Vector_Machine(self):
        for self.FOLD_NO, (train_index, test_index) in enumerate(self.kf.split(self.X)):
            X_train = self.X[train_index]
            X_test = self.X[test_index]
            y_train = self.Y[train_index]
            y_test = self.Y[test_index]

            clf = clf = SVC(kernel='linear')
            clf.fit(X_train, y_train)
            start_time = time()
            accuracy = clf.score(X_train, y_train)
            self.final_accuracy.append(accuracy)
            print ("training time:", round(time()-start_time, 2), "secs")
            start_time2 = time()
            prediction = clf.predict(X_test)

        print ("predict time:", round(time()-start_time2, 2), "secs")
        target_names = ['Education', 'Business/Corporate', 'Travel', 'Streaming Services', 'Sports', 'E-Commerce', 'Games', 'News', 'Health and Fitness', 'Computers and Technology', 'Photography', 'Food', 'Law and Government', 'Social Networking and Messaging', 'Adult', 'Forums']
        print(classification_report(y_test, prediction, target_names=target_names, labels=[0,1])) 
        print(confusion_matrix(y_test, prediction))
        print(precision_recall_fscore_support(y_test, prediction, average='weighted'))

        print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(self.final_accuracy), np.std(self.final_accuracy)))

    def Random_Forest(self):
        for self.FOLD_NO, (train_index, test_index) in enumerate(self.kf.split(self.X)):
            X_train = self.X[train_index]
            X_test = self.X[test_index]
            y_train = self.Y[train_index]
            y_test = self.Y[test_index]

            clf = RandomForestClassifier(n_estimators=10, max_depth=2)
            clf.fit(X_train, y_train)
            start_time = time()
            accuracy = clf.score(X_train, y_train)
            self.final_accuracy.append(accuracy)
            print ("training time:", round(time()-start_time, 2), "secs")
            start_time2 = time()
            prediction = clf.predict(X_test)

        print ("predict time:", round(time()-start_time2, 2), "secs")
        target_names = ['Education', 'Business/Corporate', 'Travel', 'Streaming Services', 'Sports', 'E-Commerce', 'Games', 'News', 'Health and Fitness', 'Computers and Technology', 'Photography', 'Food', 'Law and Government', 'Social Networking and Messaging', 'Adult', 'Forums']
        print(classification_report(y_test, prediction, target_names=target_names, labels=[0,1]))
        print(confusion_matrix(y_test, prediction))
        print(precision_recall_fscore_support(y_test, prediction, average='weighted'))

        print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(self.final_accuracy), np.std(self.final_accuracy)))

    def Logistic_Regression(self):
        for self.FOLD_NO, (train_index, test_index) in enumerate(self.kf.split(self.X)):
            X_train = self.X[train_index]
            X_test = self.X[test_index]
            y_train = self.Y[train_index]
            y_test = self.Y[test_index]

            clf = LogisticRegression()
            clf.fit(X_train, y_train)
            start_time = time()
            accuracy = clf.score(X_train, y_train)
            self.final_accuracy.append(accuracy)
            print ("training time:", round(time()-start_time, 2), "secs")
            start_time2 = time()
            prediction = clf.predict(X_test)

        print ("predict time:", round(time()-start_time2, 2), "secs")
        target_names = ['Education', 'Business/Corporate', 'Travel', 'Streaming Services', 'Sports', 'E-Commerce', 'Games', 'News', 'Health and Fitness', 'Computers and Technology', 'Photography', 'Food', 'Law and Government', 'Social Networking and Messaging', 'Adult', 'Forums']
        print(classification_report(y_test, prediction, target_names=target_names, labels=[0,1]))
        print(confusion_matrix(y_test, prediction))
        print(precision_recall_fscore_support(y_test, prediction, average='weighted'))

        print('\nAccuracy: %.3f +/- %.3f' % (np.mean(self.final_accuracy), np.std(self.final_accuracy)))

In [59]:
features = FEATURE_ENGINEERING(data).extract()
features_train = PREPROCESS(features, label).process()

classify_SVM = Classify(features_train, label).Support_Vector_Machine()
classify_RF = Classify(features_train, label).Random_Forest()
classify_LR = Classify(features_train, label).Logistic_Regression()

training time: 0.85 secs
training time: 0.89 secs
training time: 0.85 secs
training time: 1.19 secs
training time: 1.08 secs
training time: 0.9 secs
training time: 0.86 secs
training time: 0.88 secs
training time: 0.91 secs
training time: 0.91 secs
predict time: 0.1 secs
                                 precision    recall  f1-score   support

                      Education       0.00      0.00      0.00         0
             Business/Corporate       0.67      0.22      0.33         9

                      micro avg       0.67      0.22      0.33         9
                      macro avg       0.33      0.11      0.17         9
                   weighted avg       0.67      0.22      0.33         9

[[ 2  0  0  0  0  0  0  0  0  0  0  7  0  0  0]
 [ 0  0  0  0  0  1  1  0  0  0  0 10  0  0  0]
 [ 0  0 11  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  7  0  0  0]
 [ 0  0  0  0  0  4  0  0  0  0  0  4  0  0  0]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


training time: 0.01 secs
training time: 0.01 secs
training time: 0.01 secs
training time: 0.02 secs
training time: 0.01 secs
training time: 0.01 secs
training time: 0.01 secs
training time: 0.02 secs
training time: 0.01 secs
predict time: 0.0 secs
                                 precision    recall  f1-score   support

                      Education       0.00      0.00      0.00         0
             Business/Corporate       1.00      0.08      0.15        12

                      micro avg       1.00      0.08      0.15        12
                      macro avg       0.50      0.04      0.08        12
                   weighted avg       1.00      0.08      0.15        12

[[ 1  0  0  0  0  0 11  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 13  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 10  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 12  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  8  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 14  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  8  0  0  0  0  0  0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


training time: 0.02 secs
training time: 0.01 secs
training time: 0.0 secs
training time: 0.0 secs
training time: 0.01 secs
training time: 0.0 secs
training time: 0.01 secs
training time: 0.0 secs
training time: 0.0 secs
training time: 0.0 secs
predict time: 0.0 secs
                                 precision    recall  f1-score   support

                      Education       0.00      0.00      0.00         0
             Business/Corporate       1.00      0.20      0.33        10

                      micro avg       1.00      0.20      0.33        10
                      macro avg       0.50      0.10      0.17        10
                   weighted avg       1.00      0.20      0.33        10

[[ 2  0  0  0  0  0  0  0  0  0  0  0  8  0  0  0]
 [ 0  1  0  0  0  1  1  0  0  0  1  0 11  0  0  0]
 [ 0  0  7  0  0  0  0  0  0  0  0  0  3  0  0  0]
 [ 0  0  0  3  0  0  0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0 10  0  0  0]
 [ 0  0  0  0  0  1  0  0  1  0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
