-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.py
33 lines (27 loc) · 930 Bytes
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from urlparse import urlparse
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import CountVectorizer
def tokenize(url):
    """Split a URL into host labels followed by path segments.

    The network location is split on '.', the path on '/'. Empty pieces
    (from leading, trailing or doubled separators, or from a missing host
    or path) are dropped, so the result is always a flat list of non-empty
    strings.

    Args:
        url: The URL string to tokenize.

    Returns:
        A flat list of tokens, e.g. 'http://www.example.com/a/b/' gives
        ['www', 'example', 'com', 'a', 'b'].
    """
    parsed_url = urlparse(url)
    tokens = [label for label in parsed_url.netloc.split('.') if label]
    # extend (not append) keeps the list flat; filtering empty pieces copes
    # with paths that do or do not start/end with '/' without chopping
    # characters off the first or last segment.
    tokens.extend(
        segment for segment in parsed_url.path.split('/') if segment
    )
    return tokens
class Tokenizer(object):
    """Callable that splits a URL into host labels and path segments.

    Instances carry no state; calling one with a URL string returns a flat
    list of non-empty string tokens.
    """

    def __call__(self, doc):
        """Tokenize the URL string `doc`.

        Args:
            doc: The URL string to tokenize.

        Returns:
            A flat list of tokens: the network location split on '.',
            followed by the path split on '/', with empty pieces removed.
            For example 'http://www.example.com/a/b/' gives
            ['www', 'example', 'com', 'a', 'b'].
        """
        parsed_url = urlparse(doc)
        tokens = [label for label in parsed_url.netloc.split('.') if label]
        # extend (not append) keeps the list flat; filtering empty pieces
        # copes with paths that do or do not start/end with '/' without
        # chopping characters off the first or last segment.
        tokens.extend(
            segment for segment in parsed_url.path.split('/') if segment
        )
        return tokens
def initialize_classifier():
    """Build and return the (unfitted) classification pipeline.

    The pipeline has two steps: 'union', a FeatureUnion holding a single
    CountVectorizer named 'vectorizer', and 'classifier', a
    LogisticRegression with default settings.

    Returns:
        A sklearn Pipeline instance.
    """
    # NOTE(review): scikit-learn documents `tokenizer` as applying only when
    # analyzer == 'word'; with analyzer='char_wb' the Tokenizer passed here
    # appears to be ignored -- confirm which behaviour is intended.
    url_vectorizer = CountVectorizer(
        analyzer='char_wb',
        ngram_range=(1, 4),
        tokenizer=Tokenizer(),
    )
    features = FeatureUnion(
        transformer_list=[('vectorizer', url_vectorizer)],
    )
    steps = [
        ('union', features),
        ('classifier', LR()),
    ]
    return Pipeline(steps)