In [2]:
%run tools
from tools import *
import glob
import os
import re
import subprocess
import time
import unicodedata

import nltk
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
from nltk.corpus import stopwords
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset (load_database is not defined in this notebook; presumably it
# comes from the local `tools` module pulled in above).
df = load_database()
# Build the bag-of-words model from the `content` column.
# `vocabulary` is later converted with .toarray() to obtain the feature matrix, and
# `model` is later used through model.transform() to vectorise new pages.
vocabulary, model = bag_of_words(df['content'].values)

In [3]:
# Feature matrix (converted with .toarray()) and the label of every document.
X, Y = vocabulary.toarray(), df.label.values

# Hold out 33% of the documents for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

# Train the classifier. The fit() call stays last so the cell keeps displaying
# the fitted estimator.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [4]:
# Run the trained classifier on the held-out test set.
predict_labels = classifier.predict(X_test)
# Score the predictions against the true labels. evaluate() is defined outside
# this notebook section.
# NOTE(review): the four numbers it returns are presumably accuracy, precision,
# recall and F1 -- confirm against its definition.
evaluate(Y_test, predict_labels)

[0.8309859154929577, 0.84375, 0.7941176470588235, 0.8181818181818182]

In [4]:
# Portuguese stopword list from NLTK, kept as a set for fast membership tests in clean().
STOPWORDS = set(stopwords.words("portuguese"))
# Headers sent with every HTTP request: a browser-like User-Agent plus an Accept
# header that prefers HTML/XML responses.
USER_AGENT = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}

def clean(text):
    """Normalise raw text into a lowercase, space-separated string of words.

    Accented characters are reduced to their ASCII base letters, every character
    that is not an ASCII letter or a space is removed, the text is lowercased and
    tokens found in STOPWORDS are dropped.

    Args:
        text: the string to normalise.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NFKD splits accented letters into base letter + combining mark; encoding to
    # ASCII with 'ignore' then discards the marks (and any other non-ASCII char).
    ascii_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    letters_only = re.sub("[^a-zA-Z ]", "", ascii_text).lower()
    # NOTE(review): tokens are compared with STOPWORDS after accents were stripped,
    # so any accented entry in STOPWORDS can never match here -- confirm whether
    # that is intended before changing it, as the trained model may rely on it.
    return ' '.join(token for token in letters_only.split() if token not in STOPWORDS)

def filter_tag(content):
    """Tell whether a BeautifulSoup text node is visible page text.

    Used as the predicate for filter() over the nodes returned by
    ``find_all(text=True)``.

    Args:
        content: a text node exposing ``.parent.name``.

    Returns:
        False when the node sits directly under a head/title/style/script tag or
        the document root, or when it is an HTML comment; True otherwise.
    """
    if content.parent.name in ["head", "title", "style", "script", "[document]"]:
        return False
    # HTML comments are represented by bs4's Comment class, whose string value
    # does not include the "<!--" / "-->" markers. The previous regex test on
    # str(content.encode('utf-8')) could therefore never match (and in Python 3
    # str() of bytes starts with "b'"), so comments leaked into the page text.
    if isinstance(content, Comment):
        return False
    return True

def content_page(url, timeout=30):
    """Download a page and return its visible text after clean().

    Args:
        url: address of the page to download.
        timeout: seconds requests waits for the server (connect / read) before
            raising; without it a stalled server would block the caller forever.

    Returns:
        The string produced by clean() for the page's visible text.

    Raises:
        requests.RequestException: on connection errors or when the timeout expires.
    """
    req = requests.get(url, headers=USER_AGENT, timeout=timeout)
    soup = BeautifulSoup(req.text, 'html.parser')
    # Keep only the text nodes accepted by filter_tag.
    # NOTE(review): nodes are concatenated without a separator, so words from
    # adjacent tags with no whitespace between them end up fused -- confirm this
    # matches how the training texts were produced before changing it.
    data = ''.join(filter(filter_tag, soup.find_all(text=True)))
    # Collapse newlines and runs of whitespace into single spaces.
    text = data.replace('\n', ' ').replace('\r', '').strip()
    text = ' '.join(text.split())
    return clean(text)

In [6]:
def classify_page(page):
    """Download `page` and return the label the trained classifier predicts for it.

    Args:
        page: URL of the page to classify.

    Returns:
        The first (and only) element of classifier.predict() for this page.
    """
    # Fetch the page text and vectorise it with the bag-of-words model.
    page_features = model.transform([content_page(page)])
    # The classifier is fed the array form of the features.
    predictions = classifier.predict(page_features.toarray())
    return predictions[0]

In [7]:
# CSV files produced by each crawling strategy.
bfs = glob.glob("../Crawler/Bfs/*.csv")
heuristc = glob.glob("../Crawler/Heuristic/*.csv")
# Order matters: the classification loop treats index 0 as "bfs" and anything
# else as "heuristic".
crawlers = [bfs, heuristc]

In [27]:
# Classify every link collected by each crawler and store the predictions in
# ../Results/re_<crawler>_<csv file name>.
for i, crawler in enumerate(crawlers):
    t = "bfs" if i == 0 else "heuristic"
    for csv_path in crawler:
        results = []
        # NOTE(review): transpose + reset_index suggests the links are stored as
        # the header row of the CSV -- confirm against the crawler output.
        # A dedicated name is used so the training DataFrame `df` is not overwritten.
        links_df = pd.read_csv(csv_path).transpose().reset_index()
        links_df.columns = ["Links"]
        # basename() instead of split("/")[3]: independent of path depth and of
        # the path separator glob returns on the current platform.
        store = os.path.basename(csv_path)
        total = links_df.shape[0]

        for index, line in enumerate(links_df.values):
            print("Classificando " + line[0])
            print((index / total) * 100, "%")
            try:
                results.append([line[0], classify_page(line[0])])
                time.sleep(1)
            except Exception as exc:
                # Best effort: report the failing link and continue. Catching
                # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
                print("Deu pau em: ", line, repr(exc))

            # Rewritten after every link so partial results survive an interruption.
            dataframe = pd.DataFrame(results, columns=["url", "label"])
            dataframe.to_csv(path_or_buf=f"../Results/re_{t}_{store}", index=False)

In [3]:
# Merge the heuristic crawler's classification results and keep only the pages
# labelled as positive (label == 1).
# glob returns files in arbitrary order; sorting makes the row index written to
# positive_docs.csv reproducible between runs.
heuristcs = sorted(glob.glob("../Results/Heuristic/*.csv"))
frames = [pd.read_csv(csv_path) for csv_path in heuristcs]
dataset = pd.concat(frames)
# Renumber the surviving rows from 0 without keeping the old index as a column.
dataset = dataset[dataset["label"] == 1].reset_index(drop=True)
dataset.to_csv(path_or_buf="../Results/positive_docs.csv", index=True)

In [11]:
# Download the HTML of every positive page. Files are named <host>_<position>.html.
# Raw string: "\." is a regex escape, not a valid Python string escape.
url_re = r'((www\.)?[a-zA-Z0-9]+\.[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)*)'
links = dataset["url"].values
for doc_id, link in enumerate(links):
    # First host-like match in the URL, e.g. "www.example.com".
    title = re.findall(url_re, link)[0][0] + "_" + str(doc_id) + ".html"
    try:
        # The timeout prevents a stalled server from blocking the whole loop.
        req = requests.get(link, headers=USER_AGENT, timeout=30)
        print("Baixando: ", title)
        # NOTE(review): this writes to the current directory while the curl
        # fallback below writes into pages/ -- confirm the intended destination.
        with open(title, "w", encoding="utf-8") as f:
            f.write(req.text)
        # Politeness delay, taken after the file has been closed.
        time.sleep(1)
    except Exception as exc:
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Deu pau em: ", title, repr(exc))
        try:
            # Fallback download with curl inside pages/. The argument list avoids
            # passing the crawled URL through a shell, where characters such as
            # "&" or ";" would be interpreted as shell syntax.
            subprocess.run(["curl", "-o", title, link], cwd="pages", check=False)
        except OSError as curl_exc:
            # curl missing or pages/ absent: report and move on to the next link.
            print("Deu pau no curl: ", title, repr(curl_exc))