In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.metrics as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import langid
import re
import os

In [2]:
darknet = pd.read_csv("./Darknet Data/darknet.csv", encoding='utf-8')
darknet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99952 entries, 0 to 99951
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            99952 non-null  object
 1   body_stripped  99946 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [3]:
darknet = darknet.dropna(inplace=False)
darknet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99946 entries, 0 to 99951
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            99946 non-null  object
 1   body_stripped  99946 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [4]:
darknet['language'] = darknet['body_stripped'].apply(lambda x: langid.classify(x)[0])
#filter to english dataset only
english_darknet = darknet[darknet['language']=='en']
english_darknet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71650 entries, 0 to 99949
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            71650 non-null  object
 1   body_stripped  71650 non-null  object
 2   language       71650 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB


In [5]:
english_darknet.head()

Unnamed: 0,url,body_stripped,language
0,http://3h42ncbglpxvc6e5.onion/disclaimer,\n Apple Market - Stolen & Carded Merchandise...,en
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,\n Talk: Gout — NatureVault \n Talk: Gout \n ...,en
3,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,\n All public logs - The Hidden Wiki \n Help ...,en
4,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,\n /b/DNMAdsDenmark | Breaking Bad \n Market ...,en
5,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,\n Atheists: Are The Forces of Nature Immate...,en


In [8]:
import tensorflow_hub as hub

nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

#weird pattern
pattern = r"&(nbsp|#\d+);|\s{2,}"
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"\\."," ",x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(pattern, " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"/b/", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"https?://[^\s]+|www\S+", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"[^\w\s]", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"_+", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"\d+", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"\s+", " ", x))

english_darknet.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


Unnamed: 0,url,body_stripped,language
0,http://3h42ncbglpxvc6e5.onion/disclaimer,Apple Market Stolen Carded Merchandise iPhone...,en
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,Talk Gout NatureVault Talk Gout Password requ...,en
3,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,All public logs The Hidden Wiki Help All publ...,en
4,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,DNMAdsDenmark Breaking Bad Market Forums a da...,en
5,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,Atheists Are The Forces of Nature Immaterial ...,en


In [10]:
english_darknet.to_csv('darknet_english.csv', index=False)

In [2]:
english_darknet = pd.read_csv("./Darknet Data/darknet_english.csv", encoding='utf-8')
english_darknet.head()

Unnamed: 0,url,body_stripped,language
0,http://3h42ncbglpxvc6e5.onion/disclaimer,Apple Market Stolen Carded Merchandise iPhone...,en
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,Talk Gout NatureVault Talk Gout Password requ...,en
2,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,All public logs The Hidden Wiki Help All publ...,en
3,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,DNMAdsDenmark Breaking Bad Market Forums a da...,en
4,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,Atheists Are The Forces of Nature Immaterial ...,en


In [3]:
# remove giberish words
from nltk.corpus import wordnet
from nltk.corpus import webtext
nltk.download('webtext')

stop_words = stopwords.words('english')
stop_words.extend(['com', 'nbsp'])
lemmatizer = WordNetLemmatizer()

english_words = set(word.lower() for word in wordnet.words())
english_words.update(set(word.lower() for word in webtext.words()))

[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


In [4]:
# get freq of all words in corpus
from nltk.probability import FreqDist

all_words = [word for text in english_darknet['body_stripped'] for word in text.split()]
all_words_freq = FreqDist(all_words)

In [7]:
def remove_gibberish(text):
    return [word for word in text.split() if word.lower() in english_words or all_words_freq[word] > 3]

def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return 'a'
    elif nltk_tag.startswith('V'):
        return 'v'
    elif nltk_tag.startswith('R'):
        return 'r'
    else:
        return 'n'
    
#Define function to lemmatize each word with its POS tag
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text)
    pos_tagged_text = [(word, pos_tagger(pos_tag)) for word, pos_tag in pos_tagged_text]
    return [lemmatizer.lemmatize(word, pos_tag) for word, pos_tag in pos_tagged_text]

def process_text(text):
    text = remove_gibberish(text)
    text = [item for item in text if item not in stop_words and len(item) > 3]
    text = lemmatize_words(text)
    return ' '.join(text)

batch_size = 10000
def process_batch(df):
    df['body_stripped'] = df['body_stripped'].apply(lambda x: process_text(x))
    return df

In [8]:
num_batches = len(english_darknet) // batch_size + 1

#Process in batches
for i in range(num_batches):
    start = i * batch_size
    end = min((i+1) * batch_size, len(english_darknet))
    english_darknet[start:end] = process_batch(english_darknet[start:end])
    print('Batch {} processed'.format(i))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['body_stripped'] = df['body_stripped'].apply(lambda x: process_text(x))


Batch 0 processed
Batch 1 processed
Batch 2 processed
Batch 3 processed
Batch 4 processed
Batch 5 processed
Batch 6 processed
Batch 7 processed


In [9]:
english_darknet.head()

Unnamed: 0,url,body_stripped,language
0,http://3h42ncbglpxvc6e5.onion/disclaimer,Apple Market Stolen Carded Merchandise iPhone ...,en
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,Talk Gout NatureVault Talk Gout Password requi...,en
2,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,public log Hidden Wiki Help public log From Hi...,en
3,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,DNMAdsDenmark Breaking Market Forums data data...,en
4,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,Atheists Forces Nature Immaterial MGTOW Mirror...,en


In [10]:
import pickle

model = pickle.load(open('./model_MNB/model/RootModel.sav', 'rb'))
tfdf_vect = pickle.load(open('./model_MNB/model/vectorizer.pkl', 'rb'))

def predict(text):
    text = text.lower()
    vector = tfdf_vect.transform([text])
    vector = vector.toarray()
    web_cat = model.predict(vector)[0]
    category = ''
    if web_cat == 0:
        category = 'Adult'
    elif web_cat == 1:
        category = 'Computers and Technology'
    elif web_cat == 2:
        category = 'Financial Crime'
    elif web_cat == 3:
        category = 'Forums'
    elif web_cat == 4:
        category = "Intelligence"
    elif web_cat == 5:
        category = "Law and Government"
    elif web_cat == 6:
        category = "Marketplace"
    elif web_cat == 7:
        category = "Narcotics"
    elif web_cat == 8:
        category = "News"
    else:
        category = "Social Media"
    
    return category

english_darknet['category'] = english_darknet['body_stripped'].apply(lambda x: predict(x))

In [11]:
english_darknet.head(10)

Unnamed: 0,url,body_stripped,language,category
0,http://3h42ncbglpxvc6e5.onion/disclaimer,Apple Market Stolen Carded Merchandise iPhone ...,en,Marketplace
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,Talk Gout NatureVault Talk Gout Password requi...,en,Computers and Technology
2,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,public log Hidden Wiki Help public log From Hi...,en,Marketplace
3,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,DNMAdsDenmark Breaking Market Forums data data...,en,Forums
4,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,Atheists Forces Nature Immaterial MGTOW Mirror...,en,Computers and Technology
5,http://f2vfjp3jc37gxgn4hum4uf2bhi2w3kp4jbzdweg...,Mystery Tucker Carlson Blog Posts VDARE Please...,en,Social Media
6,http://lockbit7z5ltrhzv46lsg447o3cx2637dloc3qt...,LockBit Leaked Twitter Bitcoin Contact Press A...,en,Computers and Technology
7,http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5u...,SEARCH Loli tuck OnionLand Search This OnionLa...,en,Marketplace
8,http://kovqwkiyxs3sy65u.onion/sk,Prepaid Cards Profit Money Market Fast Honest ...,en,Marketplace
9,http://aivvuuq3hxzpfukwsquy65qw3akq7jvpp5pi4z5...,Register user Hidden Answers Remember Hidden A...,en,Forums


In [21]:
english_darknet.to_csv('darknet_english_cleaned.csv', index=False)