In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.metrics as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import langid
import re
import os

In [13]:
darknet = pd.read_csv("darknet.csv", encoding='utf-8')
darknet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99952 entries, 0 to 99951
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            99952 non-null  object
 1   body_stripped  99946 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [14]:
darknet = darknet.dropna(inplace=False)
darknet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99946 entries, 0 to 99951
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            99946 non-null  object
 1   body_stripped  99946 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [15]:
darknet['language'] = darknet['body_stripped'].apply(lambda x: langid.classify(x)[0])
#filter to english dataset only
english_darknet = darknet[darknet['language']=='en']
english_darknet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71650 entries, 0 to 99949
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            71650 non-null  object
 1   body_stripped  71650 non-null  object
 2   language       71650 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB


In [16]:
english_darknet.head()

Unnamed: 0,url,body_stripped,language
0,http://3h42ncbglpxvc6e5.onion/disclaimer,\n Apple Market - Stolen & Carded Merchandise...,en
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,\n Talk: Gout — NatureVault \n Talk: Gout \n ...,en
3,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,\n All public logs - The Hidden Wiki \n Help ...,en
4,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,\n /b/DNMAdsDenmark | Breaking Bad \n Market ...,en
5,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,\n Atheists: Are The Forces of Nature Immate...,en


In [18]:
import tensorflow_hub as hub

nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

#weird pattern
pattern = r"&(amp;)?|&#\d+;|\s{2,}"
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"\\."," ",x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(pattern, " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"/b/", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"https?://[^\s]+|www\S+", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"[^\w\s]", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"\d+", " ", x))
english_darknet.loc[:, 'body_stripped'] = english_darknet['body_stripped'].apply(lambda x: re.sub(r"\s+", " ", x))

english_darknet.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


Unnamed: 0,url,body_stripped,language
0,http://3h42ncbglpxvc6e5.onion/disclaimer,Apple Market Stolen Carded Merchandise iPhone...,en
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,Talk Gout NatureVault Talk Gout Password requ...,en
3,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,All public logs The Hidden Wiki Help All publ...,en
4,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,DNMAdsDenmark Breaking Bad Market Forums a da...,en
5,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,Atheists Are The Forces of Nature Immaterial ...,en


In [19]:
english_darknet.to_csv('darknet_english.csv', index=False)

In [8]:
english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: word_tokenize(x))
english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: [item for item in x if item not in stop_words])
english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: [item for item in x if len(item)>2])
english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_darknet['body_stripped'] = english_darknet['body_stripped'].apply(lambda x: [item for item in x if item not in stop_words])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [11]:
import pickle

vectorizer = 'model_MNB/model/vectorizer.pkl'
selector = 'model_MNB/model/selector.pkl'

model = pickle.load(open('model_MNB/model/RootModel.sav','rb'))
tf_id_vectorizer = pickle.load(open(vectorizer,'rb'))

def vertorize_text(text, tf_id_vectorizer, chi2_selector):
    vector = tf_id_vectorizer.transform([text])
    if chi2_selector != None:
        vector = chi2_selector.transform(vector)
    vector = vector.toarray()
    return vector

for index,row in english_darknet.iterrows():
    vector = vertorize_text(row['body_stripped'], tf_id_vectorizer, None)
    prediction = model.predict(vector)
    print(prediction[0])
    #english_darknet.at[index, 'prediction'] = prediction[0]

6
1
6
3
1
9
1
6
6
3
0
3
9
1
1
9
1
8
3
7
2
1
6
1
1
3
0
5
6
3
8
1
0
0
0
1
0
0
6
0
1
1
0
6
6
3
9
3
9
3
1
0
0
1
0
0
1
2
8
4
6
0
1
4
3
8
1
0
0
3
6
7
9
0
1
6
1
1
1
4
9
6
1
6
6
1
3
1
7
2
6
9
3
1
0
4
1
0
6
7
1
0
2
1
6
1
4
0
0
0
3
9
1
1
1
8
4
2
9
0
1
1
3
0
1
6
1
3
1
6
2
3
1
1
6
1
2
8
3
1
1
9
1
0
1
0
6
6
1
0
9
2
0
8
0
2
3
1
2
1
0
7
1
9
0
9
1
9
9
1
6
6
1
7
6
0
1
9
2
7
0
0
7
1
2
7
8
0
0
0
8
1
6
3
0
1
1
0
6
6
9
0
1
1
1
3
9
8
0
0
1
1
1
1
1
1
7
9
1
1
1
0
1
1
1
6
2
1
1
1
1
7
0
0
0
6
4
8
0
2
1
6
9
7
1
9
3
5
0
6
0
5
6
1
3
1
0
6
1
4
3
1
1
1
0
6
1
1
1
0
0
0
7
1
1
1
7
6
3
1
1
0
1
6
3
1
0
1
3
9
6
9
6
1
9
6
1
1
4
0
1
9
0
6
6
1
9
6
9
8
6
8
0
3
6
0
8
1
1
6
1
6
9
1
1
1
6
6
8
1
0
3
5
6
0
5
2
6
3
1
0
0
1
1
6
0
3
2
1
9
1
1
2
1
7
1
7
9
1
2
7
0
1
2
1
6
8
1
3
3
5
3
0
9
0
8
0
0
1
9
6
6
1
8
0
9
1
2
6
7
1
6
9
6
6
7
7
1
8
0
1
3
1
3
1
1
6
7
3
1
7
0
1
1
1
1
1
6
1
6
6
0
8
6
1
9
1
0
1
2
9
1
6
3
1
6
1
9
4
6
3
9
4
6
3
3
3
3
6
7
1
6
1
1
6
8
6
3
2
3
8
6
6
0
6
1
1
0
1
1
2
5
1
1
1
9
1
1
2
7
8
0
7
1
1
3
9
2
1
6
1
3
0
2
0
6
0
3
0
1


KeyboardInterrupt: 

In [10]:
english_darknet.to_csv('darknet_english.csv', index=False)