In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Exploratory Data Analysis

### 0) Loading data

In [None]:
!unzip /content/drive/MyDrive/Data_files/Porn_data/train.csv.zip

Archive:  /content/drive/MyDrive/Data_files/Porn_data/train.csv.zip
  inflating: train.csv               


In [None]:
!unzip /content/drive/MyDrive/Data_files/Porn_data/test.csv.zip

Archive:  /content/drive/MyDrive/Data_files/Porn_data/test.csv.zip
  inflating: test.csv                


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load both splits; the first CSV column is used as the row index.
train_df = pd.read_csv("/content/train.csv", index_col=0)
test_df = pd.read_csv("/content/test.csv", index_col=0)

# Preview the first three training rows.
train_df.head(3)

Unnamed: 0_level_0,url,title,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0


### 1) Cleaning data

In [None]:
# Column dtypes and non-null counts: a quick way to spot missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135309 entries, 0 to 135308
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     135309 non-null  object
 1   title   135308 non-null  object
 2   label   135309 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.1+ MB


In [None]:
# Show the rows whose title is not a str (e.g. a missing value).
train_df[~train_df['title'].map(lambda value: isinstance(value, str))]

Unnamed: 0_level_0,url,title,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
78497,jpg-1.com,,0


In [None]:
# Sanity checks: each line prints True when the column is clean.
print(train_df['url'].map(lambda value: isinstance(value, str)).all())   # -> every url is a str
print(train_df['label'].isin([0, 1]).all())                              # -> every label is 0 or 1

True
True


In [None]:
# Report the total number of rows and how many of them contain at least one NaN,
# then drop those rows. (The previous message printed the post-dropna count as
# "all rows" and the total row count as "rows with NaNs".)
rows_with_nan = train_df.isna().any(axis=1)
print(f"all rows: {len(train_df)}\nrows with NaNs: {int(rows_with_nan.sum())}")   # -> only one object containing NaN
train_df = train_df.dropna()

all rows: 135308
rows with NaNs: 135309


In [None]:
# Each line prints True when the column contains no empty strings.
print(not train_df['title'].eq('').any())   # -> no empty titles
print(not train_df['url'].eq('').any())     # -> no empty urls

True
True


### 2) First-step statistics

In [None]:
# Label distribution on train

train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,118593
1,16715


In [None]:
# Number of whitespace-separated words in each title.
# Uses the vectorized .str accessor instead of a per-row Python lambda;
# with no separator given, .str.split() splits on whitespace like str.split().

title_num_words = train_df['title'].str.split().str.len()
title_num_words.describe()

Unnamed: 0,title
count,135308.0
mean,10.489727
std,5.399646
min,1.0
25%,7.0
50%,10.0
75%,13.0
max,54.0


##### 2.1) Exploring URLs in more detail

In [None]:
# Split every URL on dots and look at how many chunks come out.

url_split_lens = train_df['url'].map(lambda host: len(host.split('.')))
url_min, url_max = url_split_lens.min(), url_split_lens.max()

print(f"Max '.'-split length: {url_max}, Min '.'-split length: {url_min}")

Max '.'-split length: 11, Min '.'-split length: 1


In [None]:
# Rows whose URL has the maximum number of '.'-separated chunks.
# Reuses the per-row chunk counts already held in `url_split_lens`
# instead of splitting every URL a second time.
train_df[url_split_lens == url_max]

Unnamed: 0_level_0,url,title,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
65641,m.m.img1.img1.img1.m.m.m.m.gigporno-video.com,"Порно мне нравится, только горб на стиральной ...",1


In [None]:
# Rows whose URL has the minimum number of '.'-separated chunks.
# Reuses the per-row chunk counts already held in `url_split_lens`
# instead of splitting every URL again.
min_url_len_rows = train_df[url_split_lens == url_min]
print(len(min_url_len_rows))

min_url_len_rows   # -> Notice: all 0-labeled

32


Unnamed: 0_level_0,url,title,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16537,login,Платформа Mail.Ru для бизнеса,0
17381,domains,Пользователь a.aborneva@oneproject.kz — Mail.r...,0
18680,domains,Пользователь info@inex.studio — Mail.ru для би...,0
21326,domains,Поиск пользователя «Селетков» — Mail.ru для би...,0
22224,file:,РадиоКот :: Терморегулятор на термопаре К-типа,0
23616,login,Платформа Mail.Ru для бизнеса,0
24687,poslednie-novosti,Откройте для себя новый рынок сбыта благодаря,0
26155,domains,Поиск пользователя «з» — Mail.ru для бизнеса,0
27075,server2,104 Сменный блок для щетки Lilly латунь,0
32578,F:,"Электросхема.Ру - словари, схемы, справочники,...",0


In [None]:
# Chunk count per URL: splitting on '.' always yields (number of dots + 1) pieces.
url_lens = train_df['url'].map(lambda host: host.count('.') + 1)
url_lens.describe()

# We can see that a URL typically consists of 2-3 chunks

Unnamed: 0,url
count,135308.0
mean,2.399622
std,0.524617
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,11.0


##### 2.2) Tokenizing URLs

In [None]:
import transformers

# Hugging Face checkpoint whose tokenizer is used to inspect how URLs get tokenized.
model_name = 'DeepPavlov/rubert-base-cased-sentence'
# Load the tokenizer that belongs to `model_name`.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [None]:
# Take the first 10 rows as an independent copy: adding a column to a bare slice of
# `train_df` raises pandas' SettingWithCopyWarning and may or may not write through.
url_raw = train_df.iloc[:10].copy()
# Token ids the tokenizer produces for each URL (special tokens included).
url_raw['tokenized_urls'] = url_raw['url'].apply(lambda x: tokenizer(x)['input_ids'])
url_raw

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  url_raw['tokenized_urls'] = url_raw['url'].apply(lambda x: tokenizer(x)['input_ids'])


Unnamed: 0_level_0,url,title,label,tokenized_urls
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0,"[101, 256, 132, 252, 263, 132, 256, 239, 102]"
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0,"[101, 276, 277, 277, 132, 252, 263, 132, 10667..."
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0,"[101, 12723, 11403, 19662, 12259, 132, 20279, ..."
3,colorbox.spb.ru,Не Беси Меня Картинки,0,"[101, 14648, 22468, 132, 33162, 235, 132, 1399..."
4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0,"[101, 13567, 5460, 130, 18437, 132, 13995, 102]"
5,beregifiguru.ru,Салат: корейская морковь + копченая курица + к...,0,"[101, 10859, 69380, 14430, 87064, 12192, 132, ..."
6,ekb.vseinstrumenti.ru,"Угловой пневмогайковерт FUBAG RWC 105 1/2"" 100...",0,"[101, 240, 253, 235, 132, 15499, 18404, 49155,..."
7,mirtabaka.org,Табачный магазин `Мир Табака` | КУРИТЕЛЬНЫЕ ТР...,0,"[101, 36589, 10725, 11049, 10883, 132, 11245, ..."
8,xlecx.com,league of legends » Page 5 » Porn comics free ...,1,"[101, 278, 7159, 237, 279, 132, 10724, 102]"
9,bus.biletyplus.ua,"08:00 (4766р), Одесса - Ростов-на-Дону 20 сент...",0,"[101, 19881, 132, 11854, 13142, 69150, 14587, ..."


In [None]:
def check_url_tokenization(df, row_num):
    """Print the token ids and the decoded tokens for one URL of `df`.

    Args:
        df: DataFrame whose first column holds the URL strings.
        row_num: Positional (0-based) index of the row to inspect.

    Returns:
        None. The ids and the per-id decoded tokens are printed.

    Relies on the notebook-level `tokenizer`.
    """
    # Read from the frame that was passed in; the previous version ignored `df`
    # and always used the global `url_raw`.
    ids = tokenizer(df.iloc[row_num, 0])['input_ids']
    print(ids)
    # Decode id by id so that individual word pieces are visible.
    print([tokenizer.decode(token_id) for token_id in ids])

# Inspect the tokenization of the URL in row 6 (0-based position) of `url_raw`.
check_url_tokenization(url_raw, 6)

[101, 240, 253, 235, 132, 15499, 18404, 49155, 58591, 249, 132, 13995, 102]
['[CLS]', 'e', '##k', '##b', '.', 'vs', '##ein', '##str', '##ument', '##i', '.', 'ru', '[SEP]']


### 3) Closer look at sub-dataset with label=1

In [None]:
# Subset of the training rows with label == 1.
porn_detected = train_df[train_df['label'] == 1]
# Fixed random_state so the previewed rows are the same on every re-run.
porn_detected.sample(3, random_state=42)

Unnamed: 0_level_0,url,title,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49185,nudefauna.com,Flexible Сutie - 116 photos,1
111174,pretty.porn,old mum XXX Videos - Pretty Porn,1
134654,smurfik.net,?? Мастурбация красивой девушки - подборка пор...,1


In [None]:
# Let's see how the tokenizer handles URLs from the label=1 subset.

max_iter = 10   # number of URLs to inspect

# Take the first `max_iter` URLs directly instead of iterating whole rows with
# iterrows() and a manual counter; each URL becomes a list of decoded tokens.
tokenized = [
    [tokenizer.decode(token_id) for token_id in tokenizer(url)['input_ids']]
    for url in porn_detected['url'].head(max_iter)
]

for tokens in tokenized:
    print(tokens)

['[CLS]', 'x', '##le', '##c', '##x', '.', 'com', '[SEP]']
['[CLS]', 'por', '##n', '##mu', '##l', '##t', '.', 'info', '[SEP]']
['[CLS]', '24', '##ero', '##pi', '##xe', '##l', '.', 'net', '[SEP]']
['[CLS]', 'g', '##des', '##pace', '##s', '.', 'com', '[SEP]']
['[CLS]', 'h', '##d', '##x', '##cl', '##ub', '.', 'com', '[SEP]']
['[CLS]', 'j', '##r', '##f', '##z', '##do', '##h', '##kn', '##t', '##mo', '##p', '##ula', '##m', '##56', '##35', '##ay', '##igs', '##e', '##q', '##r', '##47', '##gh', '##pl', '##fa', '##5', '##l', '##67', '##uo', '##72', '##g', '##c', '##n', '##m', '##cs', '##q', '.', 'cd', '##n', '.', 'amp', '##pro', '##ject', '.', 'org', '[SEP]']
['[CLS]', 'des', '##ix', '##xx', '##tu', '##be', '.', 'pro', '[SEP]']
['[CLS]', 'i', '##pad', '.', 'per', '##fe', '##kt', '##dam', '##en', '.', 'co', '[SEP]']
['[CLS]', 'da', '##ft', '##se', '##x', '.', 'com', '[SEP]']
['[CLS]', 'top', '##de', '##v', '##ka', '.', 'com', '[SEP]']
