In [None]:
# Install fastText into the kernel's environment (imported further down as `fasttext`).
# NOTE(review): the version is unpinned, so a fresh run may pick up a different release — consider pinning.
%pip install fasttext

In [2]:
# Mount Google Drive at /content/drive; the zipped CSVs unpacked in the next
# cells are read from /content/drive/MyDrive/Data_files/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Loading & preparing data

In [3]:
!unzip /content/drive/MyDrive/Data_files/Porn_data/train.csv.zip

Archive:  /content/drive/MyDrive/Data_files/Porn_data/train.csv.zip
  inflating: train.csv               


In [4]:
!unzip /content/drive/MyDrive/Data_files/Porn_data/test.csv.zip

Archive:  /content/drive/MyDrive/Data_files/Porn_data/test.csv.zip
  inflating: test.csv                


In [5]:
import numpy as np
import pandas as pd

# Read both unzipped CSVs into DataFrames; the first column of each file is
# used as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [6]:
# Positional 80/20 split: the first 80% of rows train the model, the rest validate it.
# NOTE(review): rows are not shuffled, so this assumes the file order carries
# no signal (e.g. is not sorted by label or site) — confirm.
split_idx = round(len(train_data) * 0.8)

# .copy() makes each frame independent of its parent. Later cells add and
# overwrite columns on these frames; on a bare .iloc slice / alias that raises
# SettingWithCopyWarning and can leak the changes back into train_data/test_data.
train_df = train_data.iloc[:split_idx].copy()
val_df = train_data.iloc[split_idx:].copy()
test_df = test_data.copy()

In [None]:
# Adding the "__label__" prefix to 'label', required by fasttext.
# assign() returns a new frame instead of writing a column into what may be a
# slice of train_data (which triggers SettingWithCopyWarning); the string
# concatenation is vectorised rather than a per-row lambda.
train_df = train_df.assign(label_ft="__label__" + train_df["label"].astype(str))
val_df = val_df.assign(label_ft="__label__" + val_df["label"].astype(str))

In [9]:
train_df.head()

Unnamed: 0_level_0,url,title,label,label_ft
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0,__label__0
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0,__label__0
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0,__label__0
3,colorbox.spb.ru,Не Беси Меня Картинки,0,__label__0
4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0,__label__0


In [45]:
# A bit of simple preprocessing

import re


def _clean_text_columns(df):
    """Return a new frame with the ``title`` and ``url`` columns normalised.

    * ``title``: cast to ``str`` and stripped of tab characters, because tab is
      the field separator of the exported text file.
    * ``url``: dots replaced by spaces, so each part of the host name becomes
      its own whitespace-separated token.

    The input frame is not modified; using ``assign`` avoids writing columns
    into a slice of another frame (SettingWithCopyWarning).
    """
    return df.assign(
        title=df["title"].apply(lambda x: re.sub('\t', '', str(x))),
        url=df["url"].apply(lambda x: ' '.join(x.split('.'))),
    )


train_df = _clean_text_columns(train_df)
val_df = _clean_text_columns(val_df)

# Column order defines the layout of the exported file: url, title, label.
# Note: `train_data` is rebound here and no longer refers to the raw frame
# loaded from train.csv.
train_data = train_df[['url', 'title', 'label_ft']]
val_data = val_df[['url', 'title', 'label_ft']]

In [46]:
# Saving data into text files for fasttext


def _write_fasttext_file(frame, path):
    """Write ``frame`` as one example per line with tab-separated columns.

    The lines are written verbatim instead of through ``DataFrame.to_csv``:
    the CSV writer's default quoting wraps any field containing a double
    quote in quotes and doubles the inner ones, which would put quote
    characters into the training tokens that are absent from the raw text
    used at prediction time.

    Args:
        frame: DataFrame whose columns are written left to right; each value
            is converted with ``str()``. The index is not written.
        path: destination file path (UTF-8).
    """
    with open(path, "w", encoding="utf-8") as out:
        for row in frame.itertuples(index=False, name=None):
            out.write("\t".join(map(str, row)) + "\n")


_write_fasttext_file(train_data, "train_fasttext.txt")
_write_fasttext_file(val_data, "val_fasttext.txt")

In [16]:
!cut -f2 train_fasttext.txt | sort | uniq -c

  94903 __label__0
  13344 __label__1


### Training fasttext classifier

In [18]:
import fasttext

In [17]:
def print_results(N, p, r):
    """Print precision, recall and their harmonic mean (F1), tab-separated.

    Args:
        N: not used in the output; accepted so that a 3-tuple such as the
            result of ``model.test(...)`` can be star-unpacked into the call.
        p: precision.
        r: recall.

    F1 is reported as 0 when ``p + r`` is 0, instead of raising
    ZeroDivisionError.
    """
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    print("Precision\t{:.5f}".format(p))
    print("Recall\t{:.5f}".format(r))
    print("F1\t{:.5f}".format(f1))

In [47]:
%%time
model_big = fasttext.train_supervised(
    input="train_fasttext.txt",
    minCount=3,      # отсеиваем редкие токены
    minn=3, maxn=5,  # диапазон для символьных нграмм
    wordNgrams=4,    # используем словесные нграммы размера 2
    dim=64           # размер вектора
)                    # логи обучения пишутся в терминале

CPU times: user 40.5 s, sys: 1.46 s, total: 42 s
Wall time: 42.3 s


In [48]:
# Evaluate on the held-out validation file; the tuple returned by test() is
# star-unpacked into print_results(N, p, r).
# NOTE(review): presumably (number of examples, precision@1, recall@1) — confirm against the fastText docs.
print_results(*model_big.test('val_fasttext.txt'))

Precision	0.996
Recall	0.996
F1	0.996


### Making test predictions

In [49]:
%%time
test_df['title_prep'] = test_df['title'].apply(lambda x: re.sub('\t', '', str(x)))
test_df['label'] = test_df['title_prep'].apply(lambda line: int(model_big.predict(line)[0][0][-1]))
test_df["url"] = test_df['url'].apply(lambda x: ' '.join(x.split('.')))

CPU times: user 9.74 s, sys: 27.5 ms, total: 9.77 s
Wall time: 10.4 s


In [50]:
# to_csv() returns None when given a path, so binding its result left `subm`
# empty. Keep the submission frame itself in `subm` and write it separately;
# the frame's index is written as the first column, followed by 'label'.
subm = test_df[['label']]
subm.to_csv("fasttext-subm-3.csv")