In [1]:
import torch
import numpy as np
import pandas as pd

from tqdm import trange
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score

from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

In [2]:
data = pd.read_csv("./jigsaw-toxic-comment-classification-challenge/dataset/train.csv", usecols=list(range(1, 8)))
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
print(data.comment_text[0])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


In [4]:
# Categories distribution
(data
 .groupby(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
 .count()
 .sort_values('comment_text', ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,comment_text
toxic,severe_toxic,obscene,threat,insult,identity_hate,Unnamed: 6_level_1
0,0,0,0,0,0,143346
1,0,0,0,0,0,5666
1,0,1,0,1,0,3800
1,0,1,0,0,0,1758
1,0,0,0,1,0,1215
1,1,1,0,1,0,989
1,0,1,0,1,1,618
0,0,1,0,0,0,317
0,0,0,0,1,0,301
1,1,1,0,1,1,265


In [5]:
train_df, valid_df = train_test_split(data, train_size=0.9, random_state=1234)
print(train_df.shape)
print(valid_df.shape)

(143613, 7)
(15958, 7)


In [6]:
train_df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
158913,"""==snesclassics.com and related spam on Wikipe...",0,0,0,0,0,0
18939,"""\n\ndear Mr Manjunath, i was mearely trying t...",0,0,0,0,0,0
31383,Panic attacks \n\nYou need to cite valid refer...,0,0,0,0,0,0
75238,"""\nThanks for the update, Tony. Your tips on ...",0,0,0,0,0,0
112095,""": I'd buy you a cookie if you took a stab at ...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
41239,"""\n\nSarujo, please change it! Listen http://w...",0,0,0,0,0,0
55985,REDIRECT Talk:Peter Whelan (priest),0,0,0,0,0,0
32399,FOCK THEE SHMUCKY THE CAT GO FOCK WITH KILRATH...,1,0,1,0,1,0
82584,"NOT VANDALISM, YOU F.U.C.K.I.N.G B.I.T.C.H!!!",1,0,1,0,1,0


# Text preprocess: TF-IDF

In [7]:
tf_idf = TfidfVectorizer()
X_train = tf_idf.fit_transform(train_df.comment_text.to_numpy())
X_valid = tf_idf.transform(valid_df.comment_text.to_numpy())



In [8]:
X_train # row: number of comments; column: number of words in all dataset (bag of words)
# X_valid


<143613x177889 sparse matrix of type '<class 'numpy.float64'>'
	with 6251659 stored elements in Compressed Sparse Row format>

In [9]:
# tf_idf.get_feature_names()[-10001]
tf_idf.get_feature_names()[177888] 



'𐌴𐌹'

In [10]:
pca_300 = TruncatedSVD(n_components=300, n_iter=5, random_state=1234)
X_300_train = pca_300.fit_transform(X_train) # training SVD matrix
X_300_valid = pca_300.transform(X_valid) # testing SVD matrix

In [11]:
print (X_300_train.shape)
X_300_valid.shape

(143613, 300)


(15958, 300)

In [12]:
train_df.iloc[:, 1:]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
158913,0,0,0,0,0,0
18939,0,0,0,0,0,0
31383,0,0,0,0,0,0
75238,0,0,0,0,0,0
112095,0,0,0,0,0,0
...,...,...,...,...,...,...
41239,0,0,0,0,0,0
55985,0,0,0,0,0,0
32399,1,0,1,0,1,0
82584,1,0,1,0,1,0


In [13]:
class TfTrainDataset(Dataset):
    def __init__(self):
        self.x = X_300_train # the TF-IDF matrix
        self.y = train_df.iloc[:, 1:]

    def __getitem__(self, item):
        label = torch.tensor(self.y.iloc[item].tolist(), dtype=torch.float32)
        x = torch.tensor(self.x[item], dtype=torch.float32)
        return x, label

    def __len__(self):
        return len(self.x)


class TfValidDataset(Dataset):
    def __init__(self):
        self.x = X_300_valid
        self.y = valid_df.iloc[:, 1:]

    def __getitem__(self, item):
        label = torch.tensor(self.y.iloc[item].tolist(), dtype=torch.float32)
        x = torch.tensor(self.x[item], dtype=torch.float32)
        return x, label

    def __len__(self):
        return len(self.x)

In [14]:
batch_size = 16
num_classes = 6
num_epochs = 5

trainset = TfTrainDataset()
valset = TfValidDataset()
dataloader = {"train": DataLoader(trainset, shuffle=True, batch_size=batch_size),
              "val": DataLoader(valset, shuffle=True, batch_size=batch_size)}

In [15]:
class FcModel(nn.Module):
    def __init__(self):
        super(FcModel, self).__init__()
        self.sequential = nn.Sequential(nn.Linear(300, 300),
                                        nn.ReLU(),
                                        nn.Linear(300, 150),
                                        nn.ReLU(),
                                        nn.Linear(150, 6))

    def forward(self, x):
        return self.sequential(x)

In [16]:
fc_model = FcModel()
fc_model

FcModel(
  (sequential): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=150, bias=True)
    (3): ReLU()
    (4): Linear(in_features=150, out_features=6, bias=True)
  )
)

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
fc_model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(fc_model.parameters(), lr=0.001)

In [18]:
for epoch in trange(num_epochs, desc="Epochs"):
    result = []
    for phase in ['train', 'val']:
        if phase == "train":  
            fc_model.train()
            optimizer.step()
        else:  
            fc_model.eval()

        running_loss = 0.0
        running_corrects = 0.0

        for data, target in dataloader[phase]:
            data, target = data.to(device), target.to(device)

            with torch.set_grad_enabled(phase == "train"):
                output = fc_model(data)
                loss = criterion(output, target)
                preds = torch.sigmoid(output).data > 0.5
                preds = preds.to(torch.float32)

                if phase == "train":
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

            running_loss += loss.item() * data.size(0)

        epoch_loss = running_loss / len(dataloader[phase].dataset)
        result.append('{} Loss: {:.4f}'.format(phase, epoch_loss))

Epochs: 100%|██████████| 5/5 [03:49<00:00, 45.87s/it]


In [19]:
result

['train Loss: 0.0484', 'val Loss: 0.0604']

In [53]:
test_x, test_y = valset[100]
test_y

tensor([1., 0., 0., 0., 0., 0.])

In [60]:
len(test_x)

300

In [22]:
# fc_model.to("cpu")
# fc_model.eval()
pred_y = fc_model(test_x)

In [23]:
torch.sigmoid(pred_y).data

tensor([0.8937, 0.0774, 0.5220, 0.0015, 0.7095, 0.3086])

In [24]:
torch.sigmoid(pred_y).data > 0.5

tensor([ True, False,  True, False,  True, False])

In [25]:
test_x

tensor([ 2.9501e-02, -1.4253e-02, -1.0083e-02,  4.9778e-04,  1.5687e-02,
         3.3898e-02, -1.1791e-02, -3.7645e-02,  3.8984e-02, -4.0031e-02,
         5.1021e-02,  2.9936e-02,  7.4644e-03,  2.5222e-02, -6.8358e-03,
         2.1959e-02,  1.8704e-02,  2.0543e-03,  1.1728e-02,  7.4335e-03,
         2.5783e-02, -2.0968e-03, -2.1280e-02, -1.0311e-02, -5.2723e-03,
        -1.3524e-02,  6.4809e-03, -5.7541e-03,  1.9628e-02,  1.1858e-02,
        -7.9657e-03,  4.4858e-03,  9.6199e-04, -1.8739e-02,  2.5760e-02,
        -2.7869e-02,  1.3316e-02, -5.1893e-02,  3.6947e-02, -5.4700e-02,
         3.8573e-02,  8.1892e-02,  4.3651e-03,  3.5652e-02, -3.1598e-02,
        -3.0015e-03,  2.3394e-03, -3.4103e-02, -1.0428e-01, -7.4609e-02,
        -3.2771e-02,  5.8752e-04, -3.1034e-02, -4.2758e-02,  1.4308e-02,
         4.3556e-02,  3.1214e-03,  3.8809e-02,  8.7630e-03, -3.2850e-02,
        -4.8982e-02, -4.6636e-02,  3.3463e-03,  2.6178e-02, -1.9793e-04,
         2.6503e-03, -1.9475e-02, -1.5966e-02, -4.9

# Get the information in gen_model.ipynb

In [26]:
%run gen_model.ipynb


(143613, 7)
(15958, 7)




wachowskis
(41, 6)


In [27]:
w1 = np.argmax(np.random.dirichlet(distr[40], 30), axis=1) # class 0~40

In [28]:
sentence = []
for i in w1:
    sentence.append(tf_idf.get_feature_names_out()[i])

In [29]:
synthetic_s = " ".join(sentence)
synthetic_s

'kiro pragmatics atlantean bestowing affect wackywizzlepenis reasonalbe ofencive grisone snooty 631687958437871616 discussfull internally lankybugger 1891 warmed iceo plumcouch liberalisation horvat transduction kavirajamarga wold negev bowden rhetorically bredolab progenitor amicus pprivate'

In [30]:
# X_valid = tf_idf.transform(valid_df.comment_text.to_numpy()) 
# X_300_valid = pca_300.transform(X_valid)
# class TfValidDataset(): X_300_valid 
# valset = TfValidDataset() 
# test_x = valset[100] 
# pred_y = fc_model(test_x)
# import tensorflow as tf

synthetic_l = [synthetic_s, 1, 1, 1, 1, 1, 1]
# synthetic_l = [str (item) for item in synthetic_l]
synthetic_df = pd.DataFrame([synthetic_l], columns=['comment_text', 'toxic', "severe_toxic", 
                                                  "obscene", "threat", "insult", "identity_hate"])
synthetic = tf_idf.transform(synthetic_df.comment_text.to_numpy())
synthetic_300 = pca_300.transform(synthetic)

type(X_300_valid)
len(X_300_valid)
X_300_valid.shape
synthetic_df
synthetic.shape
synthetic_300.shape



(1, 300)

In [31]:
# synthetic_df
# synthetic_l
# synthetic
synthetic_300.shape
# type(synthetic_300)

(1, 300)

In [32]:
class TfSynDataset(Dataset):
    def __init__(self):
        self.x = synthetic_300
        self.y = synthetic_df.iloc[:, 1:]

    def __getitem__(self, item):
        label = torch.tensor(self.y.iloc[item].tolist(), dtype=torch.float32)
        x = torch.tensor(self.x[item], dtype=torch.float32)
        return x, label

    def __len__(self):
        return len(self.x)

In [33]:
valset_syn = TfSynDataset() 
test_x_syn, test_y_syn = valset_syn[0]
test_x_syn.shape
test_y_syn.shape

test_y_syn
# test_x_syn
# len(test_x_syn)
# len(valset_syn)
# type(valset_syn)
# test_x

tensor([1., 1., 1., 1., 1., 1.])

In [34]:
pred_y_syn = fc_model(test_x_syn)
pred_y_syn
torch.sigmoid(pred_y_syn).data

tensor([0.1431, 0.0036, 0.0635, 0.0006, 0.0360, 0.0072])

In [35]:
torch.sigmoid(pred_y_syn).data > 0.5
# try to have similar dataframe as LDA 171

tensor([False, False, False, False, False, False])

# Train and make predictions based on the Synthetic data from LDA

In [36]:
df_syn = pd.read_csv("./synthetic_LDA.csv")
df_syn.head()

Unnamed: 0.1,Unnamed: 0,0,1,pred_LDA_tpc_1,pred_LDA_tpc_2,pred_LDA_tpc_3,pred_LDA_tpc_4,pred_LDA_tpc_5,pred_LDA_tpc_6,syn_tpc_1,syn_tpc_2,syn_tpc_3,syn_tpc_4,syn_tpc_5,syn_tpc_6
0,0,"[0.0579, 0.7505, 0.0, 0.1916, 0.0, 0.0]","['james', 'present', 'malik', 'photo', 'film',...",0.15241,0.54818,0.031261,0.149621,0.06391,0.054619,0.0579,0.7505,0.0,0.1916,0.0,0.0
1,1,"[0.0159, 0.1895, 0.0, 0.0272, 0.766, 0.0014]","['languages', 'mistakes', 'bad_admin', 'langua...",0.132412,0.215012,0.026278,0.064239,0.521346,0.04071,0.0159,0.1895,0.0,0.0272,0.766,0.0014
2,2,"[0.0077, 0.1646, 0.0275, 0.0397, 0.0376, 0.723]","['hearts', 'sung', 'greater', 'sack', 'chink',...",0.153572,0.120299,0.030348,0.151554,0.125006,0.419216,0.0077,0.1646,0.0275,0.0397,0.0376,0.723
3,3,"[0.1697, 0.1217, 0.7048, 0.0017, 0.0015, 0.0005]","['croatian', 'prick', 'film', 'body', 'little'...",0.24027,0.195757,0.400764,0.070901,0.050093,0.042216,0.1697,0.1217,0.7048,0.0017,0.0015,0.0005
4,4,"[0.8724, 0.0344, 0.0586, 0.0342, 0.0, 0.0005]","['something', 'administrator', 'little', 'mod'...",0.753239,0.013188,0.028092,0.114008,0.049017,0.042455,0.8724,0.0344,0.0586,0.0342,0.0,0.0005


In [37]:
sent_lst_raw = df_syn.iloc[0, 2].strip('][').split(', ')
sent_lst = [word.strip('\'\'') for word in sent_lst_raw]
" ".join(sent_lst)


'james present malik photo film modern present wanker_wanker present eat film wanker_wanker confirmed spam confirmed fat_jew confirmed malik remind raw hanibal unnecessary turkish religious pleasee semite'

In [38]:
df_syn['synthetic_comment'] = "TBA"
for row_ind in range(len(df_syn)):
    sent_lst_raw = df_syn.iloc[row_ind, 2].strip('][').split(', ')
    sent_lst = [word.strip('\'\'') for word in sent_lst_raw]
    df_syn.iloc[row_ind, -1] = " ".join(sent_lst)

df_syn['synthetic_comment']
df_syn

Unnamed: 0.1,Unnamed: 0,0,1,pred_LDA_tpc_1,pred_LDA_tpc_2,pred_LDA_tpc_3,pred_LDA_tpc_4,pred_LDA_tpc_5,pred_LDA_tpc_6,syn_tpc_1,syn_tpc_2,syn_tpc_3,syn_tpc_4,syn_tpc_5,syn_tpc_6,synthetic_comment
0,0,"[0.0579, 0.7505, 0.0, 0.1916, 0.0, 0.0]","['james', 'present', 'malik', 'photo', 'film',...",0.152410,0.548180,0.031261,0.149621,0.063910,0.054619,0.0579,0.7505,0.0000,0.1916,0.0000,0.0000,james present malik photo film modern present ...
1,1,"[0.0159, 0.1895, 0.0, 0.0272, 0.766, 0.0014]","['languages', 'mistakes', 'bad_admin', 'langua...",0.132412,0.215012,0.026278,0.064239,0.521346,0.040710,0.0159,0.1895,0.0000,0.0272,0.7660,0.0014,languages mistakes bad_admin languages somewhe...
2,2,"[0.0077, 0.1646, 0.0275, 0.0397, 0.0376, 0.723]","['hearts', 'sung', 'greater', 'sack', 'chink',...",0.153572,0.120299,0.030348,0.151554,0.125006,0.419216,0.0077,0.1646,0.0275,0.0397,0.0376,0.7230,hearts sung greater sack chink turkish notices...
3,3,"[0.1697, 0.1217, 0.7048, 0.0017, 0.0015, 0.0005]","['croatian', 'prick', 'film', 'body', 'little'...",0.240270,0.195757,0.400764,0.070901,0.050093,0.042216,0.1697,0.1217,0.7048,0.0017,0.0015,0.0005,croatian prick film body little palace rohl pr...
4,4,"[0.8724, 0.0344, 0.0586, 0.0342, 0.0, 0.0005]","['something', 'administrator', 'little', 'mod'...",0.753239,0.013188,0.028092,0.114008,0.049017,0.042455,0.8724,0.0344,0.0586,0.0342,0.0000,0.0005,something administrator little mod proof live ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,115,"[0.1607, 0.299, 0.5345, 0.0, 0.0001, 0.0057]","['prick', 'wanker_wanker', 'huge', 'shovel', '...",0.160478,0.230286,0.403433,0.078970,0.065269,0.061560,0.1607,0.2990,0.5345,0.0000,0.0001,0.0057,prick wanker_wanker huge shovel yall fat_jew m...
116,116,"[0.0433, 0.0831, 0.0021, 0.2235, 0.059, 0.589]","['pride', 'interfering', 'shock', 'shared_ip',...",0.204000,0.070690,0.026505,0.139487,0.146643,0.412674,0.0433,0.0831,0.0021,0.2235,0.0590,0.5890,pride interfering shock shared_ip wwe next for...
117,117,"[0.0335, 0.0244, 0.0233, 0.7902, 0.0941, 0.0345]","['doesnt', 'aint', 'amor', 'motherfucking', 'b...",0.166020,0.013627,0.033512,0.667258,0.071898,0.047682,0.0335,0.0244,0.0233,0.7902,0.0941,0.0345,doesnt aint amor motherfucking bite lying id e...
118,118,"[0.0041, 0.5971, 0.2041, 0.1688, 0.0003, 0.0257]","['turkish', 'wanker_wanker', 'present', 'turki...",0.123534,0.399221,0.130337,0.243468,0.055310,0.048129,0.0041,0.5971,0.2041,0.1688,0.0003,0.0257,turkish wanker_wanker present turkish spam mod...


In [39]:
# synthetic_l = [synthetic_s, 1, 1, 1, 1, 1, 1]
# # synthetic_l = [str (item) for item in synthetic_l]
# synthetic_df = pd.DataFrame([synthetic_l], columns=['comment_text', 'toxic', "severe_toxic", 
#                                                   "obscene", "threat", "insult", "identity_hate"])
synthetic_LDA = tf_idf.transform(df_syn.synthetic_comment.to_numpy())
synthetic_LDA_300 = pca_300.transform(synthetic_LDA)



In [40]:
synthetic_LDA.shape
synthetic_LDA_300.shape

(120, 300)

In [41]:
class TfSynLDADataset(Dataset):
    def __init__(self):
        self.x = synthetic_LDA_300
        self.y = df_syn.iloc[:, 9:15]

    def __getitem__(self, item):
        label = torch.tensor(self.y.iloc[item].tolist(), dtype=torch.float32)
        x = torch.tensor(self.x[item], dtype=torch.float32)
        return x, label

    def __len__(self):
        return len(self.x)

In [42]:
valset_syn_LDA = TfSynLDADataset() 
test_x_syn_LDA, test_y_syn_LDA = valset_syn_LDA[119]

In [43]:
pred_y_syn_LDA = fc_model(test_x_syn_LDA)
pred_y_syn_LDA

tensor([-0.0693, -4.1544, -0.9575, -6.7896, -1.7050, -4.0580],
       grad_fn=<AddBackward0>)

In [44]:
test_y_syn_LDA

tensor([0.0115, 0.0097, 0.2429, 0.0200, 0.0870, 0.6290])

In [45]:
torch.sigmoid(pred_y_syn).data > 0.5
# ?q: why all false? suppose 119  shall have ind=2 & 5 equal to 1

tensor([False, False, False, False, False, False])