In [34]:
import pandas as pd
import numpy as np
import time

import seaborn as sns
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport
%matplotlib inline

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

In [36]:
# Load the raw training CSV and discard the 'id' column in one chained
# expression (no in-place mutation), then display the resulting frame.
data_df_train = pd.read_csv('train.csv').drop(columns='id')
data_df_train

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [37]:
import pandas as pd
from sklearn.utils import resample

In [38]:
# Build one string label per row from the six binary target columns, then
# oversample every label combination (with replacement) up to the size of the
# most frequent combination.
#
# The label columns are named explicitly rather than selected with
# `iloc[:, 1:]`: the positional slice also picks up 'combined_labels' itself
# once this cell has run, so re-running the cell would corrupt the labels.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Vectorised string concatenation ("0,1,0,0,1,0") instead of a Python-level
# lambda applied to every row.
label_strings = data_df_train[LABEL_COLUMNS].astype(str)
data_df_train['combined_labels'] = label_strings[LABEL_COLUMNS[0]].str.cat(
    [label_strings[column] for column in LABEL_COLUMNS[1:]],
    sep=',',
)

label_counts = data_df_train['combined_labels'].value_counts()
max_count = label_counts.max()

# NOTE(review): the oversampled frame is split into train/test in a later
# cell, so copies of the same comment can land on both sides of the split and
# inflate held-out scores. Consider splitting first and oversampling only the
# training part.
resampled_data = []
for label, count in label_counts.items():
    label_data = data_df_train[data_df_train['combined_labels'] == label]
    if count < max_count:
        # Minority combination: draw with replacement up to the majority size.
        label_data = resample(label_data,
                              replace=True,
                              n_samples=max_count,
                              random_state=42)
    resampled_data.append(label_data)

# Concatenate all groups and shuffle so that classes are interleaved.
data_df_train_balanced = (
    pd.concat(resampled_data)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

balanced_label_counts = data_df_train_balanced['combined_labels'].value_counts()
print("Balanced label counts:\n", balanced_label_counts)

Balanced label counts:
 combined_labels
0,0,0,0,1,1    143346
1,1,1,0,0,1    143346
1,1,1,0,1,0    143346
0,0,1,0,1,0    143346
1,1,1,1,0,0    143346
1,0,0,0,0,0    143346
1,1,0,0,1,0    143346
1,1,0,1,0,0    143346
1,0,1,0,1,1    143346
0,0,1,1,0,0    143346
1,0,1,0,1,0    143346
1,0,0,0,0,1    143346
1,1,1,1,1,1    143346
0,0,0,1,1,0    143346
0,0,0,0,0,0    143346
0,0,1,0,0,0    143346
1,0,0,1,1,0    143346
0,0,0,0,0,1    143346
0,0,1,1,1,0    143346
0,0,0,1,0,0    143346
1,1,0,0,1,1    143346
1,0,0,1,1,1    143346
1,1,0,0,0,1    143346
1,1,0,1,1,0    143346
1,0,1,1,1,1    143346
1,0,0,0,1,0    143346
1,1,0,0,0,0    143346
1,0,1,1,0,0    143346
0,0,1,0,1,1    143346
1,0,1,0,0,0    143346
0,0,0,0,1,0    143346
1,0,0,1,0,1    143346
1,1,1,0,1,1    143346
1,1,0,1,0,1    143346
1,0,1,0,0,1    143346
1,0,0,0,1,1    143346
1,0,1,1,1,0    143346
0,0,1,0,0,1    143346
1,0,0,1,0,0    143346
1,1,1,1,1,0    143346
1,1,1,0,0,0    143346
Name: count, dtype: int64


In [39]:
# Stratified 80/20 split of the balanced frame, keyed on the combined label
# string so every label combination keeps the same share in both parts.
# NOTE(review): data_df_train_balanced was oversampled with replacement, so
# identical rows may appear in both `train` and `test`.
stratify_labels = data_df_train_balanced['combined_labels']
train, test = train_test_split(
    data_df_train_balanced,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

print(train.shape, test.shape)

(4701748, 8) (1175438, 8)


In [40]:
# Fit the vocabulary and IDF weights on the training split only, so the rows
# held out in `test` do not influence the features they are later scored with.
# The vocabulary is capped at the 1000 most frequent terms.
tfidf = TfidfVectorizer(max_features=1000).fit(train.comment_text)
len(tfidf.vocabulary_)

1000

In [41]:
# Vectorise both splits with the fitted TF-IDF model.
# The matrices are stored as float32 to halve memory relative to the float64
# default. float16 is deliberately avoided: assigning a float16 array to
# `.data` bypasses scipy.sparse's dtype checks (float16 is not among its
# supported dtypes) and discards most of the TF-IDF precision.
X_train = tfidf.transform(train.comment_text).astype(np.float32)
print('X_train shape: %s' % (X_train.shape,))

X_test = tfidf.transform(test.comment_text).astype(np.float32)
print('X_test shape: %s' % (X_test.shape,))

X_train shape: (4701748, 1000)
X_test shape: (1175438, 1000)


In [42]:
# Learn the mapping from combined label strings to integer class ids.
combined_label_values = data_df_train_balanced['combined_labels']
label_encoder = LabelEncoder().fit(combined_label_values)
label_encoder

In [43]:
# Display the label strings learned by the encoder; a string's position in
# this array is the integer id that `transform` assigns to it.
label_encoder.classes_

array(['0,0,0,0,0,0', '0,0,0,0,0,1', '0,0,0,0,1,0', '0,0,0,0,1,1',
       '0,0,0,1,0,0', '0,0,0,1,1,0', '0,0,1,0,0,0', '0,0,1,0,0,1',
       '0,0,1,0,1,0', '0,0,1,0,1,1', '0,0,1,1,0,0', '0,0,1,1,1,0',
       '1,0,0,0,0,0', '1,0,0,0,0,1', '1,0,0,0,1,0', '1,0,0,0,1,1',
       '1,0,0,1,0,0', '1,0,0,1,0,1', '1,0,0,1,1,0', '1,0,0,1,1,1',
       '1,0,1,0,0,0', '1,0,1,0,0,1', '1,0,1,0,1,0', '1,0,1,0,1,1',
       '1,0,1,1,0,0', '1,0,1,1,1,0', '1,0,1,1,1,1', '1,1,0,0,0,0',
       '1,1,0,0,0,1', '1,1,0,0,1,0', '1,1,0,0,1,1', '1,1,0,1,0,0',
       '1,1,0,1,0,1', '1,1,0,1,1,0', '1,1,1,0,0,0', '1,1,1,0,0,1',
       '1,1,1,0,1,0', '1,1,1,0,1,1', '1,1,1,1,0,0', '1,1,1,1,1,0',
       '1,1,1,1,1,1'], dtype=object)

In [44]:
# Encode the combined label strings of each split as integer class ids and
# store them as single-column 2-D arrays (one row per sample).
train_class_ids = label_encoder.transform(train['combined_labels'])
Y_train = train_class_ids[:, np.newaxis]
print(f'Y_train shape: {Y_train.shape}')

test_class_ids = label_encoder.transform(test['combined_labels'])
Y_test = test_class_ids[:, np.newaxis]
print(f'Y_test shape: {Y_test.shape}')

Y_train shape: (4701748, 1)
Y_test shape: (1175438, 1)


In [45]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 1..51 and plot the inertia of each fit.
#
# Fitting 51 models on every training row is impractically slow, so the sweep
# runs on a reproducible random sample of rows. Raise `subset_size` for a more
# faithful (and slower) curve.
subset_size = min(100_000, X_train.shape[0])
subset_rng = np.random.default_rng(42)
subset_rows = np.sort(
    subset_rng.choice(X_train.shape[0], size=subset_size, replace=False)
)

# Cast to float32 before indexing. `copy=False` avoids a copy when X_train is
# already float32, and the row selection creates a new matrix, so X_train is
# never modified by this cell.
X_train_subset_sparse = X_train.astype(np.float32, copy=False)[subset_rows]

cluster_counts = range(1, 52)
inertias = []

for n_clusters in cluster_counts:
    # Fixed seed so the curve is identical on every run.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_train_subset_sparse)
    inertias.append(kmeans.inertia_)

plt.plot(cluster_counts, inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

KeyboardInterrupt: 

In [None]:
from sklearn.decomposition import TruncatedSVD

# Cluster a reproducible sample of the TF-IDF rows into two groups and plot
# them in 2-D, coloured by cluster.
#
# The clustering is fitted on rows of X_train (the previously referenced name
# `data` is not defined anywhere in this notebook). A 1000-column sparse
# matrix cannot be passed to plt.scatter directly, so the sample is first
# projected onto its two leading SVD components.
scatter_rng = np.random.default_rng(42)
n_train_rows = X_train.shape[0]
scatter_rows = np.sort(
    scatter_rng.choice(n_train_rows, size=min(20_000, n_train_rows), replace=False)
)
X_scatter = X_train.astype(np.float32, copy=False)[scatter_rows]

kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scatter)

# TruncatedSVD works on sparse input without densifying it.
X_scatter_2d = TruncatedSVD(n_components=2, random_state=42).fit_transform(X_scatter)

plt.scatter(X_scatter_2d[:, 0], X_scatter_2d[:, 1], c=kmeans.labels_, s=4)
plt.title('KMeans clusters (k=2) on a 2-D SVD projection')
plt.xlabel('SVD component 1')
plt.ylabel('SVD component 2')
plt.show()

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
# Y_train is stored as a single-column 2-D array; scikit-learn classifiers
# expect a 1-D target and emit a DataConversionWarning otherwise, so it is
# flattened at the call site.
clf = MultinomialNB().fit(X_train, Y_train.ravel())

In [None]:
from sklearn.metrics import classification_report

# Report per-class precision/recall/F1 on both splits. The training report
# alone only shows how well the model fits data it has already seen; the test
# report is the one that estimates generalisation.
# `labels` is passed explicitly so that `target_names` always lines up with
# the encoder's class ids, even if a class were missing from a split.
class_ids = np.arange(len(label_encoder.classes_))

print('Train split:')
print(classification_report(Y_train.ravel(), clf.predict(X_train),
                            labels=class_ids, target_names=label_encoder.classes_))

print('Test split:')
print(classification_report(Y_test.ravel(), clf.predict(X_test),
                            labels=class_ids, target_names=label_encoder.classes_))