In [118]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_union
from random import randint

In [8]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [9]:
lens = train.comment_text.str.len()
lens.mean(), lens.std(), lens.max()

(393.53923958614035, 589.8048967138028, 5000)

In [10]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [12]:
train_text = train[COMMENT]
test_text = test[COMMENT]

all_text = pd.concat([train_text, test_text])

In [13]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [14]:
n = train.shape[0]

word_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, analyzer='word')

char_vec = TfidfVectorizer(ngram_range=(1,2),
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, analyzer='char')

vec = make_union(word_vec, char_vec, n_jobs=2)

vec.fit(all_text)

trn_term_doc = vec.fit_transform(train_text)
test_term_doc = vec.transform(test_text)

print ("done")

done


In [1]:
print (test_term_doc.shape)

print (len(test_text))

print (test_term_doc[3, :])

print (vec.shape)

NameError: name 'test_term_doc' is not defined

In [15]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [16]:
x = trn_term_doc
test_x = test_term_doc

In [17]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [18]:
# preds = np.zeros((len(test), len(label_cols)))

# for i, j in enumerate(label_cols):
#     print('fit', j)
#     m,r = get_mdl(train[j])
#     preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

In [83]:
preds = np.zeros((len(test), len(label_cols)))

dict = {}

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    dict[i] = r, m    
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

print ("done")
    
   

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
done


In [123]:
print (preds)

[[9.99995380e-01 1.89406422e-01 9.99997026e-01 3.34145931e-03
  9.81878091e-01 1.50718187e-01]
 [3.19719864e-03 1.23643236e-03 2.47297091e-03 8.31324723e-05
  2.06864840e-03 3.11695492e-04]
 [9.39562617e-03 5.43719510e-04 4.00473206e-03 6.80091687e-05
  2.35279719e-03 1.65498091e-04]
 ...
 [2.52696095e-03 1.91127977e-04 7.09842034e-03 4.57447092e-05
  1.61276703e-03 1.75031762e-04]
 [6.06461562e-03 1.61854945e-04 1.51568718e-03 5.69839231e-05
  2.25681416e-03 5.64545968e-04]
 [9.55967932e-01 5.07670559e-05 4.25869040e-01 3.28729651e-04
  5.29740257e-02 4.44772751e-04]]


In [112]:
submission = pd.DataFrame.from_dict({'id': test['id']})

for idx, col in enumerate(label_cols):
    submission[col] = preds[:,idx]
    

submission.to_csv('submission.csv', index=False)
print ("done")

done


In [136]:
string = input("Enter some text to check toxicity: ")

Enter some text to check toxicity: Enter some text to check toxicity: :If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.


In [146]:
print (string)
test_xx = vec.transform(list(string))

print (test_xx)

print (len(test_xx))
print ("done")

Enter some text to check toxicity: :If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.
  (0, 143830)	1.0
  (1, 248760)	1.0
  (2, 349891)	1.0
  (3, 143830)	1.0
  (4, 300846)	1.0
  (6, 316897)	1.0
  (7, 259818)	1.0
  (8, 230547)	1.0
  (8, 429044)	1.0
  (9, 143830)	1.0
  (11, 349891)	1.0
  (12, 143830)	1.0
  (13, 417370)	1.0
  (13, 429882)	1.0
  (14, 349891)	1.0
  (16, 349891)	1.0
  (17, 259818)	1.0
  (19, 104478)	1.0
  (20, 179797)	1.0
  (21, 143830)	1.0
  (22, 104478)	1.0
  (23, 217308)	1.0
  (23, 428882)	1.0
  (25, 349891)	1.0
  (26, 259818)	1.0
  :	:
  (217, 428882)	1.0
  (219, 417554)	1.0
  (219, 429948)	1.0
  (220, 259818)	1.0
  (221, 387990)	1.0
  (223, 156493)	1.0
  (223, 428533)	1.0
  (224, 259818)	1.0
  (225, 300846)	1.0
  (227, 417554)	1.0
  (227, 429948)	1.0
  (228, 259818)	1.0
  (229, 387990)	1.0
  (230, 300846)	1.0
  (232, 

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [130]:
predss = np.zeros((1, len(label_cols)))

for i, j in enumerate(label_cols):
    r = dict[i][0]
    m = dict[i][1]
    predss[0,i] = m.predict_proba(test_xx.multiply(r))[0,1]

predSum = predss.sum()
print (sum)
 
y = pd.DataFrame(columns=label_cols)
for idx, col in enumerate(label_cols):
    y.at[0, col] = predss[0,idx]   

print ("Actual value: ", submission.loc[3])  

# 00017563c3f7919a
# 00017563c3f7919a
print (y)


<built-in function sum>
Actual value:  id               00017563c3f7919a
toxic                 0.000975045
severe_toxic          0.000199145
obscene               0.000896116
threat                0.000151842
insult                0.000971429
identity_hate         0.000189095
Name: 3, dtype: object
        toxic severe_toxic     obscene       threat      insult identity_hate
0  0.00238734  5.11974e-05  0.00209207  1.14283e-05  0.00319211    0.00018607


In [None]:
0.003197195	0.001236455	0.00247297	8.31E-05	0.002068648	0.0003117
0.000975044	0.000199152	0.000896114	0.000151879	0.000971434	0.000189096