In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
# Directory that holds train.csv and test.csv. Kept in one constant so the
# location of these two inputs can be changed without editing each path.
DATA_DIR = '~/Desktop'

# `train` carries the comment text plus the label columns used for fitting;
# `test` carries the comments that are scored for the submission.
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [3]:
# Character count of every training comment.
# This cell runs before the missing-value fill further down, so a missing
# comment would still be NaN (a float) here and len() would raise TypeError.
# Substitute the same "unknown" placeholder the notebook uses for missing text
# before measuring; `.str.len()` is the vectorised form of applying len() per row.
train['len'] = train['comment_text'].fillna("unknown").str.len()

In [4]:
# Target columns of `train`; the modelling loop fits and predicts once per entry.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [5]:
# Name of the column that holds the raw comment text in both frames.
COMMENT = 'comment_text'

# Replace missing comments with a placeholder string.
# The filled column is assigned back rather than calling
# `frame[COMMENT].fillna(..., inplace=True)`: the in-place form mutates the
# object returned by the column selection, which is not guaranteed to be the
# DataFrame's own data (pandas warns about this chained pattern, and with
# copy-on-write enabled the DataFrame is left unchanged).
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [6]:
from nltk.corpus import stopwords
import string
# Preview the first ten entries of NLTK's English stopword list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the English stopword set (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, fetching them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(mess, stop_words=None):
    """Tokenise one message: strip punctuation, split on whitespace, drop stopwords.

    Parameters
    ----------
    mess : str
        Raw text of a single message.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stopword list,
        which is fetched on first use and then reused.

    Returns
    -------
    list of str
        The remaining tokens in their original order and original casing;
        only the stopword comparison is case-insensitive.
    """
    if stop_words is None:
        # Previously the stopword list was requested again, and scanned
        # linearly, for every single token; a cached set makes each
        # membership test constant-time.
        stop_words = _english_stopwords()

    # Remove all punctuation characters in one pass.
    nopunc = mess.translate(_PUNCTUATION_TABLE)

    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Build the bag-of-words vocabulary from the training comments, using
# text_process as the analyzer that turns each comment into tokens.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(train['comment_text'])

# Number of distinct tokens in the learned vocabulary.
vocabulary_size = len(bow_transformer.vocabulary_)
print(vocabulary_size)

214990


In [10]:
# Encode each training comment as a row of token counts over the fitted vocabulary.
messages_train = bow_transformer.transform(
    train['comment_text'],
)

In [13]:
# Report the dimensions of the count matrix and how many entries are stored.
for caption, value in (
    ('Shape of Sparse Matrix: ', messages_train.shape),
    ('Amount of Non-Zero occurences: ', messages_train.nnz),
):
    print(caption, value)

Shape of Sparse Matrix:  (95851, 214990)
Amount of Non-Zero occurences:  2754941


In [15]:
# Percentage of matrix cells that hold a non-zero count.
# This quantity is the matrix *density*, not its sparsity, and it is far below
# 1%, so rounding it to an integer always printed 0. Report it under the right
# label with enough decimals to be informative.
n_cells = messages_train.shape[0] * messages_train.shape[1]
density_pct = 100.0 * messages_train.nnz / n_cells

# Same value under the original variable name, retained in case another cell reads it.
sparsity = density_pct

print('density: {:.4f}% of cells are non-zero'.format(density_pct))

sparsity: 0


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the inverse-document-frequency weights from the training count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer = tfidf_transformer.fit(messages_train)


In [17]:
# Re-weight the training counts with the fitted TF-IDF transformer.
messages_tfidf = tfidf_transformer.transform(
    messages_train,
)

# Display the dimensions of the weighted matrix.
print(messages_tfidf.shape)

(95851, 214990)


In [20]:
# Encode the test comments with the vocabulary that was fitted on the
# training comments (transform only; nothing is re-fitted here).
messages_test = bow_transformer.transform(
    test['comment_text'],
)

In [21]:
# Apply the TF-IDF weights learned from the training matrix to the test counts.
messages_tfidf_test = tfidf_transformer.transform(
    messages_test,
)

# Display the dimensions of the weighted test matrix.
print(messages_tfidf_test.shape)

(226998, 214990)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# A single estimator object, refit for each label in turn.
logreg = LogisticRegression(C=9.0)

# Template for the submission; each label column is overwritten in the loop below.
submission_binary = pd.read_csv('~/Desktop/sample_submission.csv')

for label in label_cols:
    print('... Processing {}'.format(label))

    # Fit on the TF-IDF training matrix against this label's column.
    targets = train[label]
    logreg.fit(messages_tfidf, targets)

    # Accuracy on the same rows the model was just fitted on (not a held-out score).
    train_accuracy = accuracy_score(targets, logreg.predict(messages_tfidf))
    print('Training accuracy is {}'.format(train_accuracy))

    # Keep only the second probability column for the test comments
    # (presumably the positive class; confirm against logreg.classes_).
    class_probabilities = logreg.predict_proba(messages_tfidf_test)
    submission_binary[label] = class_probabilities[:, 1]


... Processing toxic
Training accuracy is 0.9885447204515342
... Processing severe_toxic
Training accuracy is 0.9957746919698282
... Processing obscene
Training accuracy is 0.9933438357450627
... Processing threat
Training accuracy is 0.9984455039592701
... Processing insult
Training accuracy is 0.9909025466609633
... Processing identity_hate
Training accuracy is 0.9965362907011924


In [23]:
# Show the first five rows of the filled-in submission table.
submission_binary.head(5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.019406,0.003014,0.012449,0.001547,0.011991,0.003357
1,6102620,0.001047,0.000385,0.000768,0.000385,0.000372,0.000511
2,14563293,0.001278,0.000148,0.000624,0.000148,0.000764,0.000175
3,21086297,0.023672,0.002375,0.004459,0.001248,0.008487,0.001884
4,22982444,0.04302,0.004289,0.014856,0.001575,0.0185,0.003968


In [25]:
from IPython.display import FileLink


# Write the predictions to disk without the DataFrame index column.
submission_binary.to_csv(
    'kagsubm.csv',
    index=False,
)

# Render a link to the written file as the cell's output.
FileLink('kagsubm.csv')