In [16]:
import configparser
import pandas as pd
import re
# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# import and instantiate the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

config = configparser.ConfigParser()
config.read('./config.cfg')
train_df = pd.read_csv(config['FILES']['TRAIN'])
test_df = pd.read_csv(config['FILES']['TEST'])
print(test_df.shape)
print(test_df['id'][0:5])

test_label = pd.read_csv(config['FILES']['LABEL'])
print(test_label.shape)
print(test_label['id'][0:5])

cols_name = ['obscene', 'insult', 'toxic', 'severe_toxic', 'identity_hate', 'threat']





0    00001cee341fdb12
1    0000247867823ef7
2    00013b17ad220c46
3    00017563c3f7919a
4    00017695ad8997eb
Name: id, dtype: object
(153164, 2)
(153164, 7)
0    00001cee341fdb12
1    0000247867823ef7
2    00013b17ad220c46
3    00017563c3f7919a
4    00017695ad8997eb
Name: id, dtype: object


In [17]:
def clean_text(text):
    text = text.lower()
    for change in [(r"what's", "what is "), (r"\'s", " "), (r"\'ve", " have "), (r"can't", "cannot "), 
                   (r"n't", " not "), (r"i'm", "i am "), (r"\'re", " are "), (r"\'d", " would "), (r"\'ll", " will "), (r"\'scuse", " excuse ")]:
        text = re.sub(change[0], change[1], text)
    text = text.strip(' ')
    return text


train_df['comment_text'] = train_df['comment_text'].map(lambda com: clean_text(com))
test_df['comment_text'] = test_df['comment_text'].map(lambda com: clean_text(com))

x_train = train_df.comment_text
x_test = test_df.comment_text
print(x_train.shape, x_test.shape)

# vect = TfidfVectorizer(max_df=60000, min_df=2, stop_words='english')
vect = TfidfVectorizer(max_features=5000, stop_words='english')
# learn the vocabulary in the training data, then use it to create a document-term matrix
x_train_tfid = vect.fit_transform(x_train)
# transform the test data using the earlier fitted vocabulary, into a document-term matrix
x_test_tfid = vect.transform(x_test)



(159571,) (153164,)


In [26]:
logreg = LogisticRegression(C=12.0)

# create submission file
submission_binary = pd.DataFrame(index=test_df['id'], columns=cols_name)

for label in cols_name:
    print('... Processing {}'.format(label))
    y = train_df[label]
    yt = test_label[label]
    # train the model using X_dtm & y
    logreg.fit(x_train_tfid, y)
    # compute the training accuracy
    y_pred_X = logreg.predict(x_train_tfid)
    y_pred_t = logreg.predict(x_test_tfid)
    print('Training accuracy is {}'.format(accuracy_score(y, y_pred_X)))
    # compute the predicted probabilities for X_test_dtm
    test_y_prob = logreg.predict_proba(x_test_tfid)[:, 1]
    submission_binary[label] = test_y_prob
    print('Test accuracy is {}'.format(accuracy_score(yt, y_pred_t)))


... Processing obscene


Training accuracy is 0.9832300355327722
Test accuracy is 0.4023791491473192
... Processing insult


Training accuracy is 0.9755469352200588
Test accuracy is 0.4018503042490402
... Processing toxic


Training accuracy is 0.9639846839337975
Test accuracy is 0.3860894204904547
... Processing severe_toxic


Training accuracy is 0.9920787611784097
Test accuracy is 0.4145425818077355
... Processing identity_hate


Training accuracy is 0.9939713356436947
Test accuracy is 0.4136872894413831
... Processing threat


Training accuracy is 0.9981199591404453
Test accuracy is 0.4160050664647045


In [25]:
submission_binary


Unnamed: 0_level_0,obscene,insult,toxic,severe_toxic,identity_hate,threat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000997932d777bf,0.000367,0.005323,0.009256,0.001810,0.000221,0.000187
000103f0d9cfb60f,0.000241,0.001751,0.009735,0.000194,0.000139,0.000125
000113f07ec002fd,0.005728,0.018129,0.057962,0.001066,0.000105,0.000043
0001b41b1c6bb37e,0.000234,0.000108,0.000339,0.000086,0.000165,0.000010
0001d958c54c6e35,0.055420,0.032446,0.011630,0.000225,0.002878,0.000130
00025465d4725e87,0.003195,0.007433,0.014378,0.000065,0.000416,0.000114
0002bcb3da6cb337,0.995259,0.971146,0.999782,0.270467,0.002014,0.002101
00031b1e95af7921,0.021463,0.028745,0.149344,0.000053,0.000059,0.000041
00037261f536c51d,0.010076,0.032714,0.143440,0.000009,0.000109,0.000054
00040093b2687caa,0.005563,0.006019,0.010736,0.001231,0.000190,0.000217
