
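### Objective: Model 1 - Logistic Regression

Fit one TF-IDF + logistic-regression classifier per toxicity label and write the predicted probabilities for the test comments to `submission.csv`.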


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import logging
from sklearn.model_selection import cross_val_score

In [2]:
# Read the pre-cleaned train and test splits from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)

In [7]:
# Preview the first rows of the training frame to check which columns were loaded.
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [19]:
# Preview the first rows of the test frame (id and comment text only, no label columns).
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources zawe ashton on lapland
3,00017563c3f7919a,if you have a look back at the source the info...
4,00017695ad8997eb,i do not anonymously edit articles at all


In [12]:
# Remove the leftover 'Unnamed: 0' column (presumably the index that was written
# out when the cleaned CSVs were saved -- TODO confirm against the cleaning notebook).
# errors='ignore' keeps this cell idempotent: it reassigns train/test from
# themselves, so without it a second run raises KeyError because the column
# is already gone.
train = train.drop(columns='Unnamed: 0', errors='ignore')
test = test.drop(columns='Unnamed: 0', errors='ignore')

In [21]:
# Test rows whose comment_text is missing.
# Author's note: these are the rows whose weird special characters were stripped
# during the data-cleaning process, which left the comment empty.
no_comment = test[test['comment_text'].isnull()]
no_comment

Unnamed: 0,id,comment_text
4856,08323f2f0a13c416,
6533,0af7effe1fd4b873,
8655,0e81ddea7fec8962,
15672,1a56b331b2acc0da,
16060,1af5554667a8c913,
18377,1ee703ce84fb9a34,
28067,2ebb8aefc22c92f9,
31965,35255b6638eec7b0,
34173,38dbade4aa845e09,
34304,390e5fde2cc12223,


In [24]:
# Keep only the test rows with no missing values; features and predictions
# below are built from this subset, so the dropped ids get no prediction.
test_subset = test.dropna(axis=0, how='any')

In [26]:
# Size of the full test frame, for comparison with test_subset.
test.shape

(153164, 2)

In [27]:
# Size after dropping rows with missing values; the difference from test.shape is the number of rows removed.
test_subset.shape

(153120, 2)

In [16]:
# Name of the text column that is vectorized for both train and test.
field = 'comment_text'

In [28]:
# TF-IDF over unigrams and bigrams; max_df=0.95 ignores terms that appear in
# more than 95% of the training documents.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95, ngram_range=(1, 2))

# Fit on the training text only and keep the matrix that fit_transform returns,
# instead of discarding it and vectorizing the training corpus a second time.
train_feature_set = tfidf_vectorizer.fit_transform(train[field].values)

# Reuse the vocabulary and IDF weights learned from train for the test text.
test_feature_set = tfidf_vectorizer.transform(test_subset[field].values)

In [30]:
# Re-check the training frame after the 'Unnamed: 0' column was dropped.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [31]:
# Label columns of the training frame; one binary classifier is fit per column.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [34]:
# Independent logistic regression per label: report the mean 3-fold ROC AUC on
# the training features, then refit on all training rows and store the
# predicted probability for each test row.
submission = pd.DataFrame.from_dict({'id': test_subset['id']})
scores = []

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(
        C=5,
        penalty='l2',
        solver='liblinear',
        max_iter=1000,
        random_state=0,
        verbose=1,
    )

    # cross_val_score fits clones of the estimator, so `classifier` itself is
    # still unfitted until the explicit fit below.
    fold_aucs = cross_val_score(
        classifier,
        train_feature_set,
        train_target,
        scoring='roc_auc',
        cv=3,
    )
    cv_score = np.mean(fold_aucs)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_feature_set, train_target)
    # Column 1 of predict_proba is the second class in classifier.classes_
    # (the positive class, assuming 0/1 labels as in the train preview).
    test_probabilities = classifier.predict_proba(test_feature_set)
    submission[class_name] = test_probabilities[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

# The submission file is written by the final cell of the notebook.

[LibLinear][LibLinear][LibLinear]CV score for class toxic is 0.9706975653236167
[LibLinear][LibLinear][LibLinear][LibLinear]CV score for class severe_toxic is 0.9839253705141088
[LibLinear][LibLinear][LibLinear][LibLinear]CV score for class obscene is 0.983082080144288
[LibLinear][LibLinear][LibLinear][LibLinear]CV score for class threat is 0.9868736044365808
[LibLinear][LibLinear][LibLinear][LibLinear]CV score for class insult is 0.9763153994405829
[LibLinear][LibLinear][LibLinear][LibLinear]CV score for class identity_hate is 0.9723038113308013
[LibLinear]Total CV score is 0.9788663051983297


In [35]:
# Preview the predicted probabilities: one row per test id, one column per label.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999768,0.249156,0.999192,0.062819,0.981431,0.459343
1,0000247867823ef7,0.00392,0.001082,0.001817,0.000324,0.004269,0.00179
2,00013b17ad220c46,0.020089,0.003034,0.010786,0.001089,0.011223,0.00309
3,00017563c3f7919a,0.001063,0.000666,0.001035,0.000437,0.000819,0.000382
4,00017695ad8997eb,0.018417,0.002793,0.007288,0.000978,0.006879,0.001787


In [40]:
# Show the first ten test comments so they can be read next to their predicted probabilities.
test_subset.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources zawe ashton on lapland
3,00017563c3f7919a,if you have a look back at the source the info...
4,00017695ad8997eb,i do not anonymously edit articles at all
5,0001ea8717f6de06,thank you for understanding i think very highl...
6,00024115d4cbde0f,please do not add nonsense to wikipedia such e...
7,000247e83dcc1211,dear god this site is horrible
8,00025358d4737918,only a fool can believe in such numbers the co...
9,00026d1092fe71cc,double redirects when fixing double redirects ...


In [39]:
# Predicted probabilities for the same first ten test rows.
submission.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999768,0.249156,0.999192,0.062819,0.981431,0.459343
1,0000247867823ef7,0.00392,0.001082,0.001817,0.000324,0.004269,0.00179
2,00013b17ad220c46,0.020089,0.003034,0.010786,0.001089,0.011223,0.00309
3,00017563c3f7919a,0.001063,0.000666,0.001035,0.000437,0.000819,0.000382
4,00017695ad8997eb,0.018417,0.002793,0.007288,0.000978,0.006879,0.001787
5,0001ea8717f6de06,0.007525,0.001359,0.00375,0.002711,0.008888,0.001464
6,00024115d4cbde0f,0.002938,0.000468,0.002952,0.000343,0.002466,0.001435
7,000247e83dcc1211,0.618946,0.004098,0.04289,0.001446,0.082134,0.00466
8,00025358d4737918,0.004713,0.000759,0.005115,0.00058,0.004136,0.001467
9,00026d1092fe71cc,0.001845,0.00042,0.002951,0.000453,0.003876,0.000672


In [36]:
# Sanity check: one row per test_subset row, and an id column plus one column per label.
submission.shape

(153120, 7)

In [None]:
# Write the predictions to disk; index=False leaves the DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)