In [1]:
import numpy as np
import pandas as pd
import re
import string

from scipy.sparse import csr_matrix, hstack
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
test.sample(10)

Unnamed: 0,id,comment_text
33506,37b14dbb8c577fab,== trust company ==
145260,f2da318cefce9ea3,== Iowa-class battleships == \n\n Stupid me......
102604,ab4ac4cea5898005,""" \n :::::But the mainstream churches and ideo..."
148663,f881deec4919954f,""" \n\n == Incubation cleanout == \n\n I was st..."
13446,16a3fccb8ef6ff62,:You've produced extremely poor sources and de...
108250,b4a9f37b9d6a3189,:::This user is a sock of . Generally disrupti...
32205,3588201bb031bb48,"Oh, and consorts for the Spanish Netherlands a..."
2931,05127023a71c4458,""" \n :We can describe the naming within the in..."
89691,958c27886d8a00dd,Season pages == \n\n Please remember that ther...
25119,29de040443eb6451,""" \n\n == Eagle Scout mention == \n I have inc..."


### The most common word

In [4]:
def get_words(text):
    
    result = re.findall(r"[a-z]+'?[a-z]+", text.lower())
    
    return result

In [5]:
train_text = train['comment_text'].apply(get_words)

test_text = test['comment_text'].apply(get_words)

all_text = pd.concat([train_text, test_text])

In [6]:
total = list()
for line in all_text:
    total.extend(line)

In [7]:
count = Counter(total)
count.most_common(3)

[('the', 919035), ('to', 539236), ('of', 410839)]

Какое слово встречается чаще всего в объединенном train и test датасете? - **'the'** 

### Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

**C** – обратный коэффициент регуляризации (тот самый C в sklearn-реализации LogisticRegression)

Ответ: **Уменьшает**

### Crossvalidation

In [6]:
train_text = train_text.apply(lambda text: ' '.join(text))

test_text = test_text.apply(lambda text: ' '.join(text))

all_text = pd.concat([train_text, test_text])

In [7]:
# TfidfVectorizer или CountVectorizer
word_vectorizer = TfidfVectorizer() 

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [8]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
class_c = [4, 2, 3, 4, 3, 3]

scores= []

for class_name, C in zip(class_names, class_c):
    
    classifier = LogisticRegression(C=C,random_state=7) 

    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier, train_word_features, train_target, scoring='roc_auc'))

    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))   

CV score for class toxic is 0.9724238498388088
CV score for class severe_toxic is 0.9833079735989355
CV score for class obscene is 0.9843418455895124
CV score for class threat is 0.98638587067384
CV score for class insult is 0.9763643664586121
CV score for class identity_hate is 0.973462305990869
Total score is 0.9793810353584296



Total score is  **0.9793810353584296** 


### Create submission for Kaggle

In [9]:
train['clean_comment'] = train_text
test['clean_comment'] = test_text

In [10]:
#add some features to train and test
train['count_sent'] = train["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
train['count_words'] = train["clean_comment"].apply(lambda x: len(str(x).split()))
train['count_unique_words'] = train["clean_comment"].apply(lambda x: len(set(str(x).split())))
train['count_letters'] = train["clean_comment"].apply(lambda x: len(str(x)))
train["count_punctuations"] = train["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
train["count_words_upper"] = train["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
train["count_words_title"] = train["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))


test['count_sent'] = test["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
test['count_words'] = test["clean_comment"].apply(lambda x: len(str(x).split()))
test['count_unique_words'] = test["clean_comment"].apply(lambda x: len(set(str(x).split())))
test['count_letters'] = test["clean_comment"].apply(lambda x: len(str(x)))
test["count_punctuations"] = test["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test["count_words_upper"] = test["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test["count_words_title"] = test["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

In [11]:
features_train = train[train.columns[-7:]]
features_test = test[test.columns[-7:]]

In [12]:
features_train = csr_matrix(features_train)
features_test = csr_matrix(features_test)

print('Features_train shape: {}'.format(features_train.shape))
print('Train word features shape: {}'.format(train_word_features.shape))

Features_train shape: (159571, 7)
Train word features shape: (159571, 286201)


In [13]:
x_train = hstack([train_word_features, features_train])

x_test = hstack([test_word_features, features_test])

In [14]:
submission = pd.DataFrame.from_dict({'id': test['id']})

In [15]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
class_c = [4, 2, 3, 4, 3, 3]

for class_name, C in zip(class_names, class_c):
    
    y_train = train[class_name]
    
    classifier = LogisticRegression(C=C,random_state=7)
      
    classifier.fit(x_train, y_train)
    
    submission[class_name] = classifier.predict_proba(x_test)[:, 1]

In [60]:
# submission.to_csv('submission_1.csv', index=False)

In [17]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.069878,0.000326,0.035369,1.42325e-09,0.030425,0.000717
1,0000247867823ef7,0.22561,0.045461,0.097415,0.02221269,0.094416,0.032334
2,00013b17ad220c46,0.204816,0.066885,0.108946,0.04843859,0.1072,0.022976
3,00017563c3f7919a,0.160552,0.009108,0.065,0.0001007349,0.061581,0.014589
4,00017695ad8997eb,0.248664,0.062445,0.104271,0.08405659,0.102166,0.083637


In [23]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
C_list = [0.01, 1, 3, 5, 10, 20, 50, 100, 10000]

for c in C_list:

    scores= []
    
    print('Current C: ', c)
    
    for class_name in class_names:

        classifier = LogisticRegression(C=c,random_state=7) 

        y_train = train[class_name]

        cv_score = np.mean(cross_val_score(classifier, x_train, y_train, scoring='roc_auc'))

        print('CV score for class {} is {}'.format(class_name, cv_score))
        scores.append(cv_score)

    print('Total score is {}'.format(np.mean(scores)))
    print('*' * 20)

Current C:  0.01
CV score for class toxic is 0.8144704594381428
CV score for class severe_toxic is 0.7999057832436143
CV score for class obscene is 0.7479850343725346
CV score for class threat is 0.6168210961621791
CV score for class insult is 0.8041743293710826
CV score for class identity_hate is 0.6320139195300842
Total score is 0.7358951036862728
********************
Current C:  1
CV score for class toxic is 0.9575528140589791
CV score for class severe_toxic is 0.7455828483374138
CV score for class obscene is 0.6817178634041001
CV score for class threat is 0.6100280086907358
CV score for class insult is 0.7961432283919191
CV score for class identity_hate is 0.6314722151285254
Total score is 0.7370828296686122
********************
Current C:  3
CV score for class toxic is 0.9671816206078242
CV score for class severe_toxic is 0.7455679724075321
CV score for class obscene is 0.6897277172860922
CV score for class threat is 0.6091270093464576
CV score for class insult is 0.68992334205711