In [57]:
import numpy as np
import pandas as pd
import re

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [58]:
# Read the three CSV inputs from the current working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [59]:
# Fixed seed so this preview shows the same ten rows on every run.
test.sample(10, random_state=7)

Unnamed: 0,id,comment_text
121392,cac587e90078d15f,"by an anonymous coward, so delete me/this if u..."
96173,a06bb0e7c790f882,""" \n\n ==Conlang WikiProject== \n\n Hello! \n\..."
24887,297cf3389699a867,::I have checked the other sources used for th...
8955,0efa9069f6464add,"""::Antandrus, look at the source of the proble..."
9942,109a7d6a95a67806,i know him personally it tru you piece of shit...
104940,af2f2f32e14adfa9,""" \n\n == I think a notable vid is == \n\n The..."
85573,8ec1ed0d7de31f6e,:::That is not what a peer review is; the whol...
6286,0a8d7b313c242949,""" \n\n Yes I agree Diivoo. Because as the week..."
100062,a700172816c83f54,== November 2008 == \n Please stop your disru...
100599,a7df58071b97598b,""" \n\n Thanks. I've also read it earlier that ..."


### The most common word

In [60]:
def get_words(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first. A token is a run of ASCII letters that may
    contain one apostrophe with letters on both sides (e.g. "don't").
    The pattern needs at least two letters, so one-letter words are skipped.

    Returns a list of tokens in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z]+'?[a-z]+", lowered)]

In [61]:
# Tokenize every comment; each element becomes a list of word tokens.
train_text, test_text = (
    frame['comment_text'].apply(get_words) for frame in (train, test)
)

# Train rows first, then test rows, in one Series.
all_text = pd.concat([train_text, test_text])

In [62]:
# Flatten the per-comment token lists into one long list of words.
total = [word for words in all_text for word in words]

In [63]:
# Tally every token, then display the three most frequent ones.
count = Counter()
count.update(total)
count.most_common(3)

[('the', 919035), ('to', 539236), ('of', 410839)]

Какое слово встречается чаще всего в объединенном train и test датасете? - **'the'** 

### Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

**C** – обратный коэффициент регуляризации (тот самый C в sklearn-реализации LogisticRegression)

Ответ: **Уменьшает**

### Crossvalidation

In [64]:
# Target columns read from `train`; one binary model is fitted per label.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [65]:
def _join_tokens(tokens):
    """Return the tokens as one space-separated string.

    A value that is already a string is returned unchanged. Without this
    guard, re-running the cell would call ' '.join on a string and insert a
    space between every character, silently corrupting the text.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)


# Convert each token list back into a single string for the vectorizer.
train_text = train_text.apply(_join_tokens)

test_text = test_text.apply(_join_tokens)

all_text = pd.concat([train_text, test_text])

In [66]:
# Either TfidfVectorizer or CountVectorizer can be plugged in here.
# The vocabulary is fitted on the combined train + test text.
word_vectorizer = TfidfVectorizer().fit(all_text)

train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [67]:
# Inverse regularization strength; also reused by the final fit cell.
C = 3.2

classifier = LogisticRegression(C=C, random_state=7)

# Mean cross-validated ROC AUC per label, averaged across labels at the end.
scores = []
for class_name in class_names:
    train_target = train[class_name]

    fold_scores = cross_val_score(
        classifier,
        train_word_features,
        train_target,
        scoring='roc_auc',
    )
    cv_score = fold_scores.mean()

    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))



CV score for class toxic is 0.9723319230256943
CV score for class severe_toxic is 0.98297538728988
CV score for class obscene is 0.9843529356988294
CV score for class threat is 0.9862611762434984
CV score for class insult is 0.9763498224793973
CV score for class identity_hate is 0.9734333948249292
Total score is 0.9792841065937047



```
C = 3.2
word_vectorizer = TfidfVectorizer()
Total score is 0.979082758429653
```

### Create submission for Kaggle

In [68]:
# Start the submission frame with the test ids; label columns are added later.
submission = pd.DataFrame({'id': test['id']})

In [69]:
# Fit one independent binary model per label on the full training set and
# store the second predict_proba column (presumably the positive class,
# assuming 0/1 labels - confirm against the data).
for class_name in class_names:
    train_target = train[class_name]

    classifier = LogisticRegression(C=C, random_state=7).fit(
        train_word_features, train_target
    )

    class_proba = classifier.predict_proba(test_word_features)
    submission[class_name] = class_proba[:, 1]

In [71]:
# Write the predictions without the DataFrame index column.
submission.to_csv(path_or_buf='submission_new.csv', index=False)

In [70]:
# Show the first five rows of the submission frame.
submission.head(n=5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999519,0.183758,0.998195,0.072261,0.962329,0.301808
1,0000247867823ef7,0.002827,0.000982,0.001422,0.000164,0.003538,0.002218
2,00013b17ad220c46,0.030333,0.00432,0.012821,0.00117,0.015787,0.003999
3,00017563c3f7919a,0.001273,0.001621,0.0016,0.000495,0.002601,0.000351
4,00017695ad8997eb,0.01639,0.002586,0.005237,0.00133,0.006241,0.001918
