In [1]:
import numpy as np
import pandas as pd
import re

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
test.sample(10)

Unnamed: 0,id,comment_text
5091,0898121f65ea5155,""" \n\n == Answering your question and giving m..."
79871,854bd35431ac556b,""" \n :::I did not even try to support a Fursov..."
30240,32524f26169f5b2b,Oppose The folly which South Koreans performed...
52180,56999cb14982edb2,":Thanks for your edits, however I note you rem..."
121835,cb7e8fef78d08dc7,== Our Lady of Vilnius Church (New York City) ...
85277,8e40c565c9b78b54,* You're probably thinking of shrunken heads.
15590,1a36fa3289a1375f,**Which happened before you reverted? I made a...
73654,7ac0a0d53643335c,You're a little bitch. get raped.
137716,e630f9980916f118,بكره انا امسافر على المانيا بنفع تيجي معي ازا ...
65179,6c6d7f4f4cf4c9e2,""" 2013 (UTC) \n\n ===Southern Slave Codes=== \..."


### The most common word

In [4]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text], ignore_index=True)

In [5]:
def get_words(text):
    result = re.split(r'\s+|\.', text.lower())
    return result

In [6]:
total = list()
for line in all_text:
    total.extend(get_words(line))

In [7]:
count = Counter(total)
count.most_common(3)

[('', 1097088), ('the', 902873), ('to', 534076)]

Какое слово встречается чаще всего в объединенном train и test датасете? - **'the'** 

### Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

**C** – обратный коэффициент регуляризации (тот самый C в sklearn-реализации LogisticRegression)

Ответ: **Уменьшает**

### Crossvalidation

In [8]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [9]:
word_vectorizer = TfidfVectorizer() # TfidfVectorizer или CountVectorizer

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)

In [10]:
C = 3.2

classifier = LogisticRegression(C=C,random_state=7) 

scores= []

for class_name in class_names:
    
    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier, train_word_features, train_target, scoring='roc_auc'))

    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)
    
print('Total score is {}'.format(np.mean(scores)))



CV score for class toxic is 0.97193119880727
CV score for class severe_toxic is 0.9830390499432613
CV score for class obscene is 0.9840424143183912
CV score for class threat is 0.9859612844077553
CV score for class insult is 0.9760496985195051
CV score for class identity_hate is 0.9734729045817354
Total score is 0.979082758429653



```
C = 3.2
word_vectorizer = TfidfVectorizer()
Total score is 0.979082758429653
```

### Create submission for Kaggle

In [11]:
submission = pd.DataFrame.from_dict({'id': test['id']})

In [12]:
word_vectorizer = TfidfVectorizer()

word_vectorizer.fit(train_text)

train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

for class_name in class_names:  
    
    train_target = train[class_name]
    
    classifier = LogisticRegression(C=C,random_state=7)
      
    classifier.fit(train_word_features, train_target)
    
    submission[class_name] = classifier.predict_proba(test_word_features)[:, 1]

In [13]:
submission.to_csv('submission.csv', index=False)

In [15]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999802,0.204379,0.999182,0.086059,0.974859,0.356259
1,0000247867823ef7,0.002534,0.000968,0.001359,0.000166,0.00351,0.001906
2,00013b17ad220c46,0.027194,0.003909,0.012379,0.001115,0.015274,0.003759
3,00017563c3f7919a,0.001358,0.001461,0.001478,0.000507,0.002332,0.000345
4,00017695ad8997eb,0.016181,0.002532,0.005036,0.001332,0.006197,0.001797
