In [1]:
import numpy as np
import pandas as pd
import re

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
# Read the three competition CSV files from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Peek at ten random test rows; the fixed seed keeps the displayed sample
# identical across kernel restarts.
test.sample(10, random_state=7)

Unnamed: 0,id,comment_text
78457,82eae306e6e33260,""" \n\n Please do not vandalize pages, as you ..."
12906,15b7cc0cd123de35,"Friends, \n\n Becaue this page is not too long..."
145042,f280e5bba20979ea,==Go fuck yourself i will vandalize any page i...
77814,81ca8828adbf01e5,""" \n\n == Evil! == \n \n Here is an 6/6/6, aka..."
12534,150ee6ce84b3fb2c,""" \n\n Well, most of the site for """"Henkäys Ik..."
67517,706b199419e2864b,""" \n\n \n\n ==You suck so bad im have a Saquo..."
118190,c54af7576338483c,then hitler at a big bag of steming dog shit
144242,f107655d8cc5acf0,""" \n\n == vandalism == \n\n some idiot changed..."
37830,3ec6fc209d85a5f6,==Astrology== \n\n There's a discussion on Tal...
99876,a6ae2f71402b762c,sam fay is gay


### The most common word

In [4]:
def get_words(text):
    """Split a comment into lowercase word tokens.

    A token is a run of ASCII letters, optionally followed by an apostrophe
    and another run of letters (e.g. ``don't``). One-letter words such as
    ``a`` and ``i`` are kept.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (e.g. a NaN from a missing
        CSV field) yields an empty list.

    Returns
    -------
    list of str
        Tokens in order of appearance, all lowercase.
    """
    if not isinstance(text, str):
        return []

    # The apostrophe part is optional as a whole group, so a single letter
    # is still a valid token (the previous pattern required two letters).
    return re.findall(r"[a-z]+(?:'[a-z]+)?", text.lower())

In [5]:
# Tokenise the comment column of each split with get_words, then stack the
# two resulting Series (train first, test second).
train_text, test_text = (
    frame['comment_text'].apply(get_words) for frame in (train, test)
)
all_text = pd.concat([train_text, test_text])

In [6]:
# Flatten the per-comment token lists into a single list of tokens.
total = [word for line in all_text for word in line]

In [7]:
# Tally how often each token occurs, then show the three most frequent.
count = Counter()
count.update(total)
count.most_common(n=3)

[('the', 919035), ('to', 539236), ('of', 410839)]

Какое слово встречается чаще всего в объединенном train и test датасете? - **'the'** 

### Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

Увеличение параметра C в Logistic regression увеличивает или уменьшает степень регуляризации?

**C** – обратный коэффициент регуляризации (тот самый C в sklearn-реализации LogisticRegression)

Ответ: **Уменьшает**

### Crossvalidation

In [8]:
# Names of the label columns read from `train`.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [9]:
def _join_tokens(tokens):
    """Join a token list with single spaces; return an existing string as is."""
    # Guard for re-running this cell: on a second run the values are already
    # strings, and ' '.join on a string would split it into single characters.
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)


# Turn each comment's token list back into one space-separated string.
train_text = train_text.apply(_join_tokens)

test_text = test_text.apply(_join_tokens)

all_text = pd.concat([train_text, test_text])

In [10]:
# Fit the vectorizer on the training comments only and encode them in the
# same step; the test comments are then encoded with that fitted vectorizer.
# TfidfVectorizer or CountVectorizer can be used here.
word_vectorizer = TfidfVectorizer()
train_word_features = word_vectorizer.fit_transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

6820411

In [11]:
# C is passed to LogisticRegression here and again when the final models are fit.
C = 3.0

classifier = LogisticRegression(random_state=7, C=C)

# Cross-validate the same estimator on each label column separately,
# scoring by ROC AUC, and report the mean score per label.
scores = []
for class_name in class_names:
    train_target = train[class_name]
    fold_scores = cross_val_score(
        classifier,
        train_word_features,
        train_target,
        scoring='roc_auc',
    )
    cv_score = np.mean(fold_scores)
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

# Overall figure: unweighted mean of the per-label scores.
print('Total score is {}'.format(np.mean(scores)))



CV score for class toxic is 0.9727163876706908
CV score for class severe_toxic is 0.9833709075076121
CV score for class obscene is 0.9848281270874818
CV score for class threat is 0.9864792424475665
CV score for class insult is 0.9768823487108156
CV score for class identity_hate is 0.9739996482178935
Total score is 0.9797127769403434



Total score is **0.9797127769403434**


### Create submission for Kaggle

In [12]:
# Start the submission frame with only the test ids.
submission = pd.DataFrame({'id': test['id']})

In [13]:
# Fit a fresh logistic regression per label on the full training matrix and
# store its test-set probabilities as a column named after the label.
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=C, random_state=7).fit(
        train_word_features, train_target
    )
    # Column 1 of predict_proba is the second entry of classifier.classes_
    # (presumably label 1 -- confirm the label columns are 0/1).
    class_proba = classifier.predict_proba(test_word_features)[:, 1]
    submission[class_name] = class_proba

In [14]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [15]:
# Show the first five rows of the submission.
submission.head(n=5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999723,0.211471,0.998971,0.083908,0.973494,0.345488
1,0000247867823ef7,0.002833,0.000989,0.001455,0.000175,0.003588,0.002186
2,00013b17ad220c46,0.027909,0.003942,0.012249,0.001118,0.015295,0.003797
3,00017563c3f7919a,0.001302,0.001534,0.001606,0.000513,0.002557,0.000362
4,00017695ad8997eb,0.017167,0.002563,0.00525,0.001323,0.006289,0.001948
