In [19]:
path_train='dataset/train.csv'
path_test='dataset/test.csv'

In [20]:
import pandas as pd

#import TfIdfVectorizer from scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [22]:
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)

In [23]:
vocab = pd.concat([train['comment_text'],  test['comment_text']])

In [24]:
clean_word_dict={'ａ': 'a',
 '！': ' !',
 '＋': ' +',
 '－': ' -',
 '．': ' .',
 '０': '0',
 '１': '1',
 '２': '2',
 '３': '3',
 '４': '4',
 '５': '5',
 '６': '6',
 '７': '7',
 '８': '8',
 '９': '9',
 '＝': ' =',
 '？': ' ?',
 'Ａ': 'a',
 'Ｂ': 'b',
 'Ｃ': 'c',
 'Ｄ': 'd',
 'Ｅ': 'e',
 'Ｆ': 'f',
 'Ｇ': 'g',
 'Ｈ': 'h',
 'Ｉ': 'i',
 'Ｊ': 'j',
 'Ｋ': 'k',
 'Ｌ': 'l',
 'Ｍ': 'm',
 'Ｎ': 'n',
 'Ｏ': 'o',
 'Ｐ': 'p',
 'Ｑ': 'q',
 'Ｒ': 'r',
 'Ｓ': 's',
 'Ｔ': 't',
 'Ｕ': 'u',
 'Ｖ': 'v',
 'Ｗ': 'w',
 'Ｘ': 'x',
 'Ｙ': 'y',
 'Ｚ': 'z',
 'ｂ': 'b',
 'ｃ': 'c',
 'ｄ': 'd',
 'ｅ': 'e',
 'ｆ': 'f',
 'ｇ': 'g',
 'ｈ': 'h',
 'ｉ': 'i',
 'ｊ': 'j',
 'ｋ': 'k',
 'ｌ': 'l',
 'ｍ': 'm',
 'ｎ': 'n',
 'ｏ': 'o',
 'ｐ': 'p',
 'ｑ': 'q',
 'ｒ': 'r',
 'ｓ': 's',
 'ｔ': 't',
 'ｕ': 'u',
 'ｖ': 'v',
 'ｗ': 'w',
 'ｘ': 'x',
 'ｙ': 'y',
 'ｚ': 'z'}

In [25]:
import re
import gc
import string
def clean_dataset(word):
    word = word.lower()
    word = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", word)
    word = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", word)
    for typo, correct in clean_word_dict.items():
        word = re.sub(typo, " " + correct + " ", word)
    symbols = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    word=symbols.sub(r' \1 ', word)
    return word

train_comments = []
test_comments = []
for comment in train['comment_text']:
    train_comments.append(clean_dataset(comment))
    
for comment in test['comment_text']:
    test_comments.append(clean_dataset(comment))

In [30]:
transform_function = TfidfVectorizer(
    sublinear_tf=1,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=10000)

transform_function.fit(vocab)



In [31]:
from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y = train_test_split(train['comment_text'],train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']], test_size=0.2, random_state=2)

In [32]:
comments_train = transform_function.transform(train_x)
comments_val = transform_function.transform(val_x)
comments_test = transform_function.transform(test['comment_text'])

In [33]:
col = ['total_length', 'capitals', 'caps_vs_length','num_exclamation_marks', 'num_question_marks', 'num_punctuation','num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique','num_smilies']

In [34]:
combined = [train_x, val_x, test]

for data in combined:
    data['total_length'] = data['comment_text'].apply(len)
    data['capitals'] = data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    data['num_exclamation_marks'] = data['comment_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['comment_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
    data['num_words'] = data['comment_text'].apply(lambda x: len(x.split()))
    data['num_unique_words'] = data['comment_text'].apply(lambda x: len(set(w for w in x.split())))
    data['words_vs_unique'] = data['num_unique_words'] / data['num_words']
    data['num_smilies'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in (':-)', ':)', ';-)', ';)')))

KeyError: 'comment_text'

In [17]:
import scipy
train_x = scipy.sparse.csr_matrix(train_x[col].values)
val_x = scipy.sparse.csr_matrix(val_x[col].values)
test = scipy.sparse.csr_matrix(test[col].values)

KeyError: "None of [Index(['total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks',\n       'num_question_marks', 'num_punctuation', 'num_symbols', 'num_words',\n       'num_unique_words', 'words_vs_unique', 'num_smilies'],\n      dtype='object')] are in the [index]"