### General information

In this kernel I'll analyse data from Malicious Intent Detection Challenge.

We need to identify injections among neutral input vectors using machine learning.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
%matplotlib inline
from scipy.sparse import csr_matrix
import scipy as sp

import eli5
from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)
import msgpack
from sklearn.decomposition import  PCA,TruncatedSVD
import re
from sklearn.cluster import MiniBatchKMeans, DBSCAN, KMeans
from collections import Counter

### Loading data

In [3]:
info = pd.read_csv('data/train_info.csv')
with open('data/train_msgpack.msgpack', 'rb') as data_file:
    train = msgpack.unpack(data_file)
with open('data/test_msgpack.msgpack', 'rb') as data_file:
    test = msgpack.unpack(data_file)

train = pd.DataFrame(train)
test = pd.DataFrame(test)
train.columns = ['id', 'text']
test.columns = ['id', 'text']

train['text'] = train['text'].astype(str)
test['text'] = test['text'].astype(str)

tst = train['text'].copy()

train['len'] = train.text.apply(lambda x: len(x))
test['len'] = test.text.apply(lambda x: len(x))

train['new_text'] = train.text.str.replace(r'[^a-zA-Z ]', ' ').str.split()
test['new_text'] = test.text.str.replace(r'[^a-zA-Z ]', ' ').str.split()

train['len_new_text'] = train['new_text'].apply(lambda x: len(x))
test['len_new_text'] = test['new_text'].apply(lambda x: len(x))

train['new_text']  = train['new_text'].apply(lambda x: ' '.join([i for i in x if len(i)>2]))
test['new_text']  = test['new_text'].apply(lambda x: ' '.join([i for i in x if len(i)>2]))
# test features
train['num_low'] = train['text'].str.replace(r'[^a-z]', '').str.len()
test['num_low'] = test['text'].str.replace(r'[^a-z]', '').str.len()

train['num_up'] = train['text'].str.replace(r'[^A-Z]', '').str.len()
test['num_up'] = test['text'].str.replace(r'[^A-Z]', '').str.len()

train['num_letters'] = train['num_up']+train['num_low']
test['num_letters'] = test['num_up']+test['num_low']

train['numbers'] = train['text'].str.replace(r'[^\d ]', ' ')
test['numbers'] = test['text'].str.replace(r'[^\d ]', '')

train['numbers_list'] = train['numbers'].str.split()
test['numbers_list'] = test['numbers'].str.split()

train['numbers'] = train['numbers'].apply(lambda x: len(x))
test['numbers'] = test['numbers'].apply(lambda x: len(x))

train['numbers_1'] = train['numbers_list'].apply(lambda x: len([i for i in x if len(i)==1]))
test['numbers_1'] = test['numbers_list'].apply(lambda x: len([i for i in x if len(i)==1]))

train['numbers_2'] = train['numbers_list'].apply(lambda x: len([i for i in x if len(i)==2]))
test['numbers_2'] = test['numbers_list'].apply(lambda x: len([i for i in x if len(i)==2]))

train['numbers_3'] = train['numbers_list'].apply(lambda x: len([i for i in x if len(i)==3]))
test['numbers_3'] = test['numbers_list'].apply(lambda x: len([i for i in x if len(i)==3]))

train['numbers_more'] = train['numbers_list'].apply(lambda x: len([i for i in x if len(i)>3]))
test['numbers_more'] = test['numbers_list'].apply(lambda x: len([i for i in x if len(i)>3]))

all_chars = set()
for text in train.text:
    all_chars = all_chars | set(text)
    
for char in all_chars:
    train['num_'+char] = train.text.apply(lambda x: x.count(char))
    test['num_'+char] = test.text.apply(lambda x: x.count(char))
    
train['text'] = train['text'].str.replace(r'\d{4,}', 'ࠉ').str.replace(r'\d{3,}', 'ࠈ').str.replace(r'\d{2,}', 'ࠇ').str.replace(r'\d{1,}', 'ࠔ')
test['text'] = test['text'].str.replace(r'\d{4,}', 'ࠉ').str.replace(r'\d{3,}', 'ࠈ').str.replace(r'\d{2,}', 'ࠇ').str.replace(r'\d{1,}', 'ࠔ')

info = pd.merge(train, info, on='id')
y = np.array([1 if i == True else 0 for i in info.injection.values])

In [4]:
%%time
vectorizer = TfidfVectorizer(use_idf=True,
    ngram_range=(1, 2), analyzer='char', norm=None,  min_df = 0.001,max_df = 0.7)

full_text = list(train['text'].values) + list(test['text'].values)
vectorizer.fit(full_text)
train_vectorized1 = vectorizer.transform(train['text'])
test_vectorized1 = vectorizer.transform(test['text'])

CPU times: user 29.2 s, sys: 684 ms, total: 29.9 s
Wall time: 29.9 s


In [5]:
%%time
vectorizer = TfidfVectorizer(use_idf=True,
    ngram_range=(1, 1), analyzer='word', norm=None,  min_df = 0.004,max_df = 0.7)

full_text = list(train['new_text'].values) + list(test['new_text'].values)
vectorizer.fit(full_text)
train_vectorized_text = vectorizer.transform(train['new_text'])
test_vectorized_text = vectorizer.transform(test['new_text'])

CPU times: user 4.32 s, sys: 44.1 ms, total: 4.37 s
Wall time: 4.4 s


In [6]:
columns_to_add = ['len','len_new_text']+['num_'+i for i in list(all_chars) if i not in [str(j) for j in range(10)]]\
+ ['num_low','num_up','numbers_1','numbers_2','numbers_3','numbers_more','numbers']

new_train = train[columns_to_add]
new_test = test[columns_to_add]

new_train = sp.sparse.hstack((new_train,train_vectorized1), format = 'csr')
new_test = sp.sparse.hstack((new_test,test_vectorized1), format = 'csr')

new_train = sp.sparse.hstack((new_train,train_vectorized_text), format = 'csr')
new_test = sp.sparse.hstack((new_test,test_vectorized_text), format = 'csr')

In [24]:
%%time
lgb = LGBMClassifier(boosting = 'dart',#goss
                     n_estimators = 300,
                     learning_rate=0.25,
#                      n_estimators = 700,
#                      learning_rate=0.03,
                     random_state = 2,
                     feature_fraction = 0.85,
                     num_leaves = 100,
#                      max_bin=200,
                     min_data_in_leaf = 90,
                     
                    )
scores = cross_val_score(lgb, new_train, y, scoring='roc_auc', cv=5)
print('Cross-validation mean auc {0:.4f}%, std {1:.4f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean auc 99.9865%, std 0.0036.
CPU times: user 21min, sys: 3.11 s, total: 21min 4s
Wall time: 2min 44s


In [None]:
lgb.fit(new_train, y)

In [None]:
sub = pd.read_csv('data/sample_submission.csv')
pred = lgb.predict_proba(new_test)
sub['injection'] = pred
sub.head()
sub.to_csv('sub.csv', index=False)