In [1]:
%matplotlib inline
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from contextlib import contextmanager
from functools import lru_cache
# Limit OpenMP-backed libraries to 4 threads.
# NOTE(review): this is set after numpy/pandas were imported above; libraries
# that read OMP_NUM_THREADS at import time may not pick it up — confirm.
os.environ['OMP_NUM_THREADS'] = '4'

@contextmanager
def timer(name):
    """Context manager that reports how long the ``with`` body took.

    When the body finishes normally, prints ``[name] done in X.X s``
    (wall-clock seconds) and then runs a garbage-collection pass.
    """
    started = time.time()
    yield
    elapsed = time.time() - started
    print(f'[{name}] done in {elapsed:.1f} s')
    gc.collect()
    
def reduce_memory(df):
    """Downcast numeric columns of ``df`` in place and return the same frame.

    Integer columns (any NumPy signed or unsigned integer dtype) are narrowed
    to the smallest of int8/int16/int32 (when the column holds a negative
    value) or uint8/uint16/uint32 (otherwise) that can represent every value.
    Columns that need 64 bits are left unchanged. ``float64`` columns become
    ``float32``. Columns of any other dtype are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    signed = ('int8', 'int16', 'int32')
    unsigned = ('uint8', 'uint16', 'uint32')
    for c in df.columns:
        col = df[c]
        dtype = col.dtype
        # Match on the dtype kind instead of `dtype == 'int'`: that comparison
        # only matches the platform default integer, so int32 columns (and
        # int64 columns on platforms whose default int is 32-bit) were skipped.
        if isinstance(dtype, np.dtype) and dtype.kind in 'iu':
            if len(col) == 0:
                continue  # min()/max() are not meaningful on an empty column
            lo, hi = col.min(), col.max()
            for name in (signed if lo < 0 else unsigned):
                bounds = np.iinfo(name)
                # Compare against the real dtype bounds rather than
                # abs().max(), which overflows on a dtype's most negative
                # value and needlessly pushed -128 / -32768 one size up.
                if bounds.min <= lo and hi <= bounds.max:
                    # Only ever narrow; never widen or merely flip signedness.
                    if np.dtype(name).itemsize < dtype.itemsize:
                        df[c] = col.astype(name)
                    break
        elif dtype == 'float64':
            df[c] = col.astype('float32')
    return df
# Directory containing the input CSV files (train.csv / test.csv are read from it below).
DATA_DIR = '../input/avito-demand-prediction/'
# Name of the label column in train.csv.
target_col = 'deal_probability'
# List the files available in the data directory.
os.listdir(DATA_DIR)

In [2]:
# Only the categorical "context" columns and the two free-text columns are
# loaded; the label column is added for the training file.
usecols = [
    'region', 'city', 'parent_category_name', 'category_name',
    'param_1', 'param_2', 'param_3',
    'title', 'description',
]
train = pd.read_csv(DATA_DIR + 'train.csv', usecols=[*usecols, target_col])
test = pd.read_csv(DATA_DIR + 'test.csv', usecols=usecols)

In [3]:
# Split the label off the training frame, then stack train on top of test so
# that every text feature below is computed once over both.
y = train.pop(target_col).values
gc.collect()
train_num = len(train)
df = pd.concat([train, test], ignore_index=True)
del train, test; gc.collect()

In [4]:
# Count missing titles and missing descriptions in the combined frame.
df['title'].isnull().sum(), df['description'].isnull().sum()

In [5]:
# Replace missing descriptions with a placeholder token. The result is assigned
# back instead of calling fillna(inplace=True) on the selected column: that
# form operates on an intermediate object, triggers a warning in pandas 2.x and
# does not update `df` at all under Copy-on-Write.
df['description'] = df['description'].fillna('unknown')

In [6]:
# df = df.head(1000) #for debug

In [7]:
# Build one lower-cased, space-separated "context" string per row from the
# categorical columns; a missing value contributes an empty string.
context_cols = ['region', 'city', 'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3']
context = pd.Series('', index=df.index)
for pos, name in tqdm(enumerate(context_cols), total=len(context_cols)):
    separator = ' ' if pos > 0 else ''
    context = context + separator + df[name].fillna('').str.lower()
df['context'] = context

In [8]:
%%time
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
# Russian stop words. Kept as a list because the same object is passed to the
# scikit-learn vectorizers further down; a frozenset copy gives O(1) lookups
# inside proc_text (the list was scanned linearly for every token).
STOPWORDS = stopwords.words('russian')
_STOPWORD_SET = frozenset(STOPWORDS)
# Translation table mapping every punctuation character except '-' to a space.
puncs = str(string.punctuation).replace('-', '')
translator = str.maketrans(puncs, len(puncs)*' ')
def proc_text(s):
    """Normalise a text string for the downstream text features.

    Punctuation (except '-') is replaced by spaces, the text is lower-cased,
    split on any whitespace, stop words are dropped and the remaining tokens
    are joined with single spaces.

    Splitting on any whitespace (``split()``) rather than on a single space
    matters for multi-line text: with ``split(' ')`` tokens separated by
    newlines or tabs stayed glued together, so stop words next to them were
    never removed.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned, space-joined text (possibly empty).
    """
    tokens = s.translate(translator).lower().split()
    return ' '.join(w for w in tokens if w not in _STOPWORD_SET)
df['context'] = df['context'].apply(proc_text)
# Working copy of the description; the raw column is left untouched.
df['desc'] = df['description'].copy()

In [9]:
# Join title and description into one field and keep only the text columns.
combined = df['title'] + ' ' + df['desc']
df = df.assign(text=combined)[['context', 'text', 'title', 'desc']]
gc.collect();
df.head()

## Reference
- https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

## 1. Basic Feature Extraction
### 1.1 Number of Words

In [10]:
%%time
fprefix = 'nWord_'
# Count whitespace-delimited words with split() rather than split(' '): the
# latter counts an empty piece for every extra space and treats words separated
# by a newline or tab as a single word.
# Note: a whitespace-only string now yields 0 (it used to yield at least 1),
# so ratios that divide by these counts can produce inf/NaN for such rows.
func = lambda x: len(x.split())
for col in ['title', 'desc']:
    df = df.assign(**{
        fprefix+col: df[col].apply(func)
    })
# The combined count is the sum of the per-field counts.
new_col = df[fprefix+'title']+df[fprefix+'desc']
df = df.assign(**{
    fprefix+'text': new_col
})

In [11]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.2 Number of characters

In [12]:
%%time
fprefix = 'nChar_'
# Character length of each field; the combined value is the sum of the two
# (the space that joins title and description is not counted).
per_field = {fprefix+col: df[col].str.len() for col in ['title', 'desc']}
df = df.assign(**per_field)
df = df.assign(**{fprefix+'text': df[fprefix+'title'] + df[fprefix+'desc']})

In [13]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.3 Average Word Length

In [14]:
%%time
fprefix = 'AvgWordLen_'

def avg_word_len(sentence):
    """Return the mean length of the whitespace-separated words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to measure.

    Returns
    -------
    float
        Mean word length, or 0.0 when ``sentence`` contains no words (empty or
        whitespace-only), which previously raised ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)
func = avg_word_len
for col in ['title', 'desc']:
    df = df.assign(**{
        fprefix+col: df[col].apply(func)
    })
# Compute the combined value directly from the joined string, so the result
# does not depend on how the nWord_* columns tokenise: their counts need not
# equal the split() word counts used by avg_word_len, which made the former
# nWord_-weighted mean inexact for text with repeated spaces or newlines.
new_col = (df['title'] + ' ' + df['desc']).apply(func)
df = df.assign(**{
    fprefix+'text': new_col
})

In [15]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.4 Number of stopwords

In [16]:
%%time
fprefix = 'nStop_'
# Build the lookup set once; the previous lambda rebuilt set(STOPWORDS) for
# every single word of every row.
stop_set = set(STOPWORDS)
# Tokens are compared as-is (no lower-casing or punctuation stripping), exactly
# as before.
func = lambda x: sum(1 for w in x.split() if w in stop_set)
for col in ['title', 'desc']:
    df = df.assign(**{
        fprefix+col: df[col].apply(func)
    })
# The combined count is the sum of the per-field counts.
new_col = df[fprefix+'title'] + df[fprefix+'desc']
df = df.assign(**{
    fprefix+'text': new_col
})

In [17]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.5 Number of special characters

In [18]:
%%time
fprefix = 'nSpec_'
# Number of characters in each field that belong to string.punctuation.
func = lambda s: sum(1 for ch in str(s) if ch in string.punctuation)
per_field = {fprefix+col: df[col].apply(func) for col in ['title', 'desc']}
df = df.assign(**per_field)
df = df.assign(**{fprefix+'text': df[fprefix+'title'] + df[fprefix+'desc']})

In [19]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.6 Number of numerics

In [20]:
%%time
fprefix = 'nNum_'
# Number of ASCII digit characters in each field (individual digits, not
# whole numbers).
digit_re = re.compile(r'[0-9]')
func = lambda s: len(digit_re.findall(s))
per_field = {fprefix+col: df[col].apply(func) for col in ['title', 'desc']}
df = df.assign(**per_field)
df = df.assign(**{fprefix+'text': df[fprefix+'title'] + df[fprefix+'desc']})

In [21]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.7 Number of Uppercase characters

In [22]:
%%time
fprefix = 'nUpper_'
# Count upper-case letters with explicit ranges: Latin A-Z, Cyrillic
# U+0410-U+042F and U+0401 (the letter Yo, which lies outside that range).
# A single "A-Я" range is fragile: starting from Latin 'A' it also matches
# every code point up to 'Я' (including a-z and many symbols), and starting
# from Cyrillic 'А' it misses Latin capitals and 'Ё'.
upper_re = re.compile(r'[A-Z\u0410-\u042F\u0401]')
func = lambda s: len(upper_re.findall(s))
for col in ['title', 'desc']:
    df = df.assign(**{
        fprefix+col: df[col].apply(func)
    })
# The combined count is the sum of the per-field counts.
new_col = df[fprefix+'title'] + df[fprefix+'desc']
df = df.assign(**{
    fprefix+'text': new_col
})

In [23]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

### 1.8 Number of words in Context (Keywords)

In [24]:
%%time
fprefix = 'nKey_'
def num_keywords(x, col, candidate_col='context'):
    """Count tokens of ``x[col]`` that also appear in ``x[candidate_col]``.

    ``x[col]`` is first cleaned with ``proc_text``; every resulting token that
    is present among the whitespace-separated tokens of ``x[candidate_col]``
    is counted (repeated tokens count each time).

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide string values under ``col`` and ``candidate_col``.
    col : str
        Key of the text to scan.
    candidate_col : str, default 'context'
        Key of the text whose tokens form the keyword set.

    Returns
    -------
    int
        Number of matching tokens.
    """
    # Build the keyword set once per row; it used to be rebuilt for every token.
    keywords = set(x[candidate_col].split())
    return sum(1 for w in proc_text(x[col]).split() if w in keywords)
for col in ['title', 'desc']:
    df = df.assign(**{
        fprefix+col: df.apply(lambda x: num_keywords(x, col), axis=1)
    })
# The combined count is the sum of the per-field counts.
new_col = df[fprefix+'title'] + df[fprefix+'desc']
df = df.assign(**{
    fprefix+'text': new_col
})

In [25]:
# One panel per field showing the distribution of the columns named by the
# current `fprefix`.
fig, axes = plt.subplots(1, 3, figsize=[12, 4])
for ax, col in zip(axes, ['text', 'title', 'desc']):
    sns.distplot(df[fprefix+col], ax=ax)
    ax.legend([fprefix+col])
    ax.grid()

## 2. Basic Pre-processing
### 2.1 Lowercasing & Removing Punctuation and Stop Words

In [26]:
%%time
# Replace title and desc with their proc_text-cleaned versions, then rebuild
# the combined text field from the cleaned columns.
cleaned = {col: df[col].apply(proc_text) for col in ['title', 'desc']}
df = df.assign(**cleaned)
df = df.assign(text=df['title'] + ' ' + df['desc'])

### 2.2 Common word removal

In [27]:
# The top_k most frequent tokens over the whole combined text column.
top_k = 10
all_tokens = ' '.join(df['text']).split()
freq = pd.Series(all_tokens).value_counts().head(top_k)
freq

In [28]:
# %%time
# freq = set(list(freq.index))
# func = lambda x: " ".join(x for x in x.split() if x not in freq)
# for col in ['title', 'desc']:
#     df = df.assign(**{
#         col: df[col].apply(func)
#     })
# new_col = df['title'] + ' ' + df['desc']
# df = df.assign(**{
#     'text': new_col
# })

### 2.3 Rare words removal

In [29]:
# freq = pd.Series(' '.join(df['text']).split()).value_counts()[-top_k:]
# freq

In [30]:
# %%time
# freq = set(list(freq.index))
# func = lambda x: " ".join(x for x in x.split() if x not in freq)
# for col in ['title', 'desc']:
#     df = df.assign(**{
#         col: df[col].apply(func)
#     })
# new_col = df['title'] + ' ' + df['desc']
# df = df.assign(**{
#     'text': new_col
# })

### 2.4 Spelling correction (TIME CONSUMING!)

In [31]:
# %%time
# from textblob import TextBlob
# func = lambda x: str(TextBlob(x).correct())
# for col in ['title', 'desc']:
#     df = df.assign(**{
#         col: df[col].apply(func)
#     })
# new_col = df['title'] + ' ' + df['desc']
# df = df.assign(**{
#     'text': new_col
# })

### 2.5 Lemmatization

In [32]:
%%time
# from textblob import Word
# func = lambda x: " ".join([Word(word).lemmatize() for word in x.split()]) #slow
from nltk.stem import WordNetLemmatizer
# NOTE(review): WordNetLemmatizer looks words up in WordNet, while the stop-word
# list used in this notebook is Russian; it is unclear how many tokens this
# step actually changes — confirm it is doing useful work on this data.
wnl = WordNetLemmatizer()
# Lemmatize every whitespace-separated token and re-join with single spaces.
func = lambda x: " ".join([wnl.lemmatize(word) for word in x.split()])
for col in ['title', 'desc']:
    df = df.assign(**{
        col: df[col].apply(func)
    })
# Rebuild the combined field from the lemmatized title and description.
new_col = df['title'] + ' ' + df['desc']
df = df.assign(**{
    'text': new_col
})

## 3. Advanced Text Processing
### 3.1 N-grams

In [33]:
%%time
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text \
    import CountVectorizer, TfidfVectorizer
# Settings for the TF-IDF block of the feature union below.
tfidf_params = {
    'stop_words': STOPWORDS,
    'analyzer': 'word',
    'token_pattern': r'\w{1,}',
    'sublinear_tf': True,
    'dtype': np.float32,
    'norm': 'l2',
    'smooth_idf':False
}
# Two blocks stacked side by side: TF-IDF uni/bi-grams of the combined text
# (capped at 15000 features) and raw uni/bi-gram counts of the title alone.
# Each preprocessor picks its field out of the per-row record dict.
vectorizer = FeatureUnion([
        ('title+desc', TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=15000,
            **tfidf_params,
            preprocessor=lambda x: x['text'])), # combined title + description
        ('title', CountVectorizer(
            ngram_range=(1, 2),
            stop_words = STOPWORDS,
            preprocessor=lambda x: x['title']))
])
# Materialise the row records once, and only with the two columns the
# preprocessors read. Previously every column of `df` was serialised to dicts
# twice (once for fit, once for transform).
records = df[['text', 'title']].to_dict('records')
with timer('vectorizer fitting'):
    vectorizer.fit(records)
with timer('vectorizer transforming'):
    text_vector = vectorizer.transform(records)
del records

In [34]:
from scipy import sparse
# Row-wise sum (axis 1) over all entries of the vectorizer output.
df['vecSum'] = np.array(text_vector.sum(1))
# Row-wise count of entries that are greater than zero.
df['vecNPosEle'] = (text_vector>0).sum(1)

In [35]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# Fold assignments and labels come from a separate prepared file. This rebinds
# `train_num` and `y`, which were first set from train.csv earlier on.
# NOTE(review): presumably eval_set == 10 marks the non-training rows and all
# training rows come first in the file — confirm against the producer notebook.
dl_feat_idx = pd.read_csv(
    '../input/adp-prepare-kfold-text/textdata.csv', usecols=['eval_set', 'label'])
is_train_row = dl_feat_idx['eval_set'] != 10
train_num = is_train_row.sum()
eval_sets = dl_feat_idx['eval_set'].values[:train_num]
y = dl_feat_idx['label'].values[:train_num]
del dl_feat_idx; gc.collect()

In [51]:
def get_ridge_pred(use_all_feats=False, n_folds=10):
    """Fit Ridge per fold and return out-of-fold train and averaged test predictions.

    Reads the notebook-level names ``df``, ``text_vector``, ``train_num``,
    ``eval_sets`` and ``y``. For each fold id in ``range(n_folds)`` a Ridge
    model is fitted on the training rows outside the fold, predicts the rows
    inside the fold, and predicts all test rows (rows from ``train_num`` on);
    the test predictions are averaged over the folds. The overall
    out-of-fold RMSE is printed.

    Parameters
    ----------
    use_all_feats : bool, default False
        If False, only ``text_vector`` is used. If True, the numeric columns of
        ``df`` are prepended to ``text_vector`` and Ridge is asked to normalize.
    n_folds : int, default 10
        Number of fold ids present in ``eval_sets`` (ids ``0 .. n_folds-1``).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``pred_train`` of length ``train_num`` and ``pred_test`` of length
        ``len(df) - train_num``.
    """
    ridge_params = {'alpha':30.0, 'fit_intercept':True, 'copy_X':True,
                    'max_iter':None, 'tol':0.0001, 'solver':'auto', 'random_state':2018}
    if use_all_feats:
        # `normalize` no longer exists in Ridge from scikit-learn 1.2 on. It is
        # passed only when actually requested, so the default text-only path
        # also runs on current releases (omitting it equals the old
        # normalize=False).
        ridge_params['normalize'] = True
    ridge = Ridge(**ridge_params)
    pred_train = np.zeros(train_num,)
    pred_test = np.zeros(len(df)-train_num,)
    if use_all_feats:
        # Select the numeric feature columns by dtype instead of by position
        # (`values[:, 4:]`), which silently depended on the four text columns
        # still being the first four columns of `df`.
        dense = df.select_dtypes(include='number').values.astype('float32')
        X = sparse.hstack([sparse.csr_matrix(dense), text_vector])
        X = sparse.csr_matrix(X)
    else:
        X = text_vector
    # Slice once; the train/test split does not change between folds.
    X_train, X_test = X[:train_num], X[train_num:]
    for valid_fold in range(n_folds):
        print('processing fold %d...'%valid_fold)
        mask_val = eval_sets==valid_fold
        mask_tr = ~mask_val
        ridge.fit(X_train[mask_tr], y[mask_tr])
        pred_train[mask_val] = ridge.predict(X_train[mask_val])
        pred_test += ridge.predict(X_test)
    pred_test = pred_test / n_folds
    print('Ridge RMSE:', np.sqrt(mean_squared_error(y, pred_train)))
    return pred_train, pred_test

In [37]:
%%time
# Ridge predictions from the text vectors only (use_all_feats defaults to False).
pred_train, pred_test = get_ridge_pred()

In [38]:
# Store the Ridge predictions as a feature column: train predictions first,
# then test predictions.
df['ridge_textvec'] = np.hstack([pred_train, pred_test])

### 3.2 Sentiment Analysis (SLOW)

In [39]:
# %%time
# from textblob import TextBlob
# res = df['text'].apply(lambda x: TextBlob(x).sentiment)
# res = np.array(np.array(res).tolist())
# df['sentPop'] = res[:, 0]
# df['sentSub'] = res[:, 1]
# del res; gc.collect();

# Combination & Interaction
## (Comment these cells out to save space)

In [40]:
# Title word / character counts relative to the description and to the
# combined text.
for short, base in (('W', 'nWord_'), ('C', 'nChar_')):
    for other in ('desc', 'text'):
        df['r' + short + '_title_' + other] = df[base + 'title'] / df[base + other]

In [41]:
# Upper-case, digit and punctuation counts as a share of the character count,
# for each of title / desc / text.
for ratio, count in (('rUp_', 'nUpper_'), ('rNum_', 'nNum_'), ('rSpec_', 'nSpec_')):
    for part in ('title', 'desc', 'text'):
        df[ratio + part] = df[count + part] / df['nChar_' + part]

In [None]:
# Stop-word and keyword counts as a share of the word count, for each of
# title / desc / text.
for ratio, count in (('rStop_', 'nStop_'), ('rKey_', 'nKey_')):
    for part in ('title', 'desc', 'text'):
        df[ratio + part] = df[count + part] / df['nWord_' + part]

In [42]:
# Preview the first two rows, transposed so each column reads as a line.
df.head(2).T

In [43]:
# Drop the text columns (those that are present) before exporting.
df = df.drop(columns=['context', 'text', 'title', 'desc'], errors='ignore')
gc.collect();

In [44]:
# Show the names of the remaining columns.
df.columns.tolist()

In [45]:
# Keep the list of remaining column names.
textmeta_cols = df.columns.tolist()

In [46]:
# Compact summary of the frame (verbose=False omits the per-column listing).
df.info(verbose=False)

In [47]:
# Summary statistics, one row per column.
df.describe().T

In [48]:
# Write the frame to textmeta.csv without the index column.
df.to_csv('textmeta.csv', index=False)