In [None]:
import os
import re

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# Seed reused by the estimators configured further down.
RANDOM_STATE = 42

# Input CSV locations, relative to the notebook's working directory.
train_path = 'data/train.csv'
test_path = 'data/test_X.csv'

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Sanity check: frame sizes and the mean of the binary `is_fake` target.
print('train shape:', train_df.shape)
print('test shape:', test_df.shape)
print('positive rate:', float(train_df['is_fake'].mean()))

# Missing values in the four text columns become empty strings, in both frames.
_text_cols = ['title_name', 'description', 'brand', 'category']
for _frame in (train_df, test_df):
    _frame[_text_cols] = _frame[_text_cols].fillna('')

train_df.head(2)


train shape: (138039, 26)
test shape: (59159, 25)
positive rate: 0.07734770608306349


Unnamed: 0,id,is_fake,brand,description,title_name,category,rating_1_count,rating_2_count,rating_3_count,rating_4_count,...,item_count_sales7,item_count_sales30,item_count_sales90,item_count_returns7,item_count_returns30,item_count_returns90,item_variety_count,item_available_count,seller_time_alive,seller_id
0,0,0,ACTRUM,"Мешки пылесборники для пылесоса PHILIPS, 10 шт...","Мешки для пылесоса PHILIPS TRIATLON, синтетиче...",Пылесборник,6.0,4.0,4.0,3.0,...,2,19,61,0,0,1,1.0,1.0,1860.0,1218
1,1,0,Red Line,Защитная силиконовая крышка обьектива GoPro He...,Защитная крышка Redline на экшн-камеру GoPro (...,Крышка для объектива,,,,,...,0,0,0,0,0,0,1.0,1.0,1757.0,1374


In [None]:
def normalize_text(s):
    """Return ``s`` as a lower-cased, tag-free, whitespace-collapsed string.

    Args:
        s: Any value. Non-strings are converted with ``str``. ``None`` and
            float NaN are treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``s`` is missing.
    """
    # Without this guard str(None) / str(nan) would leak the literal tokens
    # 'none' / 'nan' into the text features.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    s = str(s).lower()
    # Replace anything shaped like an HTML/XML tag with a space.
    s = re.sub(r'<[^>]+>', ' ', s)
    # Collapse each whitespace run into one space and trim both ends.
    return re.sub(r'\s+', ' ', s).strip()


def make_text(df: pd.DataFrame) -> pd.Series:
    """Assemble one tagged text document per row of ``df``.

    Each document is the space-joined sequence
    ``brand:<b> cat:<c> empty_title:<0|1> empty_desc:<0|1> empty_brand:<0|1>
    empty_cat:<0|1> title:<t> desc:<d>``, where the four text fields are
    passed through ``normalize_text`` and each ``empty_*`` flag is ``1`` when
    the corresponding field is blank after stripping whitespace.

    Args:
        df: Frame containing the ``title_name``, ``description``, ``brand``
            and ``category`` columns.

    Returns:
        A Series of strings indexed like ``df``.
    """
    title_raw = df['title_name'].fillna('').astype(str)
    desc_raw = df['description'].fillna('').astype(str)
    brand_raw = df['brand'].fillna('').astype(str)
    cat_raw = df['category'].fillna('').astype(str)

    def _empty_flag(col: pd.Series) -> pd.Series:
        # '1' where the field is blank (after stripping), otherwise '0'.
        return (col.str.strip() == '').astype(int).astype(str)

    # Normalise the NaN-filled series (not the raw columns) so a missing value
    # yields an empty field that agrees with its empty_* flag instead of the
    # literal token 'nan'.
    parts = [
        'brand:' + brand_raw.map(normalize_text),
        'cat:' + cat_raw.map(normalize_text),
        'empty_title:' + _empty_flag(title_raw),
        'empty_desc:' + _empty_flag(desc_raw),
        'empty_brand:' + _empty_flag(brand_raw),
        'empty_cat:' + _empty_flag(cat_raw),
        'title:' + title_raw.map(normalize_text),
        'desc:' + desc_raw.map(normalize_text),
    ]

    # Element-wise concatenation of all parts, separated by single spaces.
    out = parts[0]
    for part in parts[1:]:
        out = out + ' ' + part
    return out

# Binary target as a plain integer array.
y = train_df['is_fake'].astype(int).values

# One tagged text document per row, for the train and test frames.
X_text, X_test_text = make_text(train_df), make_text(test_df)

# Peek at the first 300 characters of the first training document.
print(X_text.iloc[0][:300])


brand:actrum cat:пылесборник empty_title:0 empty_desc:0 empty_brand:0 empty_cat:0 title:мешки для пылесоса philips triatlon, синтетические, многослойные, тип: hr 6947 desc:мешки пылесборники для пылесоса philips, 10 шт., синтетические, многослойные, бренд: actrum, арт. ak-10/10, тип оригинального ме


In [None]:
# Word-level TF-IDF over 1- to 3-grams. The custom token pattern also keeps
# single-character tokens; lowercasing is disabled in the vectoriser itself.
word_tfidf = TfidfVectorizer(
    lowercase=False,
    token_pattern=r'(?u)\b\w+\b',
    ngram_range=(1, 3),
    min_df=2,
    max_features=120_000,
)
# Character n-grams (3 to 6 chars) using the 'char_wb' analyzer.
char_tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    min_df=3,
    max_features=150_000,
)

# Each vectoriser is fitted on the training documents only and then applied
# unchanged to the test documents.
Xw = word_tfidf.fit_transform(X_text)
Xw_test = word_tfidf.transform(X_test_text)

Xc = char_tfidf.fit_transform(X_text)
Xc_test = char_tfidf.transform(X_test_text)

# Word and char features side by side, stored as CSR.
X = hstack([Xw, Xc]).tocsr()
X_test = hstack([Xw_test, Xc_test]).tocsr()

print('X shape:', X.shape)
print('X_test shape:', X_test.shape)


X shape: (138039, 270000)
X_test shape: (59159, 270000)


In [None]:
# SGD logistic-regression settings. An earlier variant (alpha=3e-6,
# l1_ratio=0.15, otherwise identical) used to be assigned here first and was
# immediately overwritten by the dict below, so only this definition ever took
# effect. With l1_ratio=0 the elastic-net penalty reduces to pure L2.
# NOTE(review): a later cell reassigns clf_params before any model is fitted.
clf_params = dict(
    loss='log_loss',
    penalty='elasticnet',
    alpha=3e-6,
    l1_ratio=0,
    class_weight='balanced',
    max_iter=3000,
    tol=1e-3,
    random_state=RANDOM_STATE,
)

In [14]:
# Hyper-parameters for the single SGD model: log-loss objective with an
# elastic-net penalty (30% L1), balanced class weights, and a seed offset by 3
# from the notebook-wide RANDOM_STATE.
clf_params = {
    'loss': 'log_loss',
    'penalty': 'elasticnet',
    'alpha': 1e-6,
    'l1_ratio': 0.3,
    'class_weight': 'balanced',
    'max_iter': 2000,
    'tol': 1e-3,
    'random_state': RANDOM_STATE + 3,
}

In [None]:
# Fit one SGD model on the full training matrix using the current clf_params.
clf = SGDClassifier(**clf_params)
clf.fit(X, y)

# Column 1 of predict_proba is the second entry of clf.classes_, i.e. the
# positive label when y holds 0/1.
test_proba = clf.predict_proba(X_test)[:, 1]

sub_text = pd.DataFrame({'id': test_df['id'].values, 'is_fake': test_proba})

# to_csv does not create missing parent directories, so create the output
# folder first; otherwise a fresh checkout fails here after the model is fitted.
sub_path = 'new_tdata/sub_text.csv'
os.makedirs(os.path.dirname(sub_path), exist_ok=True)
sub_text.to_csv(sub_path, index=False)
print('Saved', sub_path, sub_text.shape)
sub_text.head()


Saved new_tdata/sub_text.csv (59159, 2)


Unnamed: 0,id,is_fake
0,138039,0.020363
1,138040,0.048996
2,138041,0.02756
3,138042,0.029574
4,138043,0.017317


In [None]:
# Alternative single-model settings (15% L1 instead of 30%).
# NOTE(review): no visible cell reads clf_params after this reassignment; the
# ensemble cell builds its own parameter dicts. Confirm whether this is still needed.
clf_params = {
    'loss': 'log_loss',
    'penalty': 'elasticnet',
    'alpha': 1e-6,
    'l1_ratio': 0.15,
    'class_weight': 'balanced',
    'max_iter': 2000,
    'tol': 1e-3,
    'random_state': RANDOM_STATE + 3,
}

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier

# Requires that X, X_test, y, test_df already exist.

# One (alpha, l1_ratio, seed offset) triple per ensemble member; the offset is
# added to RANDOM_STATE.
_MEMBER_SPECS = [
    (1e-6, 0.15, 1),
    (1e-6, 0.15, 2),
    (1e-6, 0.15, 3),
    (1e-6, 0.2, 6),
    (1e-6, 0.2, 7),
    (1e-6, 0.15, 8),
    # Previously tried, currently disabled:
    # (3e-6, 0.2, 4), (3e-6, 0.2, 5), (3e-6, 0.2, 6), (1e-6, 0.20, 9),
]

# Names are derived from the actual hyper-parameters so the progress log can
# never disagree with what is being trained (the hand-written labels used
# before had drifted from the values and contained a duplicate).
MODEL_CFGS = [
    dict(
        name=f'sgd_a{alpha:g}_l1{l1_ratio:.2f}_rs{RANDOM_STATE + offset}',
        alpha=alpha,
        l1_ratio=l1_ratio,
        random_state=RANDOM_STATE + offset,
    )
    for alpha, l1_ratio, offset in _MEMBER_SPECS
]
assert len({cfg['name'] for cfg in MODEL_CFGS}) == len(MODEL_CFGS), 'duplicate ensemble member'

# Settings shared by every member; the per-member values above are layered on top.
base_params = dict(
    loss='log_loss',
    penalty='elasticnet',
    class_weight='balanced',
    max_iter=2000,
    tol=1e-3,
)

preds = []
for cfg in MODEL_CFGS:
    print('fit', cfg['name'])
    # Everything except the display name is forwarded to the estimator.
    params = {**base_params, **{k: v for k, v in cfg.items() if k != 'name'}}

    clf = SGDClassifier(**params)
    clf.fit(X, y)

    # Keep only the second predict_proba column (the positive label for 0/1 y).
    preds.append(clf.predict_proba(X_test)[:, 1].astype(float))

P = np.vstack(preds)  # (n_models, n_test)


def to_rank01(x: np.ndarray) -> np.ndarray:
    """Map the values of a 1-D array to evenly spaced ranks in [0, 1].

    The smallest value gets 0.0 and the largest 1.0. Equal values receive
    distinct ranks in order of appearance, because the sort is stable.

    Args:
        x: 1-D array of scores.

    Returns:
        Float array of the same length holding each element's scaled rank.
    """
    n_items = len(x)
    sorted_idx = x.argsort(kind='mergesort')
    scaled = np.empty(sorted_idx.shape, dtype=float)
    # Scatter the evenly spaced grid back to each element's original position.
    scaled[sorted_idx] = np.linspace(0.0, 1.0, num=n_items, endpoint=True)
    return scaled

# Rank-average the ensemble: convert each member's scores to ranks in [0, 1]
# and take the mean per test row. The saved `is_fake` column therefore holds
# mean scaled ranks, not probabilities.
R = np.vstack([to_rank01(member_scores) for member_scores in P])
r_mean = R.mean(axis=0)

out_rank = 'new_tdata/sub_text_sgd_ens_fullfit_rank.csv'
# to_csv does not create missing parent directories; create the folder first so
# the write cannot fail after all models have already been trained.
os.makedirs(os.path.dirname(out_rank), exist_ok=True)
pd.DataFrame({'id': test_df['id'].values, 'is_fake': r_mean}).to_csv(out_rank, index=False)
print('saved', out_rank)


fit sgd_a3e-6_l10.00_rs1_1
fit sgd_a3e-6_l10.00_rs1_2
fit sgd_a3e-6_l10.00_rs1_3
fit sgd_a1e-6_l10.10_rs3_7
fit sgd_a1e-6_l10.10_rs3_7
fit sgd_a1e-5_l10.00_rs4_8
saved new_tdata/sub_text_sgd_ens_fullfit_rank.csv
