In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb
import target_encoding as te
import gc

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 
import time

In [2]:
# Data location and sample size are named once so that both reads stay in sync.
# NOTE(review): DATA_DIR is a machine-specific absolute path; point it at your
# own copy of the competition files.
DATA_DIR = "/home/jonas/Documents/Uni/DataChallenge"
NROWS = 1000  # rows read from each CSV; set to None to load the full files

train_df = pd.read_csv(DATA_DIR + "/train.csv", parse_dates=["activation_date"], nrows=NROWS)
test_df = pd.read_csv(DATA_DIR + "/test.csv", parse_dates=["activation_date"], nrows=NROWS)

# Remember the train shape/index and the test ids before the frames are
# modified and merged further down.
trainshape = train_df.shape
trainindex = train_df.index
test_id = test_df["item_id"].values
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)


Train file rows and columns are :  (1000, 18)
Test file rows and columns are :  (1000, 17)


In [3]:
# Separate the target from the feature columns.
train_y = train_df["deal_probability"].copy()
train_df = train_df.drop(columns=["deal_probability"])

# Target encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3", "image_top_1"]
# Settings shared by every column's encoding call.
encode_kwargs = dict(min_samples_leaf=100, smoothing=10, noise_level=0.01)
for cat_col in cat_vars:
    # Each call returns the encoded train column and the encoded test column.
    encoded_train, encoded_test = te.target_encode(
        train_df[cat_col], test_df[cat_col], train_y, **encode_kwargs
    )
    train_df[cat_col] = encoded_train
    test_df[cat_col] = encoded_test

In [4]:
# Combine train and test for preprocessing.
# ignore_index=True gives the combined frame a unique 0..N-1 index. Without it
# the train and test row labels are both kept, so every label appears twice and
# any later label-based alignment (pd.concat(axis=1), df.loc[trainindex]) either
# fails or silently picks up test rows as well.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# The separate frames are no longer needed; release them.
del train_df, test_df
gc.collect()

df.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1
0,b912c3c6a6ad,e00f8ff2eaf9,0.144033,0.146267,0.067505,0.151042,0.14765,0.14709,0.149273,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,0.164921,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,0.146774
1,2dac0150717d,39aeb48f0017,0.150794,0.148362,0.193352,0.149667,0.14719,0.148273,0.151414,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,0.16359,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,0.146798
2,ba83aefab5dc,91e2f88dd6e3,0.148517,0.146307,0.159303,0.14739,0.145954,0.148292,0.146676,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,0.164253,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,0.150069
3,02996f1dd2ea,bf5cccea572d,0.149392,0.147632,0.067711,0.147306,0.145003,0.145918,0.145869,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,0.114029,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,0.146316
4,7c90be56d2ab,ef50846afc0b,0.149061,0.150293,0.14703,0.147764,0.148882,0.147379,0.150098,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,0.16558,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,0.147714


In [5]:
# Simple Feature Engineering

# Time Data: day of week and day of month of the activation date.
df["activation_weekday"] = df["activation_date"].dt.weekday
df["activation_monthday"] = df["activation_date"].dt.day

# Price
# TODO: consider imputing missing prices with the mean price of the same
# category_name / region group instead of a sentinel value (an earlier,
# disabled draft of this cell attempted that).

# Log-transform the price; the small offset keeps log() finite for a price of 0.
# Missing prices stay NaN through the transform and are filled right after.
df["price"] = np.log(df["price"] + 0.001)

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: the latter operates on a column selection and is not guaranteed to
# write through to `df` (chained assignment / Copy-on-Write in pandas).
df["price"] = df["price"].fillna(-999)
df["image_top_1"] = df["image_top_1"].fillna(-999)



In [7]:
# Feature Engineering
# Group the three param columns into one space-separated text field.
df['text_feat'] = df.apply(lambda row: ' '.join([
    str(row['param_1']),
    str(row['param_2']),
    str(row['param_3'])]),axis=1) # Group Param Features


# Meta Text Features
textfeats = ["description","text_feat", "title"]
for cols in textfeats:
    # Fill missing text BEFORE casting to str: astype(str) turns NaN into the
    # literal string "nan", after which fillna() has nothing left to replace.
    df[cols] = df[cols].fillna('nicapotato').astype(str) # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_chars'] = df[cols].str.len() # Count number of Characters
    df[cols + '_num_words'] = df[cols].str.split().str.len() # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(comment.split()))) # Count Unique Words
    # Percentage of words that are unique. Text without any word gives 0/0,
    # i.e. NaN, which is left as a missing value.
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100

# Announce the text-vectorisation stage in the cell output.
print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
# NLTK's Russian stop-word list, held as a set. In the visible code it is only
# referenced by the commented-out TF-IDF configuration that follows.
russian_stop = {word for word in stopwords.words('russian')}

#tfidf_para = {
#    "stop_words": russian_stop,
#    "analyzer": 'word',
#    "token_pattern": r'\w{1,}',
#    "sublinear_tf": True,
#    "dtype": np.float32,
#    "norm": 'l2',
#    #"min_df":5,
#    #"max_df":.9,
#    "smooth_idf":False
#}
#def get_col(col_name): return lambda x: x[col_name]
#vectorizer = FeatureUnion([
#        ('description',TfidfVectorizer(
#            ngram_range=(1, 2),
#            max_features=16000,
#            **tfidf_para,
#            preprocessor=get_col('description'))),
#        ('text_feat',CountVectorizer(
#            ngram_range=(1, 2),
#            #max_features=7000,
#            preprocessor=get_col('text_feat'))),
#        ('title',TfidfVectorizer(
#            ngram_range=(1, 2),
#            **tfidf_para,
#            #max_features=7000,
#            preprocessor=get_col('title')))
#    ])
#    
#start_vect=time.time()
#vectorizer.fit(df.loc[trainindex,:].to_dict('records'))
#ready_df = vectorizer.transform(df.to_dict('records'))
#tfvocab = vectorizer.get_feature_names()
#print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))
#
## Drop Text Cols
#df.drop(textfeats, axis=1,inplace=True)

# Binary bag-of-words over the descriptions: unigrams and bigrams that occur in
# at least 5 documents, capped at 3000 features, stored as uint8 presence flags.
# Note that despite the variable name this is a CountVectorizer, not TF-IDF.
vec1 = CountVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=3000,
    binary=True,
    dtype=np.uint8,
)
m_tfidf1 = vec1.fit_transform(df["description"])

# The raw description text has been vectorised; remove it from the frame.
df.drop(columns=["description"], inplace=True)


[TF-IDF] Term Frequency Inverse Document Frequency Stage


(2000, 1911)

In [33]:
# Preview the description count features next to the tabular columns.
# Rows are aligned by POSITION: `df` may carry repeated index labels (train and
# test rows stacked), while the matrix rows are simply 0..N-1, so a label-based
# join would mis-align or fail. Only a few rows are densified so the sparse
# matrix is never fully expanded just for display.
# NOTE: the result is only displayed, not assigned - these features are not
# added to `df` by this cell.
preview_rows = 10
desc_counts_preview = pd.DataFrame(m_tfidf1[:preview_rows].toarray())
pd.concat([df.head(preview_rows).reset_index(drop=True), desc_counts_preview], axis=1)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910
0,b912c3c6a6ad,e00f8ff2eaf9,0.144033,0.146267,0.067505,0.151042,0.147650,0.147090,0.149273,кокоби(кокон для сна),...,0,0,0,0,0,0,0,0,0,0
1,2dac0150717d,39aeb48f0017,0.150794,0.148362,0.193352,0.149667,0.147190,0.148273,0.151414,стойка для одежды,...,0,0,0,0,0,0,0,0,0,0
2,ba83aefab5dc,91e2f88dd6e3,0.148517,0.146307,0.159303,0.147390,0.145954,0.148292,0.146676,philips bluray,...,0,0,0,0,0,0,0,0,0,0
3,02996f1dd2ea,bf5cccea572d,0.149392,0.147632,0.067711,0.147306,0.145003,0.145918,0.145869,автокресло,...,0,0,0,0,0,0,0,0,0,0
4,7c90be56d2ab,ef50846afc0b,0.149061,0.150293,0.147030,0.147764,0.148882,0.147379,0.150098,"ваз 2110, 2003",...,0,0,0,0,0,0,0,0,0,0
5,51e0962387f7,bbfad0b1ad0a,0.148402,0.146067,0.067787,0.145100,0.147191,0.148708,0.150511,авто люлька,...,0,0,0,0,0,0,0,0,0,0
6,c4f260a2b48a,08f469d2e6f7,0.148173,0.144870,0.189949,0.147334,0.149104,0.146014,0.149411,водонагреватель 100 литров нержавейка плоский,...,0,0,0,0,0,0,0,0,0,0
7,6b71309d6a8a,fef86baa002c,0.146821,0.148660,0.068216,0.039286,0.040773,0.150239,0.145374,бойфренды colins,...,0,0,0,0,0,0,0,0,0,0
8,c5b969cb63a2,055825270190,0.145811,0.147853,0.067432,0.039472,0.041131,0.146177,0.147291,платье,...,0,0,0,0,0,0,0,0,0,0
9,b1570962e68c,f9e8f831d94c,0.148365,0.148809,0.068964,0.051747,0.124596,0.100993,0.151686,полу ботиночки замш натур.бамбини,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Split the combined frame back into train and test by row position: the first
# trainshape[0] rows are the training rows, the rest are test rows.
# .copy() makes both frames independent of `df`, so later column assignments do
# not trigger SettingWithCopyWarning and `del df` can release the combined frame.
n_train = trainshape[0]
train_df = df.iloc[:n_train].copy()
test_df = df.iloc[n_train:].copy()

del df
gc.collect()

In [None]:
# Target encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3", "image_top_1"]
for col in cat_vars:
    # These columns are already target-encoded right after loading (before train
    # and test are concatenated). Encoding the encoded, noise-perturbed floats a
    # second time would treat almost every value as its own category, so skip
    # any column that is already numeric.
    # NOTE(review): image_top_1 presumably is numeric in the raw data as well,
    # so it is skipped here even if the earlier encoding cell was not run -
    # confirm which of the two encoding cells is meant to be kept.
    if pd.api.types.is_numeric_dtype(train_df[col]):
        continue
    train_df[col], test_df[col] = te.target_encode(train_df[col], test_df[col], train_y, min_samples_leaf=100, smoothing=10, noise_level=0.01)

In [None]:
#Drop Cols
# Identifier, raw date and image-name columns are not used as model inputs.
cols_to_drop = ["item_id", "user_id", "activation_date", "image"]
# The free-text columns are still raw strings at this point (only their derived
# count/length features are numeric). LightGBM rejects object-dtype DataFrame
# columns, so they must not reach the model. errors="ignore" because
# "description" has normally been dropped already after vectorisation.
text_cols_to_drop = ["title", "text_feat", "description"]
train_X = train_df.drop(cols_to_drop, axis=1).drop(columns=text_cols_to_drop, errors="ignore")
test_X = test_df.drop(cols_to_drop, axis=1).drop(columns=text_cols_to_drop, errors="ignore")

In [None]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regression model with early stopping and predict test_X.

    Parameters
    ----------
    train_X, train_y
        Features and target used to fit the model.
    val_X, val_y
        Hold-out features and target; their RMSE is monitored every round and
        drives early stopping.
    test_X
        Features to predict with the trained model.

    Returns
    -------
    pred_test_y
        Predictions for ``test_X`` made with the best iteration found.
    model
        The trained model returned by ``lgb.train``.
    evals_result
        Dict filled by LightGBM with the validation metric history.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        # The LightGBM parameter is "bagging_freq". The previous key
        # "bagging_frequency" is not a recognised name, so bagging stayed
        # disabled and bagging_fraction had no effect.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Up to 1000 boosting rounds; stop once the validation RMSE has not improved
    # for 100 rounds, logging every 20 rounds.
    # NOTE(review): newer LightGBM releases moved early stopping, logging and
    # result recording to callbacks - confirm the installed version still
    # accepts these keyword arguments.
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [None]:
# Splitting the data for model training#
# The last rows of the training frame are held out for validation. The hold-out
# is 200000 rows, but never more than 20% of the available rows: with a small
# sample (e.g. when only a few rows are read for a quick run) a fixed 200000
# would leave the training part empty. For frames of 1,000,000 rows or more the
# split is identical to the fixed 200000-row hold-out.
MAX_VAL_ROWS = 200000
n_val = min(MAX_VAL_ROWS, len(train_X) // 5)
# Use a positive split position; slicing with -0 would yield an empty frame.
split_at = len(train_X) - n_val
dev_X = train_X.iloc[:split_at, :]
val_X = train_X.iloc[split_at:, :]
dev_y = train_y.iloc[:split_at]
val_y = train_y.iloc[split_at:]
print(dev_X.shape, val_X.shape, test_X.shape)

# Training the model #
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

# Making a submission file #
# The target is a probability, so clamp predictions into [0, 1].
pred_test = np.clip(pred_test, 0, 1)
sub_df = pd.DataFrame({"item_id":test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

In [None]:
# Plot the importance of the 50 most important features of the trained model
# on a tall figure, without grid lines.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
ax.grid(False)
ax.set_title("LightGBM - Feature Importance", fontsize=15)
plt.show()