In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import target_encoding as te
import gc

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 
import time

#Images
from PIL import Image
import zipfile

In [2]:
train_df = pd.read_csv("/home/jonas/Documents/Uni/DataChallenge/train.csv", parse_dates=["activation_date"], nrows=1000)
test_df = pd.read_csv("/home/jonas/Documents/Uni/DataChallenge/test.csv", parse_dates=["activation_date"], nrows=1000)
trainindex = train_df.index
testindex = test_df.index
test_id = test_df["item_id"].values
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)


Train file rows and columns are :  (1000, 18)
Test file rows and columns are :  (1000, 17)


In [3]:
train_y = train_df.deal_probability.copy()
train_df.drop("deal_probability",axis=1, inplace=True)
# Target encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3", "image_top_1"]
for col in cat_vars:
    train_df[col], test_df[col] = te.target_encode(train_df[col], test_df[col], train_y, min_samples_leaf=100, smoothing=10, noise_level=0.01)

In [30]:
# Image Featues
cols_to_add = ['width', 'height', 'img_mean_color', 'img_std_color']
train_df = pd.concat([train_df,pd.DataFrame(columns=cols_to_add)])
test_df = pd.concat([test_df,pd.DataFrame(columns=cols_to_add)])
train_df[cols_to_add] = train_df[cols_to_add].apply(pd.to_numeric)
test_df[cols_to_add] = test_df[cols_to_add].apply(pd.to_numeric)

In [31]:
train_df.dtypes

activation_date         datetime64[ns]
category_name                  float64
city                           float64
description                     object
height                         float64
image                           object
image_top_1                    float64
img_mean_color                 float64
img_std_color                  float64
item_id                         object
item_seq_number                float64
param_1                        float64
param_2                        float64
param_3                        float64
parent_category_name           float64
price                          float64
region                         float64
title                           object
user_id                         object
user_type                      float64
width                          float64
dtype: object

In [27]:
def get_image_features(df, image_id, image):
    dat = np.array(image)
    df.loc[df['image'] == image_id, 'width'] = dat[0].size[0]
    df.loc[df['image'] == image_id, 'height'] = dat[0].size[1]
    df.loc[df['image'] == image_id, 'img_mean_color'] = np.mean(dat[1].flatten())
    df.loc[df['image'] == image_id, 'img_std_color'] = np.std(dat[1].flatten())
    return    

In [29]:
train_zip = zipfile.ZipFile('../input/avito-demand-prediction/train_jpg.zip', 'r')
test_zip = zipfile.ZipFile('../input/avito-demand-prediction/test_jpg.zip', 'r')
zips = [train_zip, test_zip]
def insert_image_features(zipfile):
    df = test_df
    if(zipfile == train_zip):
        df = train_df
    files_in_zip = sorted(zipfile.namelist())
    for i,filename in enumerate(files_in_zip):
        if filename.endswith('.jpg'):
            file = train_zip.open(files_in_zip[i])
            img = Image.open(file)
            image_id = file.split('/')[-1].split('.')[0]
            get_image_features(df, image_id, img)
return

executor = concurrent.futures.ProcessPoolExecutor(2)
futures = [executor.submit(try_my_operation, item) for task in zips]
concurrent.futures.wait(futures)
        
        

FileNotFoundError: [Errno 2] No such file or directory: '../input/avito-demand-prediction/train_jpg.zip'

In [None]:
# Combine train and test for rest of preprocessing

df = pd.concat([train_df,test_df],axis=0)
del train_df, test_df
gc.collect()

df.head()

In [None]:
# Simple Feature Engineering

# Time Data
df["activation_weekday"] = df["activation_date"].dt.weekday
df["activation_monthday"] = df["activation_date"].dt.day

# Price
## Replace Nan with mean in price
#categories = df.category_name.unique()
#region = df.region.unique()
#param1 = df.param_1.unique()
#
#
#df["price_new"] = df["price"].values
#
#for cat in categories:
#    for reg in region:
#        cur_df = df.loc[(df["category_name"] == cat)  & (df["region"] == reg)]["price_new"]
#        cur_df.fillna(np.nanmean(cur_df.values), inplace=True)
#
#
#df["price"] = pd.isna(df["price"])
df["price"] = np.log(df["price"]+0.001)
df["price"].fillna(-999,inplace=True)
df["image_top_1"].fillna(-999,inplace=True)

#Drop Cols
cols_to_drop = ["item_id", "user_id", "activation_date", "image"]
df.drop(cols_to_drop, axis=1,inplace=True)

In [None]:
# Text Features
df['text_feat'] = df.apply(lambda row: ' '.join([
    str(row['param_1']), 
    str(row['param_2']), 
    str(row['param_3'])]),axis=1) # Group Param Features


# Meta Text Features
textfeats = ["description","text_feat", "title"]
for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('nicapotato') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words

In [None]:
print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
russian_stop = set(stopwords.words('russian'))

tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}
def get_col(col_name): return lambda x: x[col_name]
vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=16000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('text_feat',CountVectorizer(
            ngram_range=(1, 2),
            #max_features=7000,
            preprocessor=get_col('text_feat'))),
        ('title',TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,
            #max_features=7000,
            preprocessor=get_col('title')))
    ])
    
start_vect=time.time()
vectorizer.fit(df.loc[trainindex,:].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

# Drop Text Cols
df.drop(textfeats, axis=1,inplace=True)

In [None]:
train_X = hstack([csr_matrix(df.head(trainindex.shape[0]).values),ready_df[0:trainindex.shape[0]]]) # Sparse Matrix
test_X = hstack([csr_matrix(df.tail(testindex.shape[0]).values),ready_df[trainindex.shape[0]:]])
tfvocab = df.columns.tolist() + tfvocab
for shape in [train_X,test_X]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ",len(tfvocab))
del df
gc.collect();

In [None]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y, feature_name=tfvocab)
    lgval = lgb.Dataset(val_X, label=val_y, feature_name=tfvocab)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [None]:

# Splitting the data for model training#
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42)


# Training the model #
pred_test, model, evals_result = run_lgb(X_train, y_train, X_val, y_val, test_X)

# Making a submission file #
pred_test[pred_test>1] = 1
pred_test[pred_test<0] = 0
sub_df = pd.DataFrame({"item_id":test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

In [None]:
fig, ax = plt.subplots(figsize=(12,18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()