In [3]:
#Importing Liraries

# Importing Libraries
import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
import shutil
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib
matplotlib.use(u'nbAgg')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle
import random
import joblib
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from scipy.sparse import hstack
from wordcloud import WordCloud


# Utilities
#from viz_utils import *
#from custom_transformers import *
#from ml_utils import *

# DataPrep
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer

# Modeling

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier
#Metrics
from sklearn.metrics import log_loss,accuracy_score, confusion_matrix, f1_score





#Importing the Libraries
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import load_model




In [4]:
import time
start = time.time()
data = pd.read_csv("olist_customers_dataset.csv")
geo_data = pd.read_csv("olist_geolocation_dataset.csv")
order_itemdata = pd.read_csv("olist_order_items_dataset.csv")
pay_data = pd.read_csv("olist_order_payments_dataset.csv")
rev_data = pd.read_csv("olist_order_reviews_dataset.csv")
orders = pd.read_csv("olist_orders_dataset.csv")
order_prddata = pd.read_csv("olist_products_dataset.csv")
order_selldata = pd.read_csv("olist_sellers_dataset.csv")
order_prd_catdata = pd.read_csv("product_category_name_translation.csv")
end = time.time()
print("reading time: ",(end-start),"sec")

reading time:  2.460000514984131 sec


In [5]:
#merging data
rev_new = rev_data.drop(['review_comment_title','review_creation_date','review_id','review_answer_timestamp'],axis=1)
df = pd.merge(orders,pay_data, on="order_id")
df = df.merge(data, on="customer_id")
df = df.merge(order_itemdata, on="order_id")
df = df.merge(order_prddata, on="product_id")
df = df.merge(order_prd_catdata, on="product_category_name")
df = df.merge(rev_new, on="order_id")


In [6]:
#sellers count for each product
fea_1= df.groupby('product_id').count()['seller_id']
fea_1_df = pd.DataFrame()
fea_1_df['product_id']= fea_1.index
fea_1_df['sellers_count']= fea_1.values
fea_1_df.head()    

Unnamed: 0,product_id,sellers_count
0,00066f42aeeb9f3007548bb9d3f33c38,1
1,00088930e925c41fd95ebfe695fd2655,1
2,0009406fd7479715e4bef61dd91f2462,1
3,000b8f95fcb9e0096488278317764d19,2
4,000d9be29b5207b54e86aa1b1ac54872,1


In [7]:
#sellers count for each product
fea_2 = df.groupby('order_id').count()['product_id']
fea_2_df = pd.DataFrame()
fea_2_df['order_id']= fea_2.index
fea_2_df['products_count']= fea_2.values
fea_2_df.head()

Unnamed: 0,order_id,products_count
0,00010242fe8c5a6d1ba2dd792cb16214,1
1,00018f77f2f0320c557190d7a144bdd3,1
2,000229ec398224ef6ca0657da4fc703e,1
3,00024acbcdf0a6daa1e931b038114c75,1
4,00042b26cf59d7ce69dfabb4e55b4fd9,1


In [8]:
# Adding the seller count and products count feature to the final data set
df = pd.merge(df,fea_1_df,on='product_id')
df = pd.merge(df,fea_2_df,on='order_id')


In [10]:
# separating the target variable
y = df['review_score']
X = df.drop(labels='review_score',axis=1)

# train test 80:20 split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=25)
print("Train data: ",X_train.shape,y_train.shape)
print("Train data: ",X_test.shape,y_test.shape)

Train data:  (93264, 34) (93264,)
Train data:  (23317, 34) (23317,)


In [11]:
#text data preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk
nltk.download('rslp')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [12]:
from gensim.models import FastText
ft_model = FastText.load_fasttext_format('/content/drive/MyDrive/case study/cc.pt.300.bin')
print(ft_model.wv['melhor'].shape)

(300,)


In [13]:
def tfidfWord2Vector(text,ft_words,tfidf_words,tf_values):
    # average Word2Vec
    # compute average word2vec for each review.
    tfidf_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
    for sentence in tqdm(text): # for each review/sentence
        vector = np.zeros(300) # as word vectors are of zero length
        tf_idf_weight =0; # num of words with a valid vector in the sentence/review
        for word in sentence.split(): # for each word in a review/sentence
            if (word in ft_words) and (word in tfidf_words):
                vec = ft_model.wv[word] # embeddings[word] 
                # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
                tf_idf = tf_values[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
                vector += (vec * tf_idf) # calculating tfidf weighted w2v
                tf_idf_weight += tf_idf
        if tf_idf_weight != 0:
            vector /= tf_idf_weight
        tfidf_w2v_vectors.append(vector)
    tfidf_w2v_vectors = np.asarray(tfidf_w2v_vectors)
    
    return tfidf_w2v_vectors


In [14]:
#importing func.py 
from func import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [15]:
def preprocessing(X):
  #replacing null values
  X = replace_nan(X)

  #dedublication
  X,index = dedublicate(X)
  
  timestamp_col = ['order_purchase_timestamp','order_approved_at','order_delivered_customer_date',
                     'order_estimated_delivery_date']
  X[timestamp_col]= X[timestamp_col].apply(pd.to_datetime)
  #adding new features 
  X = feat_engg(X)
  #rfm_lvl = rfm_level()
  rfm = rfm_feat(X)
  X = X.merge(rfm ,on ='customer_unique_id',how='left')    

  #dropping columns     
  col= ['order_id','customer_id','order_purchase_timestamp','order_approved_at','order_delivered_customer_date',
  'order_estimated_delivery_date','customer_unique_id','order_item_id','product_id','seller_id','shipping_limit_date','f_quartile','r_quartile',
  'm_quartile','RFM_Score','RFM_Score_s','product_category_name']
  X.drop(columns=col,axis=1,inplace=True)

  #text preprocessing
  process_txt  = preprocess_text(X['review_comment_message'])
  X['review_comment_message'] = process_txt
  
  #TEXT featurization


  # encoding review comment message using Tfidf weighted W2V
  tfidf = TfidfVectorizer()
  tfidf.fit(X['review_comment_message'])
  
  # we are converting a dictionary with word as a key, and the idf as a value
  tf_values = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))
  tfidf_words = set(tfidf.get_feature_names())
  ft_words = list(ft_model.wv.vocab.keys()) # list(embeddings.keys())
  tfidf_w2v_vectors_X = tfidfWord2Vector(X['review_comment_message'].values,ft_words,tfidf_words,tf_values)

  cat_col = ['order_status','payment_type','customer_state','product_category_name_english','RFM_Level']
  X_train = pickle.load(open('X_train.pkl','rb')) 
  cat_feat = cat_feats(cat_col,X_train,X)

  # numerical features
  num=['payment_sequential','payment_installments','payment_value','customer_zip_code_prefix','price',
  'freight_value','product_name_lenght','product_description_lenght','product_photos_qty',
  'product_weight_g','product_length_cm','product_height_cm','product_width_cm',
  'recency','frequency','monetary','sellers_count','products_count','est_delivery_t',
  'act_delivery_t','diff_in_delivery_t','on_time_delivery','avg_prdt_value','total_order_cost',
  'order_freight_ratio','purchase_dayofweek','is_reviewed','words_per_review','day_to_delivery']
   
  X_num = num_feats(X,num)
 
  #tokenization and pad_sequencing 
  textX = create_tokenizer(X_train['review_comment_message'],X['review_comment_message'])
  '''input = []
  for cat in cat_col:
    pad = create_tokenizer(X_train[cat],X[cat])
    input.append(pad)
  input = list(textX)+input'''
  trainX_os = create_tokenizer(X_train['order_status'],X['order_status'])
  trainX_pt = create_tokenizer(X_train['payment_type'],X['payment_type'])
  trainX_st = create_tokenizer(X_train['customer_state'],X['customer_state'])
  trainX_pc = create_tokenizer(X_train['product_category_name_english'],X['product_category_name_english'])
  trainX_rfm = create_tokenizer(X_train['RFM_Level'],X['RFM_Level'])
  x_train=[textX, trainX_os, trainX_pt, trainX_st, trainX_pc, trainX_rfm, X_num]
  return tfidf_w2v_vectors_X,cat_feat,X_num,textX,x_train,index

In [47]:
from scipy.sparse import hstack
def function_1(X):
    tfidf_w2v_vectors_X,cat_feat,X_num,textX,x_train,index = preprocessing(X)
    X_tr = hstack((tfidf_w2v_vectors_X,list(cat_feat.values())[0],list(cat_feat.values())[1],list(cat_feat.values())[2],
                    list(cat_feat.values())[3],list(cat_feat.values())[4],X_num)).tocsr()
    # load the model from file
    encoder = load_model('encoder.h5')

    # encode the train data
    X_encode = encoder.predict(X_tr)
    X_encode_1 = X_encode.reshape(X_encode.shape[0],X_encode.shape[1],1)


    # merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039

    X_tr_num = hstack((list(cat_feat.values())[0],list(cat_feat.values())[1],list(cat_feat.values())[2],
                    list(cat_feat.values())[3],list(cat_feat.values())[4],X_num)).tocsr()

    x_tr_num = np.array(X_tr_num.todense()).reshape(X_tr_num.shape[0],X_tr_num.shape[1],1)
        
    #loading models
    model_1 = load_model('models/model_1.h5')
    model_2 = load_model('models/model_2.h5')
    model_3 = load_model('models/model_3.h5')
    model_4 = load_model('models/model_4.h5')
    model_5 = load_model('models/model_5.h5')

    #saving the model
    filename = '/content/drive/MyDrive/case study/models/stacknn2.sav'
    stacknn2 = joblib.load(filename)

    
    #prediction of train data
    y_pred1 = model_1.predict(X_encode)
    y_pred2 = model_2.predict(X_encode_1)
    y_pred3 = model_3.predict(x_train)
    y_pred4 = model_4.predict([textX,x_tr_num])
    y_pred5 = model_5.predict([textX,X_tr_num])
    

    y_pred = stacknn2.predict(np.stack((np.greater(y_pred1,0.5).astype(int)[:,0],
                                                np.greater(y_pred2,0.5).astype(int)[:,0],
                                                np.greater(y_pred3,0.5).astype(int)[:,0],
                                                np.greater(y_pred4,0.5).astype(int)[:,0],
                                                np.greater(y_pred5,0.5).astype(int)[:,0]),axis=-1))
    
    return y_pred,index

In [48]:
def function_2(X,y):
  y_pred,index = func_1(X)
  y = y.apply(lambda x:1 if x>3 else 0)
  y = y.drop(index=index,axis=0)
  return f1_score(y,y_pred,average='macro')

In [41]:
import logging
logging.getLogger('tensorflow').disabled = True

In [50]:
%%time
function_2(X_test,y_test)

100%|██████████| 22161/22161 [00:05<00:00, 3955.68it/s]
100%|██████████| 22161/22161 [29:47<00:00, 12.40it/s]


CPU times: user 31min 29s, sys: 16.3 s, total: 31min 45s
Wall time: 31min 9s


0.7766569587113161

In [51]:
%%time
function_2(X.iloc[300:1000],y.iloc[300:1000])

100%|██████████| 605/605 [00:00<00:00, 4046.39it/s]
100%|██████████| 605/605 [00:47<00:00, 12.64it/s]


CPU times: user 1min 1s, sys: 636 ms, total: 1min 2s
Wall time: 1min 1s


0.7484158415841584