In [341]:
# Libraries
from gensim.similarities import MatrixSimilarity
from TurkishStemmer import TurkishStemmer
from gensim import corpora

import pandas as pd
import gensim
import pprint
import string
import spacy
import re

In [342]:
NUMBERS = ["1", "2", "3", "4", "5", "6", "7", "8", "9"]

pd.options.display.max_rows = 9999
pd.options.display.max_columns = 9999
spacy_nlp = spacy.blank('tr')
stop_words = list(spacy_nlp.Defaults.stop_words)
pp = pprint.PrettyPrinter(indent=4)
punctuations = string.punctuation
stemmer = TurkishStemmer()

In [343]:
def filter(query):
  # Remove puncuations
  query = re.sub(r'[^\w\s]',' ',query)

  # Remove extra spaces
  query = re.sub(' +',' ',query)

  # Get tokens
  tokens = spacy_nlp(query)

  # Lower, strip and lemmatize
  tokens = [word.text.lower().strip() for word in tokens]


  # Remove stopwords, and exclude words less than 2 characters
  tokens = [ word for word in tokens if 
            (word not in stop_words) and 
            (word not in punctuations) and 
            (word in NUMBERS or len(word) >= 2) and 
            (word != "yok") and
            (word != "nan")]

  # Stem tokens - Does not works well :(
  # tokens = [stemmer.stem(word) for word in tokens]

  return tokens

In [344]:
def nan_detector(df, percentage):
  nans = df.isna().sum()
  maxm = len(df)*percentage
  result = []
  for j, val in enumerate(nans):
    if val > maxm:
      result.append(nans.keys()[j])
  return result

In [345]:
def unique_value_analysis(df, minm):
  uniques = df.nunique()
  result = []
  for j, val in enumerate(uniques):
    if val < minm:
      result.append(uniques.keys()[j])
  return result

In [346]:
def sepereate_num_cat(df):
  categorical = []
  numerical = []
  for feature in df.columns:
    if df[feature].dtype == object:
      categorical.append(feature)
    else:
      numerical.append(feature)
  return categorical, numerical

In [347]:
# Read DataFrame and convert all columnst to object type since we are dealing with NLP
df = pd.read_excel("./dataset/ebebek.xlsx")
df.shape

(226263, 31)

In [348]:
sepereate_num_cat(df)

(['STOK_ADI',
  'ANA_KATEGORI_ADI',
  'WEB_ANA_KATEGORI_TANIMI',
  'KATEGORI_ADI',
  'ALT_KATEGORI_ADI',
  'MARKA_ADI',
  'RENK_ADI',
  'BEDEN',
  'TEKSTIL_URUN_GRUBU_ADI',
  'TEKSTIL_ADET_ADI',
  'TEKSTIL_KOL_TIPI',
  'TEKSTIL_KOL_TIPI_TANIMI',
  'TEKSTIL_PACA_TIPI',
  'TEKSTIL_PACA_TIPI_TANIMI',
  'TEKSTIL_YAKA_TIPI_TANIMI',
  'WEB_UZUN_ACIKLAMA',
  'WEB_KISA_ACIKLAMA',
  'META_KEY_ACIKLAMA',
  'TEKNIK_SERVIS_ACIKLAMA',
  'MALZEME_UZUN_ACIKLAMA',
  'ASKI_DURUMU',
  'EN_MALZEME_UZUN_ACIKLAMA',
  'EN_STOK_ADI',
  'REYON_TANIM',
  'ANALIZ_KATEGORISI',
  'WEB_BIRINCI_KATEGORI_TANIMI',
  'WEB_IKINCI_KATEGORI_TANIMI',
  'WEB_UCUNCU_KATEGORI_TANIMI',
  'MAGAZA_KONUMU'],
 ['TEKSTIL_CINSIYET', 'TEKSTIL_YAKA_TIPI'])

In [349]:
# Find columns that includes more than %50 NaN values
nan_detector(df, .5)

['TEKSTIL_KOL_TIPI',
 'TEKSTIL_KOL_TIPI_TANIMI',
 'TEKSTIL_PACA_TIPI',
 'TEKSTIL_PACA_TIPI_TANIMI',
 'TEKSTIL_YAKA_TIPI',
 'TEKSTIL_YAKA_TIPI_TANIMI',
 'WEB_KISA_ACIKLAMA',
 'META_KEY_ACIKLAMA',
 'TEKNIK_SERVIS_ACIKLAMA',
 'EN_MALZEME_UZUN_ACIKLAMA',
 'REYON_TANIM']

In [350]:
uniques = unique_value_analysis(df, 10)
for key in uniques:
  pp.pprint(key)
  pp.pprint(df[key].value_counts())

'TEKSTIL_CINSIYET'
1045.0    81928
1044.0    76609
1046.0    43182
1049.0     4689
1047.0      249
1048.0        1
Name: TEKSTIL_CINSIYET, dtype: int64
'TEKSTIL_KOL_TIPI'
931    42797
930    31438
928    11197
929     3450
#YO      468
934        5
Name: TEKSTIL_KOL_TIPI, dtype: int64
'TEKSTIL_KOL_TIPI_TANIMI'
Uzun Kol     42797
Kısa Kol     31438
Atlet        11197
İp Askılı     3450
Name: TEKSTIL_KOL_TIPI_TANIMI, dtype: int64
'TEKSTIL_YAKA_TIPI'
1199.0    35697
1200.0    10483
1204.0     6144
1202.0     5929
1198.0     5605
1201.0     2202
1206.0     1602
1203.0     1320
1205.0      624
Name: TEKSTIL_YAKA_TIPI, dtype: int64
'TEKSTIL_YAKA_TIPI_TANIMI'
Bisiklet Yaka    35697
Çıtçıtlı Yaka    10483
Modelli Yaka      6144
Zarf Yaka         5929
Bebe Yaka         5605
Polo Yaka         2202
V Yaka            1602
Hakim Yaka        1320
Fırfırlı Yaka      624
Name: TEKSTIL_YAKA_TIPI_TANIMI, dtype: int64
'WEB_UZUN_ACIKLAMA'
X    140781
Name: WEB_UZUN_ACIKLAMA, dtype: int64
'WEB_KISA_ACIKLAM

TEKSTIL_KOL_TIPI and TEKSTIL_YAKA_TIPI will be dropped. Since they are numerical. <br>
WEB_UZUN_ACIKLAMA, WEB_KISA_ACIKLAMA, META_KEY_ACIKLAMA, TEKNIK_SERVIS_ACIKLAMA will be dropped since they only consist of 'X' values.<br>
CINSIYET will be replaced by the following,<br>
1044 -> Unisex 
1045 -> Kız 
1046 -> Erkek

In [351]:
dropped = [
 'TEKSTIL_KOL_TIPI',
 'TEKSTIL_KOL_TIPI_TANIMI',
 'TEKSTIL_PACA_TIPI',
 'TEKSTIL_PACA_TIPI_TANIMI',
 'TEKSTIL_YAKA_TIPI',
 'TEKSTIL_YAKA_TIPI_TANIMI',
 'WEB_KISA_ACIKLAMA',
 'META_KEY_ACIKLAMA',
 'TEKNIK_SERVIS_ACIKLAMA',
 'EN_MALZEME_UZUN_ACIKLAMA',
 'REYON_TANIM',
 'TEKSTIL_KOL_TIPI',
 'TEKSTIL_YAKA_TIPI',
 'WEB_UZUN_ACIKLAMA',
 'WEB_KISA_ACIKLAMA',
 'META_KEY_ACIKLAMA',
 'TEKNIK_SERVIS_ACIKLAMA'
 ]

In [352]:
# Drop unnecessary columns
df.drop(dropped, axis=1, inplace=True)
df.shape

(226263, 19)

In [353]:
# Convert CINSIYET columns to object
df["TEKSTIL_CINSIYET"] = df["TEKSTIL_CINSIYET"].replace([1044, 1045, 1046], ["unisex", "kız", "erkek"])

In [354]:
# Convert every column to string since we are dealing with NLP task
df.fillna("yok", inplace=True)
for feature in df.columns:
  df[feature] = df[feature].astype(str)

In [355]:
for j, feature in enumerate(df.columns):
  if not j:
    df["final_tokenized"] = df[feature].map(lambda x: filter(x))
  else:
    df["final_tokenized"] += df[feature].map(lambda x: filter(x))


In [356]:
keywords = df["final_tokenized"]

#creating term dictionary
dictionary = corpora.Dictionary(keywords)
dictionary.save("./models/dictionary")

# creating corpus
corpus = [dictionary.doc2bow(desc) for desc in keywords]

In [357]:
print("Dictionary {")
for i, value in enumerate(dictionary):
  print(f"          '{dictionary[value]}': {value}")
  if i > 20:
    break
print("}")

Dictionary {
          'boy': 0
          'büyük': 1
          'ebebek': 2
          'hazır': 3
          'hediye': 4
          'malzemeler': 5
          'masa': 6
          'paketi': 7
          'sarf': 8
          '040mm': 9
          '075mm': 10
          '75x100': 11
          'baskisiz': 12
          'baskılı': 13
          'bebek': 14
          'cm': 15
          'eco': 16
          'eti̇ket': 17
          'termal': 18
          'şeffaf': 19
          'ana': 20
          'araç': 21
}


In [358]:
for i in range(3):
  print(25*"*" + f" {i} " + 25*"*")
  print(f"document: {df.final_tokenized[i]}")
  print(f"corpus: {corpus[i]}")
  

************************* 0 *************************
document: ['hazır', 'büyük', 'boy', 'hediye', 'paketi', 'sarf', 'malzemeler', 'ebebek', 'hazır', 'büyük', 'boy', 'hediye', 'paketi', 'sarf', 'malzemeler', 'masa']
corpus: [(0, 2), (1, 2), (2, 1), (3, 2), (4, 2), (5, 2), (6, 1), (7, 2), (8, 2)]
************************* 1 *************************
document: ['075mm', '040mm', 'eco', 'termal', 'baskisiz', 'eti̇ket', 'sarf', 'malzemeler', 'sarf', 'malzemeler', 'sarf', 'malzemeler', 'sarf', 'malzemeler', 'bebek', '075mm', '040mm', 'eco', 'termal', 'baskisiz', 'eti̇ket', 'şeffaf', 'ebebek', 'baskılı', '75x100', 'cm', 'sarf', 'malzemeler', 'sarf', 'malzemeler', 'sarf', 'malzemeler']
corpus: [(2, 1), (5, 7), (8, 7), (9, 2), (10, 2), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 2), (17, 2), (18, 2), (19, 1)]
************************* 2 *************************
document: ['075mm', '075mm', 'eco', 'termal', 'baskisiz', 'eti̇ket', 'sarf', 'malzemeler', 'sarf', 'malzemeler', 'sarf', 'malz

<h2><center>Models</center></h2>
<h3>TF-IDF Model</h3>
  TF-IDF stands for term frequency - inverse document frequency. Term frequency means occurences of each words in a current document. Document frequency means occurences each word in whole document set. With the help these definitons we can express the TF-IDF score as follows:<br>

  $$ tf idf (t, d, D) = tf(t, d) \times \log ({n \over df(t)}) $$
  
<h3>LSI Model</h3>

  The basic idea behind LSI is to take advantage of impli- cit higher-order structure in the association of terms with documents (‘‘semantic structure”) in order to improve the detection of relevant documents, on the basis of terms found in queries.<br>
  LSI aims to find the best subspace approximation to the original document space in the sense of minimizing the global reconstruc- tion error (the difference of Frobenius norm between the original matrix and its approximation matrix). It is based on SVD (Singular Value Decomposition) and projects the document vectors into an approximated subspace, so that cosine similarity can accurately represent semantic similarity. (W. Zhang et al. / Expert Systems with Applications 38 (2011) 2758–2765)<br><br>
  
  Lets assume that A is a matrix that represents the tfidf scores of th terms. Than, if we apply SVD to the A matrix we can get result like on the below.
  $$ A = TSD^T $$
  Where T is the m by r term-concept vector matrix, S is the r by r singular values matrix, D is the n by r concept-document vector matrix


In [359]:
# Creating TFIDF and LSI models 
ebebek_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary, smartirs="npu")
ebebek_lsi_model = gensim.models.LsiModel(ebebek_tfidf_model[corpus], id2word=dictionary, num_topics=900)

# Saving model corpus
gensim.corpora.MmCorpus.serialize('./models/tfidf/ebebek_tfidf_corpus', ebebek_tfidf_model[corpus])
gensim.corpora.MmCorpus.serialize('./models/lsi/ebebek_lsi_corpus',ebebek_lsi_model[ebebek_tfidf_model[corpus]])

# Saving TFIDF and LSI models
ebebek_tfidf_model.save("./models/tfidf/ebebek_tfidf")
ebebek_lsi_model.save("./models/lsi/ebebek_lsi")

In [360]:
ebebek_tfidf_corpus = gensim.corpora.MmCorpus('./models/tfidf/ebebek_tfidf_corpus')
ebebek_lsi_corpus = gensim.corpora.MmCorpus('./models/lsi/ebebek_lsi_corpus')

malzeme_index = MatrixSimilarity(ebebek_lsi_corpus, num_features = ebebek_lsi_corpus.num_terms)
malzeme_index.save("./models/malzeme_index")

In [361]:
pp.pprint("TF-IDF Scores")
for i, token in enumerate(ebebek_tfidf_corpus):
  print(25*"*" + f" {i} " + 25*"*")
  print(token)
  if i > 0:
    break

'TF-IDF Scores'
************************* 0 *************************
[(0, 0.5851315519583672), (1, 1.1105291555344703), (2, 0.6407592953544803), (3, 1.7461007583504555), (4, 1.4491962501042694), (5, 1.3104070859882164), (6, 0.0327673502533431), (7, 1.155335354306532), (8, 1.3104070859882164)]
************************* 1 *************************
[(2, 0.6042884629225614), (5, 4.325373995156828), (8, 4.325373995156828), (9, 1.911041437869605), (10, 1.8444507475906713), (11, 0.7664361909564749), (12, 1.7052952185560455), (13, 0.24676303279393713), (15, 0.4196437096088453), (16, 1.3925312464125104), (17, 1.7605558623017357), (18, 1.0570252475665314), (19, 0.6152600004612219)]


In [362]:
pp.pprint("LSI Scores - 226263 X 1000")
for i, token in enumerate(ebebek_lsi_corpus):
  print(25*"*" + f" {i} " + 25*"*")
  print(token[:10])
  if i > -1:
    break

'LSI Scores - 226263 X 1000'
************************* 0 *************************
[(0, 0.005497907040379719), (1, 0.01713535089712121), (2, 0.0003765940284761198), (3, 0.0027794364312258733), (4, 0.00700475856654261), (5, 0.010134953330622235), (6, -0.0017751325824238513), (7, 0.012523135802566018), (8, 0.0039051499052488565), (9, 0.007873681465036095)]


In [363]:
pp.pprint("Matrix Similarity Results - 226263 X 226263")
for i, val in enumerate(malzeme_index):
  print(25*"*" + f" {i} " + 25*"*")
  print(val)
  if i > -1:
    break

'Matrix Similarity Results - 226263 X 226263'
************************* 0 *************************
[ 0.9999998   0.874396    0.8735253  ...  0.00638833 -0.00280114
 -0.00125104]


In [364]:
malzeme_index.num_best = 3
malzeme_index[[(0, .4), (1, .23)]]

[(142022, 0.31501534581184387),
 (212071, 0.31501534581184387),
 (107907, 0.3147602081298828)]