In [365]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Switch area: the settings to change when running this notebook for another subject.
# SUBJECT is the prefix of the vocabulary and train-data file names built below.
SUBJECT = 'material_garment'
# Number of word-id columns in the full feature matrix (passed to make_full_feature).
# NOTE(review): presumably has to equal the vocabulary size used during training - confirm.
VOCAB_SIZE = 1200

## Select Vocab and Train_data

In [366]:
# Input files for the selected SUBJECT: the vocabulary CSV and one token file
# per category (0 and 1). The directory names are kept exactly as they are spelled here.
VOCAB = f'DataSource/{SUBJECT}_vocabulary.csv'
TRAIN_DATA_0 = f'Trainning Data/{SUBJECT}_prob_tokens_ctg_0_in_train_data'
TRAIN_DATA_1 = f'Trainning Data/{SUBJECT}_prob_tokens_ctg_1_in_train_data'

In [367]:
TRAIN_DATA_0

'Trainning Data/material_garment_prob_tokens_ctg_0_in_train_data'

In [368]:
# Load the vocabulary file; the first CSV column becomes the index.
# make_sparse_matrix later reads its VOCAB_WORD column.
vocab = pd.read_csv(VOCAB, index_col = 0)

## NLTK stemming function

In [369]:
def stemmered_nltk_convert(col_of_df):
    '''
    Tokenize one text value and return its filtered, stemmed tokens.

    Despite the parameter name, this works on a single string (one cell of a
    column); it is meant to be used with ``Series.apply``.

    Parameters
    ----------
    col_of_df : str
        The text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates are kept).
        English stop words and tokens that are not purely alphabetic
        (numbers, punctuation, mixed codes such as "10K") are dropped.
    '''
    # Define stop words
    stop_words = set(stopwords.words('english'))
    # Define stemmer
    stemmer = SnowballStemmer('english')

    # Split the text into word tokens
    words = word_tokenize(col_of_df)
    filtered_words = []

    for word in words:
        # Compare in lower case: the stop-word list is lower-case, so without
        # this "The" or "BY" would slip through the filter and be kept as tokens.
        lowered = word.lower()
        # Keep the token only if it is not a stop word and is purely alphabetic.
        if lowered not in stop_words and lowered.isalpha():
            filtered_words.append(stemmer.stem(lowered))

    return filtered_words

## Sample input to check

In [370]:
# Sample material descriptions used to exercise the prediction pipeline end to end.
a = 'AIRBLASTER Style Number: AB18MJ1_032'
b = 'SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-Layer +breathable coating+DWR (supplied by SOLIS)'
c = 'YKK No.8 Vislon 2a way  zipper.  Auto-lock slider DA8LH, Silders bottom to bottom   (Right hand pull left hand insert)'
d = '100gm Insulation'
e = '1x1 rib knit,good quality Arcylic/Spandex'
# Collected in order; the list position becomes the material id further down.
words = [a, b, c, d, e]

In [371]:
words

['AIRBLASTER Style Number: AB18MJ1_032',
 'SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-Layer +breathable coating+DWR (supplied by SOLIS)',
 'YKK No.8 Vislon 2a way  zipper.  Auto-lock slider DA8LH, Silders bottom to bottom   (Right hand pull left hand insert)',
 '100gm Insulation',
 '1x1 rib knit,good quality Arcylic/Spandex']

In [372]:
words_series = pd.Series(words)

In [373]:
type(words_series)

pandas.core.series.Series

In [374]:
words_series

0                 AIRBLASTER Style Number: AB18MJ1_032
1    SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-L...
2    YKK No.8 Vislon 2a way  zipper.  Auto-lock sli...
3                                     100gm Insulation
4            1x1 rib knit,good quality Arcylic/Spandex
dtype: object

* Convert the series of bom into Stemmed bom

In [375]:
%%time
stemmed_bom = words_series.apply(stemmered_nltk_convert)

CPU times: user 4.96 ms, sys: 4.49 ms, total: 9.45 ms
Wall time: 8.98 ms


In [376]:
stemmed_bom

0                            [airblast, style, number]
1                              [polyest, suppli, soli]
2    [ykk, vislon, way, zipper, slider, silder, bot...
3                                              [insul]
4                           [rib, knit, good, qualiti]
dtype: object

* Convert the Stemmed bom into df

In [377]:
# word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())
# word_col_df

In [378]:
# One row per description and one column per token position; rows with
# fewer tokens than the longest one are padded with missing values.
word_col_df = pd.DataFrame.from_records(stemmed_bom)
word_col_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,airblast,style,number,,,,,,,,,,,
1,polyest,suppli,soli,,,,,,,,,,,
2,ykk,vislon,way,zipper,slider,silder,bottom,bottom,right,hand,pull,left,hand,insert
3,insul,,,,,,,,,,,,,
4,rib,knit,good,qualiti,,,,,,,,,,


### Sparse Matrix Function
* Create a sparse Matrix for the data we want to predict
* Unlike the version used in the classification model for the training data, this function does not need CATEGORY.

In [379]:
def make_sparse_matrix(df, vocabulary):
    """
    Build a long-format (sparse) word-occurrence table for the data to predict.

    Param1:
    df - DataFrame of tokens: one row per document, one column per token
    position. Cells that are not vocabulary words (including the missing
    values used as padding) are ignored.
    Param2:
    vocabulary - the vocabulary generated during training; its VOCAB_WORD
    column is read and the position of a word in that column is its WORD_ID.

    Returns a DataFrame with the columns MATERIAL_ID (the row label of df),
    OCCURENCE (always 1) and WORD_ID, holding one row per token found in the
    vocabulary. Repeated tokens give repeated rows. The three columns are
    present even when nothing matched, so later group-by steps keep working.
    """
    # Map each vocabulary word to its first position. A plain dict gives a
    # single lookup per token and always yields an integer id, even if a word
    # appears more than once in the vocabulary.
    word_to_id = {}
    for position, vocab_word in enumerate(vocabulary.VOCAB_WORD):
        word_to_id.setdefault(vocab_word, position)

    dict_list = []

    for doc_id, row_tokens in zip(df.index, df.to_numpy()):
        for word in row_tokens:
            word_id = word_to_id.get(word)
            if word_id is None:
                # Not a vocabulary word (or a padding value): skip it.
                continue

            dict_list.append({'MATERIAL_ID': doc_id,
                              'OCCURENCE': 1, 'WORD_ID': word_id})

    # Explicit columns so an empty result still has the expected schema.
    return pd.DataFrame(dict_list, columns=['MATERIAL_ID', 'OCCURENCE', 'WORD_ID'])

* Build the sparse matrix for the data we want to predict

In [380]:
%%time
sparse_predict_df = make_sparse_matrix(word_col_df, vocab)

CPU times: user 3.78 ms, sys: 760 µs, total: 4.54 ms
Wall time: 4.56 ms


In [381]:
sparse_predict_df

Unnamed: 0,MATERIAL_ID,OCCURENCE,WORD_ID
0,0,1,391
1,0,1,72
2,0,1,11
3,1,1,44
4,1,1,32
5,1,1,99
6,2,1,31
7,2,1,147
8,2,1,180
9,2,1,1


* Grouped by MATERIAL_ID

In [382]:
# Sum OCCURENCE per (MATERIAL_ID, WORD_ID) so that a word repeated within one
# material collapses into a single row carrying its count.
sparse_predict_df_grouped = sparse_predict_df.groupby(['MATERIAL_ID', 'WORD_ID']).sum()
sparse_predict_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,OCCURENCE
MATERIAL_ID,WORD_ID,Unnamed: 2_level_1
0,11,1
0,72,1
0,391,1
1,32,1
1,44,1
1,99,1
2,1,1
2,5,1
2,12,2
2,14,1


In [383]:
# Turn the (MATERIAL_ID, WORD_ID) group keys back into ordinary columns.
# NOTE(review): this reassigns the same name, so the cell is not idempotent -
# running it a second time adds an extra 'index' column, which would shift the
# columns written to disk afterwards. Re-run the group-by cell first.
sparse_predict_df_grouped = sparse_predict_df_grouped.reset_index()
sparse_predict_df_grouped

Unnamed: 0,MATERIAL_ID,WORD_ID,OCCURENCE
0,0,11,1
1,0,72,1
2,0,391,1
3,1,32,1
4,1,44,1
5,1,99,1
6,2,1,1
7,2,5,1
8,2,12,2
9,2,14,1


In [384]:
np.savetxt('Trainning Data/' + SUBJECT + '_sparse_predict_data', sparse_predict_df_grouped, fmt = '%d')

In [385]:
PREDICT_DATA = 'Trainning Data/' + SUBJECT + '_sparse_predict_data'

In [386]:
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row; without it a one-row file loads as a 1-D array and column indexing
# such as sparse_matrix[:, 0] fails.
sparse_predict_data = np.loadtxt(PREDICT_DATA, delimiter = ' ', dtype = int, ndmin = 2)

## Full Matrix
* Since we only want to predict, we build the full feature matrix directly.

In [387]:
def make_full_feature(sparse_matrix, nr_words, doc_idx = 0, word_idx = 1, freq_idx = 2):
    """
    Expand a sparse (document, word, count) table into a dense feature matrix.

    Parameters
    ----------
    sparse_matrix : array-like
        One row per (document, word) pair. A single row given as a 1-D array
        is accepted as well.
    nr_words : int
        Number of word-id columns in the result; word ids must lie in
        ``[0, nr_words)``.
    doc_idx, word_idx, freq_idx : int
        Positions of the document id, the word id and the occurrence count
        within each row of ``sparse_matrix``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the sorted unique document ids (index name 'MATERIAL_ID'),
        with columns ``0 .. nr_words - 1``. Cells hold the occurrence count
        and are 0 where a word does not occur. Only documents that appear in
        ``sparse_matrix`` get a row.

    Raises
    ------
    ValueError
        If a word id falls outside ``[0, nr_words)``.
    """
    entries = np.asarray(sparse_matrix)
    if entries.size == 0:
        # No entries at all: keep a well-formed 2-D shape with zero rows.
        entries = entries.reshape(0, max(doc_idx, word_idx, freq_idx) + 1)
    else:
        # A file with a single row loads as a 1-D array; promote it to one row.
        entries = np.atleast_2d(entries)

    doc_ids = entries[:, doc_idx]
    word_ids = entries[:, word_idx].astype(np.int64)
    occurrences = entries[:, freq_idx]

    # An id outside the vocabulary would otherwise silently land in (or create)
    # the wrong column and break the later dot product.
    out_of_range = (word_ids < 0) | (word_ids >= nr_words)
    if out_of_range.any():
        raise ValueError(
            'word ids must be in [0, %d); got %s'
            % (nr_words, np.unique(word_ids[out_of_range]).tolist())
        )

    # row_positions maps every entry to the row of its document in the result.
    doc_id_names, row_positions = np.unique(doc_ids, return_inverse=True)

    dense = np.zeros((doc_id_names.size, nr_words), dtype=entries.dtype)
    # If a (document, word) pair is repeated, the last value wins.
    dense[row_positions, word_ids] = occurrences

    return pd.DataFrame(
        dense,
        index=pd.Index(doc_id_names, name='MATERIAL_ID'),
        columns=range(nr_words),
    )

In [388]:
%%time
predict_full_feature = make_full_feature(sparse_predict_data, VOCAB_SIZE)

CPU times: user 151 ms, sys: 7.02 ms, total: 158 ms
Wall time: 158 ms


In [389]:
predict_full_feature

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Joint probability

In [390]:
train_data_1 = np.loadtxt(TRAIN_DATA_1)
train_data_0 = np.loadtxt(TRAIN_DATA_0)

In [391]:
# Score every material against each category: the dot product of its word
# counts with that category's per-token values loaded from the train-data files.
# NOTE(review): this adds the token values weighted by count. If those files
# hold token probabilities (assumption - confirm), a naive-Bayes joint
# probability would instead sum log-probabilities and add the log class prior.
joint_prob_1 = predict_full_feature.dot(train_data_1)
joint_prob_0 = predict_full_feature.dot(train_data_0)

In [392]:
joint_prob_1

MATERIAL_ID
0    0.001731
1    0.015897
2    0.113199
3    0.002634
4    0.002413
dtype: float64

In [393]:
joint_prob_0

MATERIAL_ID
0    0.091707
1    0.000195
2    0.024976
3    0.014504
4    0.000260
dtype: float64

In [394]:
# True where category 1 scores strictly higher than category 0 (ties go to category 0).
prediction = joint_prob_1 > joint_prob_0

In [395]:
prediction

MATERIAL_ID
0    False
1     True
2     True
3    False
4     True
dtype: bool