In [105]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Switch area
SUBJECT = 'material_garment'
VOCAB_SIZE = 1250

In [106]:
# Input files
BOM = 'DataSource/AB18MJ1_032_BOM.xlsx'
Matrix = f'Matrix/{SUBJECT}_matrix.csv'
VOCAB = f'DataSource/{SUBJECT}_vocabulary.csv'
# Per-category token data written under the 'Trainning Data' folder
TRAIN_DATA_0 = f'Trainning Data/{SUBJECT}_prob_tokens_ctg_0_in_train_data'
TRAIN_DATA_1 = f'Trainning Data/{SUBJECT}_prob_tokens_ctg_1_in_train_data'

In [107]:
TRAIN_DATA_0

'Trainning Data/material_garment_prob_tokens_ctg_0_in_train_data'

In [108]:
vocab = pd.read_csv(VOCAB, index_col = 0)

In [109]:
bom = pd.read_excel(BOM, index_col=None, header=None)

In [110]:
bom.shape

(67, 19)

* Delete the columns that contain no values

In [111]:
def delete_col(bom):
    """Return ``bom`` without its completely empty columns.

    A column is removed only when every cell in it is missing (NaN/None);
    columns holding at least one value are kept in their original order.
    The input frame itself is not modified.

    Parameters
    ----------
    bom : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the columns with at least one value.
    """
    # dropna(how='all') is the built-in equivalent of "count() == 0" and,
    # unlike a per-label loop, also copes with duplicated column labels.
    return bom.dropna(axis=1, how='all')

In [112]:
bom = delete_col(bom)

In [113]:
bom

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,AIRBLASTER,,,,,Style Number: AB18MJ1_032,,,,
1,,,,,,Designer: JG,,,,
2,,,,,,Season: Winter 2017/18,,,,
3,,,Insulation:,100gm in the body hood sleeves,,Collection: Freedom,,,,
4,Style Name:,,Seams:,Critically Taped,,Vendor: Soluna,,,,
...,...,...,...,...,...,...,...,...,...,...
62,3,ZIPPER DIRECTION:,"CF has right hand slider, pocket zippers close...",,,,,,,
63,4,BARTACK:,At stress points,,,,,,,
64,5,HEAT CUT:,"Elastic, webbing, taffeta",,,,,,,
65,6,GROMMETS/SNAPS:,Should have pellon backing,,,,,,,


## NLTK stemming function

In [114]:
def stemmered_nltk_convert(col_of_df):
    '''
    Tokenize one text value and return its filtered, stemmed tokens.

    Despite the parameter name, the function receives a single string
    (it is applied element-wise through ``Series.apply``).  Tokens that
    are English stop words or are not purely alphabetic are dropped; the
    remaining tokens are stemmed with the English Snowball stemmer.

    NOTE(review): the text is not lower-cased before the stop-word check,
    so capitalised stop words (e.g. "At") presumably survive the filter --
    confirm this matches how the vocabulary was built before changing it.
    '''
    english_stop_words = set(stopwords.words('english'))
    snowball = SnowballStemmer('english')

    return [
        snowball.stem(token)
        for token in word_tokenize(col_of_df)
        # keep only alphabetic tokens that are not stop words
        if token not in english_stop_words and token.isalpha()
    ]

## The classify function
* Check each row and return the row's index if it is classified as True

In [115]:
def classify_series(series, vectorizer=None, classifier=None):
    """Return the index labels of the rows classified as category 1.

    Parameters
    ----------
    series : pandas.Series
        One text entry per row.
    vectorizer : object, optional
        Object whose ``transform`` turns a list of texts into features.
        Defaults to the notebook-level ``vectorizer``.
    classifier : object, optional
        Object whose ``predict`` returns one label per sample.  Defaults
        to the notebook-level ``classifier``.

    Returns
    -------
    list
        Index labels, in row order, of every entry predicted as ``1``.
    """
    # Fall back to the module-level objects so existing one-argument calls
    # keep working.
    if vectorizer is None:
        vectorizer = globals()['vectorizer']
    if classifier is None:
        classifier = globals()['classifier']

    index_list = []
    # Iterate over (label, text) pairs.  Looking the label up by value
    # (series[series == text].index[0]) only ever finds the first match,
    # so rows with duplicated text would all be reported under one label.
    for label, text in series.items():
        features = vectorizer.transform([text])
        if classifier.predict(features)[0] == 1:
            index_list.append(label)

    return index_list

In [116]:
bom.index

RangeIndex(start=0, stop=67, step=1)

## Walk through bom
* Parse the BOM spreadsheet (`.xlsx`) and join the cells of each row into a single string, giving one entry per row.
* Put the col to the classify function
* Return the index of row that is True

In [117]:
def check_index(bom):
    '''
    Flatten every row of a DataFrame into one string.

    Each cell is converted with ``str`` and prefixed with ', ', so a row
    string starts with ', ' and missing cells appear as the text 'nan'.

    The parameter of the func is a dataFrame.
    Returns a Series with one string per row, numbered from 0.
    '''
    def _row_text(row_label):
        # Concatenate ', <cell>' for every column of the given row.
        return ''.join(', ' + str(bom.at[row_label, column]) for column in bom)

    return pd.Series([_row_text(row_label) for row_label in bom.index])

* Turn the BOM into a Series holding one comma-joined string per row

In [118]:
new_bom = check_index(bom)

In [119]:
new_bom

0     , AIRBLASTER, nan, nan, nan, nan, Style Number...
1     , nan, nan, nan, nan, nan, Designer:   JG, nan...
2     , nan, nan, nan, nan, nan, Season: Winter 2017...
3     , nan, nan, Insulation:, 100gm in the body hoo...
4     , Style Name:, nan, Seams:, Critically Taped, ...
                            ...                        
62    , 3, ZIPPER DIRECTION:, CF has right hand slid...
63    , 4, BARTACK:, At stress points, nan, nan, nan...
64    , 5, HEAT CUT:, Elastic, webbing, taffeta, nan...
65    , 6, GROMMETS/SNAPS:, Should have pellon backi...
66    , 7, THREAD COLOR:, Matches fabric color, unle...
Length: 67, dtype: object

In [120]:
type(new_bom)

pandas.core.series.Series

* Convert the series of bom into Stemmed bom

In [121]:
%%time
stemmed_bom = new_bom.apply(stemmered_nltk_convert)

CPU times: user 66.3 ms, sys: 6.29 ms, total: 72.6 ms
Wall time: 76.4 ms


In [122]:
stemmed_bom[54]

['yy', 'hangtag', 'tbd', 'tbd', 'tbd', 'tbd', 'tbd', 'tbd']

* Convert the Stemmed bom into df

In [123]:
word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())
word_col_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,airblast,,,,,style,number,,,,...,,,,,,,,,,
1,,,,,,design,jg,,,,...,,,,,,,,,,
2,,,,,,season,winter,,,,...,,,,,,,,,,
3,,,insul,bodi,hood,sleev,,collect,freedom,,...,,,,,,,,,,
4,style,name,,seam,critic,tape,,vendor,soluna,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,zipper,direct,cf,right,hand,slider,pocket,zipper,close,vent,...,down,,,,,,,,,
63,bartack,at,stress,point,,,,,,,...,,,,,,,,,,
64,heat,cut,elast,web,taffeta,,,,,,...,,,,,,,,,,
65,should,pellon,back,,,,,,,,...,,,,,,,,,,


### Sparse Matrix Function
* Create a sparse Matrix for the data we want to predict
* The difference from the function used in the Classification Model for the training data is that this function doesn't need CATEGORY.

In [124]:
def make_sparse_matrix(df, vocabulary):
    """
    Build the long ("sparse") word-occurrence table for a frame of tokens.

    Param1:
    The data we want to sparse, which must be in format of DataFrame:
    one document per row, one token per cell.  Cells that are not in the
    vocabulary (including padding such as None) are ignored.
    Param2:
    The vocabulary, a DataFrame with a VOCAB_WORD column.  A word's
    WORD_ID is its position in that column.

    Returns a DataFrame with the columns MATERIAL_ID (row label of ``df``),
    OCCURENCE (always 1) and WORD_ID, one row per vocabulary word found.
    The three columns are present even when no word matches.
    """
    # Map each word to its position once instead of calling Index.get_loc
    # per hit; setdefault keeps the first position if a word is duplicated
    # in the vocabulary (get_loc would return a slice/mask in that case).
    word_to_id = {}
    for position, vocab_word in enumerate(vocabulary.VOCAB_WORD):
        word_to_id.setdefault(vocab_word, position)

    nr_cols = df.shape[1]
    dict_list = []

    for i, doc_id in enumerate(df.index):
        for j in range(nr_cols):
            word_id = word_to_id.get(df.iat[i, j])
            if word_id is not None:
                dict_list.append({'MATERIAL_ID': doc_id,
                                  'OCCURENCE': 1, 'WORD_ID': word_id})

    # Explicit columns so an empty result still has the schema expected by
    # the later groupby on MATERIAL_ID / WORD_ID.
    return pd.DataFrame(dict_list,
                        columns=['MATERIAL_ID', 'OCCURENCE', 'WORD_ID'])

* Sparse the data we want to predict

In [125]:
%%time
sparse_predict_df = make_sparse_matrix(word_col_df, vocab)

CPU times: user 19.4 ms, sys: 794 µs, total: 20.2 ms
Wall time: 22.5 ms


In [126]:
sparse_predict_df

Unnamed: 0,MATERIAL_ID,OCCURENCE,WORD_ID
0,0,1,316
1,0,1,38
2,0,1,3
3,1,1,174
4,2,1,81
...,...,...,...
605,66,1,235
606,66,1,14
607,66,1,93
608,66,1,370


* Grouped by MATERIAL_ID

In [127]:
sparse_predict_df_grouped = sparse_predict_df.groupby(['MATERIAL_ID', 'WORD_ID']).sum()
sparse_predict_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,OCCURENCE
MATERIAL_ID,WORD_ID,Unnamed: 2_level_1
0,3,1
0,38,1
0,316,1
1,174,1
2,81,1
...,...,...
66,73,1
66,93,2
66,235,1
66,333,1


In [128]:
sparse_predict_df_grouped = sparse_predict_df_grouped.reset_index()
sparse_predict_df_grouped

Unnamed: 0,MATERIAL_ID,WORD_ID,OCCURENCE
0,0,3,1
1,0,38,1
2,0,316,1
3,1,174,1
4,2,81,1
...,...,...,...
465,66,73,1
466,66,93,2
467,66,235,1
468,66,333,1


In [129]:
np.savetxt('Trainning Data/' + SUBJECT + '_sparse_predict_data', sparse_predict_df_grouped, fmt = '%d')

In [130]:
PREDICT_DATA = 'Trainning Data/' + SUBJECT + '_sparse_predict_data'

In [131]:
# ndmin=2 keeps the (rows, columns) layout even when the file holds a single
# row; without it np.loadtxt returns a 1-D array and column slicing fails.
sparse_predict_data = np.loadtxt(PREDICT_DATA, delimiter = ' ', dtype = int, ndmin = 2)

## Full Matrix
* Since we only want to predict on this data, we create the full feature matrix directly.

In [132]:
def make_full_feature(sparse_matrix, nr_words, doc_idx = 0, word_idx = 1, freq_idx = 2):
    """
    Expand a sparse (doc, word, count) array into a dense count matrix.

    Parameters
    ----------
    sparse_matrix : array-like of int
        One row per (document, word) pair.  A single 1-D row is accepted.
    nr_words : int
        Vocabulary size; the result has the columns 0 .. nr_words - 1.
    doc_idx, word_idx, freq_idx : int
        Positions of the document id, word id and occurrence count inside
        each row of ``sparse_matrix``.

    Returns
    -------
    pandas.DataFrame
        Integer frame indexed by the sorted unique document ids (index name
        'MATERIAL_ID'), holding the occurrence count of every word and 0
        where a word does not occur.  If a (document, word) pair is listed
        more than once, the last count wins.

    Raises
    ------
    ValueError
        If a word id lies outside ``0 .. nr_words - 1``.
    """
    sparse_matrix = np.asarray(sparse_matrix)
    if sparse_matrix.ndim == 1:
        # A lone row (e.g. np.loadtxt on a one-line file) arrives as 1-D;
        # restore the row axis so column slicing works.
        if sparse_matrix.size:
            sparse_matrix = sparse_matrix.reshape(1, -1)
        else:
            sparse_matrix = sparse_matrix.reshape(0, max(doc_idx, word_idx, freq_idx) + 1)

    doc_ids = sparse_matrix[:, doc_idx]
    word_ids = sparse_matrix[:, word_idx].astype(np.int64)

    # Fail early with a clear message instead of silently producing a matrix
    # whose width no longer matches the vocabulary.
    if word_ids.size and (word_ids.min() < 0 or word_ids.max() >= nr_words):
        raise ValueError('word id outside the range 0..nr_words-1 '
                         '(nr_words = {})'.format(nr_words))

    doc_id_names = np.unique(doc_ids)
    counts = np.zeros((doc_id_names.size, nr_words), dtype=np.int64)

    # doc_id_names is sorted, so searchsorted yields each document's row.
    row_positions = np.searchsorted(doc_id_names, doc_ids)
    counts[row_positions, word_ids] = sparse_matrix[:, freq_idx]

    return pd.DataFrame(counts,
                        index=pd.Index(doc_id_names, name='MATERIAL_ID'),
                        columns=list(range(nr_words)))

In [133]:
%%time
predict_full_feature = make_full_feature(sparse_predict_data, VOCAB_SIZE)

CPU times: user 189 ms, sys: 3.73 ms, total: 192 ms
Wall time: 195 ms


In [134]:
predict_full_feature

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1240,1241,1242,1243,1244,1245,1246,1247,1248,1249
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Joint probability

In [135]:
train_data_1 = np.loadtxt(TRAIN_DATA_1)
train_data_0 = np.loadtxt(TRAIN_DATA_0)

In [136]:
joint_prob_1 = predict_full_feature.dot(train_data_1)
joint_prob_0 = predict_full_feature.dot(train_data_0)

In [137]:
prediction = joint_prob_1 > joint_prob_0

In [138]:
# Labels of the rows whose category-1 score is higher (boolean mask selection)
row_list = prediction[prediction].index

In [139]:
row_list

Int64Index([ 3, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 25, 26, 27, 28,
            29, 30, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 53, 54, 55, 56, 57, 62, 64, 65],
           dtype='int64', name='MATERIAL_ID')

In [140]:
bom.loc[row_list,:]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,,,Insulation:,100gm in the body hood sleeves,,Collection: Freedom,,,,
11,A,Shell Fabric,SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-L...,Main body,,VINTAGE BLACK,,BLACK,DARK NAVY,AB18MJ1_032
12,B,Shell Fabric,SOTD109-PR01-Y 100% Polyester 10K/8K 10K/8K +2...,Main body,,,DINOFLAGE,,,AB18MJ1_032
13,C,Shell Fabric,SOTP069-C1-H 100% Polyester 10K/8K 15k/10k+2-L...,"Sleeves, hood",,BLACK,BLACK,OXBLOOD,CAMEL,AB18MJ1_032
14,D,Lining,210T-E EMB07 Tery Emboss Taffeta(Supplied by ...,"Inside lining, powder skirt, pocket bag",,BLACK,BLACK,BLACK,BLACK,AB18MJ1_032
15,E,Other,Velvety Tricot - SOTO048-1 (supplied by Solis),"Comfort chin guard, inner front pocket bag panel",,BLACK,BLACK,BLACK,BLACK,AB18MJ1_032
16,F,Other,100% poly MESH - SOTC025-1 (supplied by Solis),"vents, internal goggle pocket",,BLACK,BLACK,BLACK,BLACK,AB18MJ1_032
17,G,Other,nylon/elastic lycra - SONO029-1 (supplied by S...,powder skirt,,BLACK,BLACK,BLACK,BLACK,AB18MJ1_032
19,I,Other,Lycra binding tape,Internal mesh pocket opening,,BLACK,BLACK,BLACK,BLACK,AB18MJ1_032
20,J,Other,Gripper elastic,powder skirt,,BLACK,BLACK,BLACK,BLACK,AB18MJ1_032
