In [191]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Switch area: notebook-wide settings to edit before running.
# SUBJECT is the prefix used to build the vocabulary / token-probability file names below.
SUBJECT = 'fabric'
# Number of word-id columns in the full feature matrix built by make_full_feature.
# NOTE(review): presumably has to equal the length of the token vectors loaded
# from TRAIN_DATA_0 / TRAIN_DATA_1 for the later dot product to work — confirm.
VOCAB_SIZE = 900

# Select Vocab and Train_data

In [319]:
# Input files for the selected SUBJECT: the vocabulary CSV and one token file
# per category (0 and 1).
VOCAB = ''.join(['DataSource/', SUBJECT, '_vocabulary.csv'])
TRAIN_DATA_0 = ''.join(['Trainning Data/', SUBJECT, '_prob_tokens_ctg_0_in_train_data'])
TRAIN_DATA_1 = ''.join(['Trainning Data/', SUBJECT, '_prob_tokens_ctg_1_in_train_data'])

In [320]:
TRAIN_DATA_0

'Trainning Data/fabric_prob_tokens_ctg_0_in_train_data'

In [321]:
# Load the vocabulary table; the first CSV column becomes the DataFrame index.
# make_sparse_matrix reads the VOCAB_WORD column of this frame.
vocab = pd.read_csv(VOCAB, index_col = 0)

## NLTK stemming function

In [322]:
def stemmered_nltk_convert(col_of_df):
    '''
    Tokenize one text value and return its filtered, stemmed tokens.

    Parameter:
    col_of_df -- a single text value taken from a DataFrame column; the
    function is meant to be applied element-wise (e.g. with Series.apply).

    Returns a list of Snowball-stemmed tokens. English stop words and every
    token that is not purely alphabetic are dropped. A value that is not a
    string (for example NaN from an empty cell) yields an empty list.
    '''
    # Empty cells arrive as float NaN / None, which word_tokenize cannot process.
    if not isinstance(col_of_df, str):
        return []

    # Define stop words
    stop_words = set(stopwords.words('english'))
    # Define stemmer
    stemmer = SnowballStemmer('english')

    filtered_words = []

    # Split the text into tokens; tokenization runs on the original text so
    # token boundaries are unchanged.
    for token in word_tokenize(col_of_df):
        # Lower-case before the stop-word test: the stop-word list is lower
        # case, so "The" or "By" would otherwise slip through the filter.
        word = token.lower()
        # Keep only tokens that are not stop words and are purely alphabetic
        # (this drops punctuation, numbers and mixed codes such as "10k/8k").
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))

    return filtered_words

## Sample input to check

In [569]:
# Sample description strings used to exercise the stemming / prediction steps below.
a = 'Gripper elastic, powder skirt'
b = 'SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-Layer +breathable coating+DWR (supplied by SOLIS)'
c = 'YKK No.8 Vislon 2a way  zipper.  Auto-lock slider DA8LH, Silders bottom to bottom   (Right hand pull left hand insert)'
d = '100gm Insulation'
words = [a, b, c, d]

In [572]:
words

['Gripper elastic, powder skirt',
 'SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-Layer +breathable coating+DWR (supplied by SOLIS)',
 'YKK No.8 Vislon 2a way  zipper.  Auto-lock slider DA8LH, Silders bottom to bottom   (Right hand pull left hand insert)',
 '100gm Insulation']

In [573]:
type(words[0])

str

In [574]:
words_series = pd.Series(words)

In [575]:
type(words_series)

pandas.core.series.Series

In [576]:
words_series

0                        Gripper elastic, powder skirt
1    SOTD109-C1-Y 100% Polyester 10K/8K 10K/8K +2-L...
2    YKK No.8 Vislon 2a way  zipper.  Auto-lock sli...
3                                     100gm Insulation
dtype: object

* Convert the Series of BOM descriptions into stemmed token lists

In [577]:
%%time
stemmed_bom = words_series.apply(stemmered_nltk_convert)

CPU times: user 4.12 ms, sys: 273 µs, total: 4.4 ms
Wall time: 4.45 ms


In [578]:
stemmed_bom

0                      [gripper, elast, powder, skirt]
1                              [polyest, suppli, soli]
2    [ykk, vislon, way, zipper, slider, silder, bot...
3                                              [insul]
dtype: object

* Convert the stemmed BOM into a DataFrame (one token per column)

In [579]:
# word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())
# word_col_df

In [580]:
# Hand from_records a plain list of token lists instead of the Series itself,
# so the result does not depend on the Series' index labels (e.g. a filtered
# Series whose index no longer starts at 0). Rows are numbered 0..n-1.
word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())
word_col_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,gripper,elast,powder,skirt,,,,,,,,,,
1,polyest,suppli,soli,,,,,,,,,,,
2,ykk,vislon,way,zipper,slider,silder,bottom,bottom,right,hand,pull,left,hand,insert
3,insul,,,,,,,,,,,,,


### Sparse Matrix Function
* Create a sparse Matrix for the data we want to predict
* The difference between this function and the one in the Classification Model notebook for the training data is that this function doesn't need CATEGORY.

In [554]:
def make_sparse_matrix(df, vocabulary):
    """
    Build a sparse (long-format) word-occurrence table for the data to predict.

    Param1:
    df -- DataFrame with one document per row and one token per cell
    (cells holding None/NaN or tokens outside the vocabulary are skipped).
    Param2:
    vocabulary -- DataFrame with a VOCAB_WORD column; a word's position in
    that column is used as its WORD_ID.

    Returns a DataFrame with the columns MATERIAL_ID (the row label of the
    document in df), OCCURENCE (always 1, one row per token hit) and WORD_ID.
    The three columns are present even when no token matches the vocabulary,
    so a following groupby on MATERIAL_ID / WORD_ID still works.
    """
    column_names = ['MATERIAL_ID', 'OCCURENCE', 'WORD_ID']

    indexed_words = pd.Index(vocabulary.VOCAB_WORD)
    # Set membership is the cheap pre-check; get_loc is only called on hits.
    word_set = set(indexed_words)
    nr_rows, nr_cols = df.shape
    dict_list = []

    for i in range(nr_rows):
        # The document id depends on the row only, so look it up once per row.
        doc_id = df.index[i]
        for j in range(nr_cols):
            word = df.iat[i, j]
            if word in word_set:
                dict_list.append({'MATERIAL_ID': doc_id,
                                  'OCCURENCE': 1,
                                  'WORD_ID': indexed_words.get_loc(word)})

    # Passing the column names explicitly keeps the schema when dict_list is empty.
    return pd.DataFrame(dict_list, columns=column_names)

* Build the sparse matrix for the data we want to predict

In [555]:
%%time
sparse_predict_df = make_sparse_matrix(word_col_df, vocab)

CPU times: user 2.32 ms, sys: 76 µs, total: 2.39 ms
Wall time: 2.35 ms


In [556]:
sparse_predict_df

Unnamed: 0,MATERIAL_ID,OCCURENCE,WORD_ID
0,0,1,131
1,0,1,35
2,0,1,43
3,0,1,40
4,1,1,36
5,1,1,26
6,1,1,86
7,2,1,24
8,2,1,129
9,2,1,159


* Grouped by MATERIAL_ID

In [557]:
sparse_predict_df_grouped = sparse_predict_df.groupby(['MATERIAL_ID', 'WORD_ID']).sum()
sparse_predict_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,OCCURENCE
MATERIAL_ID,WORD_ID,Unnamed: 2_level_1
0,35,1
0,40,1
0,43,1
0,131,1
1,26,1
1,36,1
1,86,1
2,1,1
2,4,1
2,10,2


In [558]:
# Recompute from sparse_predict_df instead of re-assigning the grouped frame to
# itself: re-running this cell then cannot insert an extra 'index' column, which
# would shift the MATERIAL_ID / WORD_ID / OCCURENCE layout written to disk.
sparse_predict_df_grouped = (
    sparse_predict_df
    .groupby(['MATERIAL_ID', 'WORD_ID'])
    .sum()
    .reset_index()
)
sparse_predict_df_grouped

Unnamed: 0,MATERIAL_ID,WORD_ID,OCCURENCE
0,0,35,1
1,0,40,1
2,0,43,1
3,0,131,1
4,1,26,1
5,1,36,1
6,1,86,1
7,2,1,1
8,2,4,1
9,2,10,2


In [559]:
# Write the grouped rows to a text file as integers ('%d'), one row per line,
# in the column order of sparse_predict_df_grouped.
np.savetxt('Trainning Data/' + SUBJECT + '_sparse_predict_data', sparse_predict_df_grouped, fmt = '%d')

In [560]:
PREDICT_DATA = 'Trainning Data/' + SUBJECT + '_sparse_predict_data'

In [561]:
# ndmin = 2 keeps the result two-dimensional even when the file holds a single
# line; a one-row file would otherwise load as a 1-D array that cannot be
# indexed by column.
sparse_predict_data = np.loadtxt(PREDICT_DATA, delimiter = ' ', dtype = int, ndmin = 2)

## Full Matrix
* Since we only want to predict on this data, we build the full feature matrix directly.

In [562]:
def make_full_feature(sparse_matrix, nr_words, doc_idx = 0, word_idx = 1, freq_idx = 2):
    """
    Expand sparse (document, word, occurrence) rows into a dense count matrix.

    Param1:
    sparse_matrix -- numpy array with one row per (document, word) pair. A
    single 1-D row is accepted and treated as a one-row matrix.
    Param2:
    nr_words -- number of word-id columns to create (columns 0 .. nr_words - 1).
    Param3-5:
    doc_idx, word_idx, freq_idx -- positions of the document id, the word id
    and the occurrence count inside each row of sparse_matrix.

    Returns a DataFrame indexed by MATERIAL_ID (the sorted unique document
    ids) with one integer column per word id; cells without an entry are 0.
    """
    # np.loadtxt returns a 1-D array for a one-line file; normalise to 2-D.
    sparse = np.atleast_2d(np.asarray(sparse_matrix))
    if sparse.size == 0:
        # No rows at all: keep a well-formed (0, k) shape so column slicing works.
        sparse = sparse.reshape(0, max(doc_idx, word_idx, freq_idx) + 1)

    doc_id_names = np.unique(sparse[:, doc_idx])

    # Start from an integer zero matrix instead of filling an all-NaN object frame.
    full_matrix = pd.DataFrame(
        np.zeros((len(doc_id_names), nr_words), dtype=np.int64),
        index=pd.Index(doc_id_names, name='MATERIAL_ID'),
        columns=list(range(nr_words)),
    )

    for row in sparse:
        full_matrix.at[row[doc_idx], row[word_idx]] = row[freq_idx]

    return full_matrix

In [563]:
%%time
predict_full_feature = make_full_feature(sparse_predict_data, VOCAB_SIZE)

CPU times: user 130 ms, sys: 2.07 ms, total: 132 ms
Wall time: 133 ms


In [564]:
predict_full_feature

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,890,891,892,893,894,895,896,897,898,899
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Joint probability

In [565]:
train_data_1 = np.loadtxt(TRAIN_DATA_1)
train_data_0 = np.loadtxt(TRAIN_DATA_0)

In [566]:
# One score per document and category: each row of word counts is dotted with
# the per-token values loaded for that category.
# NOTE(review): cannot tell from here whether the loaded values are raw or log
# probabilities, or whether a class prior should be added — confirm against the
# notebook that wrote the TRAIN_DATA files.
joint_prob_1 = predict_full_feature.dot(train_data_1)
joint_prob_0 = predict_full_feature.dot(train_data_0)

In [567]:
# True where the category-1 score is strictly greater than the category-0 score.
prediction = joint_prob_1 > joint_prob_0

In [568]:
prediction

MATERIAL_ID
0    False
1     True
2    False
3     True
dtype: bool