# Import nltk tools

In [3764]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Switch area
SUBJECT = 'label'
VOCAB_SIZE = 900
TEST_SIZE = 0.1 # proportion, the size of test_set

# Wrod processing

In [3765]:
data = pd.read_csv('result/Matrix/' + SUBJECT + '_matrix.csv')

* Name the index as "MATERIAL_IDs"
* The matrix must has it unrepeated Id before import to here, show as the "Unnamed: 0".

In [3766]:
data.head()

Unnamed: 0.1,Unnamed: 0,material,label
0,0,", , lm792, shell fabric at hood, above chest, ...",0
1,1,", , sotl085-pu01, shell fabric at hood, above ...",0
2,2,", , lm792, shell fabric at mid chest panels, ,...",0
3,3,", , sotl085-pu01, shell fabric at mid chest pa...",0
4,4,", , ym11-3527, stretch mesh , , , , back pocke...",0


* Change the col "Unnamed: 0" as "MATERIAL_ID"

In [3767]:
data.rename( columns = {'Unnamed: 0': 'MATERIAL_ID'}, inplace = True)

In [3768]:
data.head()

Unnamed: 0,MATERIAL_ID,material,label
0,0,", , lm792, shell fabric at hood, above chest, ...",0
1,1,", , sotl085-pu01, shell fabric at hood, above ...",0
2,2,", , lm792, shell fabric at mid chest panels, ,...",0
3,3,", , sotl085-pu01, shell fabric at mid chest pa...",0
4,4,", , ym11-3527, stretch mesh , , , , back pocke...",0


* Set the original index as Index

In [3769]:
data.set_index('MATERIAL_ID', inplace = True)

In [3770]:
data.tail()

Unnamed: 0_level_0,material,label
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
14583,", , Spring Toggle, Spring Toggles/2Hole, , , ,",0
14584,", , Retro Tape, Retro Tape/W:30mm, , , ,",0
14585,", , Velcro, Velcro-Hook, , , ,",0
14586,", , Velcro, Velcro-Loop, , , ,",0
14587,", , Seam Tape, , , , ,",0


## Nltk Function

In [3771]:
def nltk_convert(col_of_df):
    '''
    Parameter of this function is a column of a dataFrame.
    
    '''
    # difine Stop words
    stop_words = set(stopwords.words('english'))
    # Difine Stemmer
    stemmer = SnowballStemmer('english')
    
    # converts to lower case and splits up the words
    words = word_tokenize(col_of_df)
    filtered_words = []
    
    for word in words:
        # Removes the stop words and punctuation
        # if word is not in the stop_words list and is not a alpha.
        if word not in stop_words and word.isalpha():
#             filtered_words.append(stemmer.stem(word))
            filtered_words.append(word)
            
    return filtered_words

* Test the function, with row 0, in column 'material'

In [3772]:
nltk_convert(data.at[0, 'material'])

['shell',
 'fabric',
 'hood',
 'chest',
 'lower',
 'front',
 'body',
 'back',
 'body',
 'sleeves',
 'facing',
 'fabric',
 'windflap',
 'hood',
 'shockcord',
 'tunnel',
 'self',
 'fabric',
 'seam',
 'binding',
 'needed',
 'shockcord',
 'tunnel']

## Applying to the matrix
* the variable "nested_list" will hold the result.
* It is a series in this moment.

In [3773]:
%%time
nested_list = data.material.apply(nltk_convert)

CPU times: user 4.02 s, sys: 351 ms, total: 4.37 s
Wall time: 4.41 s


In [3774]:
type(nested_list)

pandas.core.series.Series

## True and False categories
* Exact the index of fabric materials, and non fabric materials

In [3775]:
material_ids_ctg_1 = data[data[SUBJECT] == 1].index
material_ids_ctg_0 = data[data[SUBJECT] ==0].index

In [3776]:
material_ids_ctg_1

Int64Index([   20,    21,    22,    41,    43,    44,    59,    61,    62,
               88,
            ...
            14414, 14418, 14425, 14433, 14435, 14438, 14447, 14452, 14460,
            14465],
           dtype='int64', name='MATERIAL_ID', length=1080)

### Get materials back by index
* Exact the strings in col 'material' by the index of doc_ids_fabric

In [3777]:
nested_list_ctg_1 = nested_list.loc[material_ids_ctg_1]

In [3778]:
nested_list_ctg_1

MATERIAL_ID
20       [care, content, labels, interior, left, side, ...
21       [care, content, labels, interior, left, side, ...
22       [po, label, po, bulk, interior, left, side, seam]
41                   [main, interior, label, cb, neckseam]
43               [care, content, labels, left, side, seam]
                               ...                        
14438    [SIZE, LABEL, SAMPLE, SIZE, L, SIZE, LABEL, XS...
14447    [SIZE, LABEL, SAMPLE, SIZE, L, SIZE, LABEL, XS...
14452    [SIZE, LABEL, SAMPLE, SIZE, L, SIZE, LABEL, XS...
14460    [SIZE, LABEL, SAMPLE, SIZE, L, SIZE, LABEL, XS...
14465    [SIZE, LABEL, SAMPLE, SIZE, L, SIZE, LABEL, XS...
Name: material, Length: 1080, dtype: object

* by the index of doc_ids_trim

In [3779]:
nested_list_ctg_0 = nested_list.loc[material_ids_ctg_0]

In [3780]:
nested_list_ctg_0.shape[0]

13508

## Number of Distinct words
* The Ctg_1

In [3781]:
flat_list_ctg_1 =[item for sublist in nested_list_ctg_1 for item in sublist]

In [3782]:
ctg_1_words = pd.Series(flat_list_ctg_1).value_counts()

In [3783]:
ctg_1_words.shape

(284,)

* The Ctg_0

In [3784]:
flat_list_ctg_0 = [item for sublist in nested_list_ctg_0 for item in sublist]

In [3785]:
ctg_0_words = pd.Series(flat_list_ctg_0).value_counts()

In [3786]:
ctg_0_words.head()

pocket       1789
NUMBER       1435
zipper       1348
LOCATION     1204
REFERENCE    1182
dtype: int64

# Generate Vocabulary
## Top 10 words in Matrix (ctg_1 and ctg_0)

* It is same as function the nltk_convert, the difference is in the end of the funciton, it not append word directly, but word converted by function stemmer.stem.

In [3787]:
def stemmered_nltk_convert(col_of_df):
    '''
    Parameter of this function is a column of a dataFrame.
    
    '''
    # difine Stop words
    stop_words = set(stopwords.words('english'))
    # Difine Stemmer
    stemmer = SnowballStemmer('english')
    
    # converts to lower case and splits up the words
    words = word_tokenize(col_of_df)
    filtered_words = []
    
    for word in words:
        # Removes the stop words and punctuation
        # if word is not in the stop_words list and is not a alpha.
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
            
    return filtered_words

In [3788]:
%%time
stemmed_nested_list = data.material.apply(stemmered_nltk_convert)

CPU times: user 5.12 s, sys: 322 ms, total: 5.44 s
Wall time: 5.46 s


* Get flat list for all words

In [3789]:
flat_stemmed_nested_list = [item for sublist in stemmed_nested_list for item in sublist]

* Turn it to a series

In [3790]:
unique_words = pd.Series(flat_stemmed_nested_list).value_counts()

In [3791]:
unique_words

pocket      2328
zipper      1920
label       1894
seam        1673
number      1576
            ... 
gulid          1
inlin          1
tepe           1
determin       1
sanfu          1
Length: 1265, dtype: int64

* Set the size of vacabulary
* This size need to be tested a lot, in this cace, follow the e-mail processing lecture, I takes 10 % of total particular words

In [3792]:
VOCAB_SIZE

900

In [3793]:
frequent_words = unique_words[0:VOCAB_SIZE]

In [3794]:
print('Most common words: \n', frequent_words[:10])

Most common words: 
 pocket    2328
zipper    1920
label     1894
seam      1673
number    1576
locat     1563
refer     1261
front     1250
back      1141
left      1109
dtype: int64


## Create Vocabulary with WORD_ID

In [3795]:
word_ids = list(range(0, VOCAB_SIZE))
vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values}, index = word_ids)
vocab.index.name = 'WORD_ID'
vocab.head()

Unnamed: 0_level_0,VOCAB_WORD
WORD_ID,Unnamed: 1_level_1
0,pocket
1,zipper
2,label
3,seam
4,number


* Save the vocabulary

In [3796]:
vocab.to_csv('result/Trained Data/Vocabulary/'+SUBJECT+'_vocabulary.csv')

* Test the any word to see if it is in the vocabulary

In [3797]:
'hangtag' in set(vocab.VOCAB_WORD)

True

## Search out the material with most token

In [3798]:
clean_material_lengths = [len(sublist) for sublist in stemmed_nested_list]
print('Nr wrods in the longest material:', max(clean_material_lengths))

Nr wrods in the longest material: 71


In [3799]:
print('Material position in the list', np.argmax(clean_material_lengths))

Material position in the list 10164


In [3800]:
stemmed_nested_list[np.argmax(clean_material_lengths)]

['local',
 'sourc',
 'item',
 'must',
 'compli',
 'with',
 'cpsia',
 'test',
 'lead',
 'third',
 'parti',
 'must',
 'contain',
 'less',
 'ppm',
 'lead',
 'test',
 'result',
 'must',
 'submit',
 'approv',
 'sierra',
 'design',
 'develop',
 'team',
 'must',
 'submit',
 'sierra',
 'design',
 'apparel',
 'develop',
 'team',
 'approv',
 'prior',
 'use',
 'trim',
 'card',
 'pleas',
 'submit',
 'complet',
 'trim',
 'card',
 'prior',
 'sms',
 'product',
 'approv',
 'item',
 'substitut',
 'sms',
 'must',
 'clear',
 'note',
 'trim',
 'card',
 'pleas',
 'submit',
 'complet',
 'trim',
 'card',
 'prior',
 'bulk',
 'product',
 'approv',
 'this',
 'trim',
 'card',
 'must',
 'repres',
 'actual',
 'bulk',
 'materi']

# Generate Features & Spare Matrix
### Creating a DataFrame with one word per column

In [3801]:
type(stemmed_nested_list)

pandas.core.series.Series

In [3802]:
stemmed_nested_list

MATERIAL_ID
0        [shell, fabric, hood, chest, lower, front, bod...
1        [shell, fabric, hood, chest, lower, front, bod...
2                       [shell, fabric, mid, chest, panel]
3                       [shell, fabric, mid, chest, panel]
4                       [stretch, mesh, back, pocket, bag]
                               ...                        
14583                              [spring, toggl, spring]
14584                                 [retro, tape, retro]
14585                                             [velcro]
14586                                             [velcro]
14587                                         [seam, tape]
Name: material, Length: 14588, dtype: object

In [3803]:
type(stemmed_nested_list.tolist())

list

In [3804]:
word_col_df = pd.DataFrame.from_records(stemmed_nested_list.tolist())
word_col_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,shell,fabric,hood,chest,lower,front,bodi,back,bodi,sleev,...,,,,,,,,,,
1,shell,fabric,hood,chest,lower,front,bodi,back,bodi,sleev,...,,,,,,,,,,
2,shell,fabric,mid,chest,panel,,,,,,...,,,,,,,,,,
3,shell,fabric,mid,chest,panel,,,,,,...,,,,,,,,,,
4,stretch,mesh,back,pocket,bag,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14583,spring,toggl,spring,,,,,,,,...,,,,,,,,,,
14584,retro,tape,retro,,,,,,,,...,,,,,,,,,,
14585,velcro,,,,,,,,,,...,,,,,,,,,,
14586,velcro,,,,,,,,,,...,,,,,,,,,,


## Traning Set and Test Set
* Must import the method "train_test_split()" from sklearn

In [3805]:
X_train, X_test, y_train, y_test = train_test_split(word_col_df, data[SUBJECT],
                                                    test_size = TEST_SIZE)

In [3806]:
print('Nr of traning samples', X_train.shape[0])
print('Fraction of traning set', X_train.shape[0] / word_col_df.shape[0])

Nr of traning samples 13129
Fraction of traning set 0.8999862901014533


In [3807]:
X_train.index.name = X_test.index.name = 'MATERIAL_ID'
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10483,airblast,style,number,,,,,,,,...,,,,,,,,,,
13609,trim,refer,number,locat,amount,,,,,,...,,,,,,,,,,
5956,thread,ref,number,,,,,,,,...,,,,,,,,,,
9652,thread,top,stitch,thread,from,coatsopti,,,,,...,,,,,,,,,,
10280,trim,refer,number,locat,,,,,,,...,,,,,,,,,,


In [3808]:
y_train.head()

MATERIAL_ID
10483    0
13609    0
5956     0
9652     0
10280    0
Name: label, dtype: int64

## Sparse Matrix for the traning data
* Get vocabulary turned as an Index:

In [3809]:
word_index = pd.Index(vocab.VOCAB_WORD)
word_index

Index(['pocket', 'zipper', 'label', 'seam', 'number', 'locat', 'refer',
       'front', 'back', 'left',
       ...
       'pig', 'version', 'text', 'newsiz', 'wiastband', 'printabl', 'info',
       'waydu', 'corey', 'neat'],
      dtype='object', name='VOCAB_WORD', length=900)

In [3810]:
y_train.shape[0]

13129

### Sparse Matrix Function

In [3811]:
def make_sparse_matrix(df, indexed_words, labels):
    """
    Returns a sparse matrix as dataframe
    """
    
    nr_rows = df.shape[0]
    nr_cols = df.shape[1]
    word_set = set(indexed_words)
    dict_list = []
    
    for i in range(nr_rows):
        for j in range(nr_cols):
            
            word = df.iat[i, j]
            if word in word_set:
                doc_id = df.index[i]
                word_id = indexed_words.get_loc(word)
                category = labels.at[doc_id]
                
                item = {'LABEL': category, 'MATERIAL_ID': doc_id,
                       'OCCURENCE': 1, 'WORD_ID': word_id}
                
                dict_list.append(item)
                
    return pd.DataFrame(dict_list)

## Training data
### Applying to the traning set

In [3812]:
%%time
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

CPU times: user 5.24 s, sys: 33.1 ms, total: 5.27 s
Wall time: 5.31 s


In [3813]:
sparse_train_df

Unnamed: 0,LABEL,MATERIAL_ID,OCCURENCE,WORD_ID
0,0,10483,1,317
1,0,10483,1,38
2,0,10483,1,4
3,0,13609,1,17
4,0,13609,1,6
...,...,...,...,...
80552,0,1489,1,169
80553,0,1489,1,133
80554,0,1489,1,20
80555,0,1489,1,7


In [3814]:
sparse_train_df.shape

(80557, 4)

### Group by the DOC_ID

In [3815]:
train_grouped = sparse_train_df.groupby(['MATERIAL_ID', 'WORD_ID', 'LABEL']).sum()
train_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURENCE
MATERIAL_ID,WORD_ID,LABEL,Unnamed: 3_level_1
0,3,0,1
0,7,0,1
0,8,0,1
0,10,0,2
0,14,0,3
...,...,...,...
14583,877,0,2
14585,42,0,1
14586,42,0,1
14587,3,0,1


* Distribute the DOC_ID to each row

In [3816]:
train_grouped = train_grouped.reset_index()

In [3817]:
train_grouped

Unnamed: 0,MATERIAL_ID,WORD_ID,LABEL,OCCURENCE
0,0,3,0,1
1,0,7,0,1
2,0,8,0,1
3,0,10,0,2
4,0,14,0,3
...,...,...,...,...
75437,14583,877,0,2
75438,14585,42,0,1
75439,14586,42,0,1
75440,14587,3,0,1


### Save Traning Data
* We save it as a txt file, with only numbers of the IDs, Occurances, label, as showed above. 

In [3818]:
np.savetxt('result/Trained Data/' + SUBJECT + '_sparse_traning_data', train_grouped, fmt = '%d')

## Test Data
* The way to create it is basically same way as treating traning data.

In [3819]:
X_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8100,item,brand,trim,,,,,,,,...,,,,,,,,,,
1913,shell,fabric,polyest,terri,jacquard,bond,fleec,provid,soli,main,...,,,,,,,,,,
471,bodi,fabric,use,main,fabric,bodi,,,,,...,,,,,,,,,,
8066,squar,rivet,black,alloy,left,hand,pocket,bottom,edg,,...,,,,,,,,,,
11176,fabric,supplier,developp,marjori,,,,,,,...,,,,,,,,,,


In [3820]:
y_test.head()

MATERIAL_ID
8100     0
1913     0
471      0
8066     0
11176    0
Name: label, dtype: int64

In [3821]:
%%time
sparse_test_df = make_sparse_matrix(X_test, word_index, y_test)

CPU times: user 666 ms, sys: 6.37 ms, total: 673 ms
Wall time: 684 ms


In [3822]:
test_grouped = sparse_test_df.groupby(['MATERIAL_ID', 'WORD_ID', 'LABEL']).sum().reset_index()
test_grouped.head()

Unnamed: 0,MATERIAL_ID,WORD_ID,LABEL,OCCURENCE
0,6,1,0,1
1,6,9,0,1
2,6,18,0,1
3,6,24,0,1
4,6,102,0,1


In [3823]:
np.savetxt('result/Trained Data/' + SUBJECT + '_sparse_test_data', test_grouped, fmt = '%d')

# Full Matrix
* Create anempty df first

### Columns of Full Matrix
* The empty df with column 'DOC_ID', in the case I use 'MATERIAL_ID'
* Second column is 'CATEGORY'
* The rest column is number from 0 to the number of words in vocabulary.

In [3824]:
column_names = ['MATERIAL_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))

### Rows of Full Matrix
* If consists of Material_ID in Training Data
* Numpy method np.unique() will take a number 1 time, even it occurs many time in the Traning Data.
* Import the traning data we saved as txt

In [3825]:
TRANING_DATA = 'result/Trained Data/' + SUBJECT + '_sparse_traning_data'

* Open it by Numpy

In [3826]:
sparse_train_data = np.loadtxt(TRANING_DATA, delimiter = ' ', dtype=int)

In [3827]:
index_names = np.unique(sparse_train_data[:, 0])

### Form the Data Frame
* Create an empty df, columned by column_names, indexed by index_names
* Fullfill all the cell with value "0"

In [None]:
full_train_data = pd.DataFrame(index= index_names, columns = column_names)
full_train_data.fillna(value = 0, inplace = True)
full_train_data.shape

## Make a Full matrix function

In [None]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx = 0, word_idx = 1, cat_idx = 2, freq_idx = 3):
    column_names = ['MATERIAL_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
    doc_id_names = np.unique(sparse_matrix[:,0])
    full_matrix = pd.DataFrame(index = doc_id_names, columns = column_names)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]
        
        full_matrix.at[doc_nr, 'MATERIAL_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurrence
        
    full_matrix.set_index('MATERIAL_ID', inplace = True)
    return full_matrix

In [None]:
%%time
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

In [None]:
full_train_data

# Probability of Category_1 in train_data

In [None]:
full_train_data.CATEGORY.size

In [None]:
full_train_data.CATEGORY.sum()

In [None]:
prob_ctg_1 = full_train_data.CATEGORY.sum() / full_train_data.CATEGORY.size
print('Probability  of ' + SUBJECT +' is', prob_ctg_1)

In [None]:
prob_ctg_1

In [None]:
prob_ctg_1_in_train_data = pd.DataFrame({'prob_ctg_1_train_set':[prob_ctg_1]})

In [None]:
prob_ctg_1_in_train_data.to_csv('result/Trained Data/Train_set/' + SUBJECT + '_prob_ctg_1_in_train_data')

# Count each material in train_data have how many tokens
## Full train Features
* It is very simple. The whole df of train_data without the column 'CATEGORY' is the Full train Features
* Create a new df with no column 'CATEGORY'

In [None]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']

In [None]:
full_train_features.head()

In [None]:
# full_train_features[1249].sum()

### Number of tokens each material has in train_data
* We added up all values in row direction, to get sum of each row 

In [None]:
material_train_data = full_train_features.sum(axis = 1) 
material_train_data.shape # Show how many materials in train_data

In [None]:
material_train_data # Show how many words each material in train_data has

* Total number of words in train_data

In [None]:
total_words_train_data = material_train_data.sum()
total_words_train_data

#### Token of ctg_1 in train_data
* Number of particular words of category 1, in train_data
* The word can be called "token"

In [None]:
ctg_1_in_train_data = material_train_data[full_train_data.CATEGORY == 1]
ctg_1_in_train_data.shape
# 1379 materials in train_data is fabric

In [None]:
ctg_1_token_in_train_data = ctg_1_in_train_data.sum()
ctg_1_token_in_train_data
# The total words in Category 1, the fabric, in the train_data

#### Token of trim in train_data
* Number of particular words of category 0, the trim, in train_data

In [None]:
ctg_0_in_train_data = material_train_data[full_train_data.CATEGORY == 0]
ctg_0_in_train_data.shape 
# Total, 5026 materials in train_data is trim

In [None]:
ctg_0_token_in_train_data = ctg_0_in_train_data.sum()
ctg_0_token_in_train_data 
# Total words in Category 0, the trim, in the train_data

### Average number of words in fabric and trim
* In this case, CATEGORY 1 is fabric, 0 is trim

In [None]:
print('Average nr of words in ctg_1 {:.0f}'.format(ctg_1_token_in_train_data / ctg_1_in_train_data.shape[0]))
print('Average nr of words in ctg_0 {:.0f}'.format(ctg_0_token_in_train_data / ctg_0_in_train_data.shape[0]))

In [None]:
# The snippet is for testing if the calculating get problem.
# The answer should eaquls to 0
material_train_data.shape[0] - ctg_1_in_train_data.shape[0] - ctg_0_in_train_data.shape[0]

# Count each token's frequency in both Category
## Full train Features of ctg_1
* It is very simple. The df of train_data in 'CATEGORY'1, and without the column 'CATEGORY' is the Full train Features of Fabric

In [None]:
ctg_1_full_train_features = full_train_features.loc[full_train_data.CATEGORY == 1]

In [None]:
ctg_1_full_train_features

## Full train feature of trim

In [None]:
ctg_0_full_train_features = full_train_features.loc[full_train_data.CATEGORY == 0]

In [None]:
ctg_0_full_train_features

### Frequency of tokens in fabric in train_data
* We add up all value in column direction to get sum of each column.
* The sum represents each word occurs how many time in the category 1, the fabric.
* It is possible that some words in vocabulary never appear in this category, so we need "+ 1" to this token. Since later we'll divide it to get the percentage, if you divide 0, it may cause error.

In [None]:
summed_ctg_1_tokens_in_train_data = ctg_1_full_train_features.sum(axis = 0) + 1

In [None]:
summed_ctg_1_tokens_in_train_data

### Frequency of tokens in trim in train_data
* The way treat it as treating with fabric

In [None]:
summed_ctg_0_tokens_in_train_data = ctg_0_full_train_features.sum(axis = 0) + 1

In [None]:
summed_ctg_0_tokens_in_train_data

# Probability of tokens
* Why here not only divided the frequency of each token by total tokens but also the size of vocabulary? It is becasue we add "+ 1" for each word when counting frequency, so add the size of vocabulary is for balancing the number back.

## P of Training Set 
### Probability of tokens in fabric in train_data
* ### P( Token | ctg_1 )
* Take the each summed up occurance divided by the total nubmer of token in category of fabric.

In [None]:
prob_tokens_ctg_1_train_set = summed_ctg_1_tokens_in_train_data / (ctg_1_token_in_train_data + VOCAB_SIZE)

In [None]:
ctg_1_token_in_train_data

In [None]:
prob_tokens_ctg_1_train_set

### Probability of tokens in trim in train_data
* ### P( Token | ctg_0 )

In [None]:
prob_tokens_ctg_0_train_set = summed_ctg_0_tokens_in_train_data / (ctg_0_token_in_train_data + VOCAB_SIZE)

In [None]:
prob_tokens_ctg_0_train_set

* Check if the number correct.
* The answer should be "1"

In [None]:
prob_tokens_ctg_0_train_set.sum()

* Test for the prob

In [None]:
check = prob_tokens_ctg_1_train_set[196] > prob_tokens_ctg_0_train_set[196]
check

### Probability of tokens in train_data
* ### P (Token)

In [None]:
prob_tokens_all_train_set = full_train_features.sum(axis = 0) / total_words_train_data

In [None]:
prob_tokens_all_train_set

In [None]:
prob_tokens_all_train_set.sum()

### Save the trained Model
* P( Token | ctg_1 )
* P( Token | ctg_0 )
* P ( Token )

In [None]:
np.savetxt('result/Trained Data/Train_set/' + SUBJECT + '_prob_tokens_ctg_1_in_train_data' ,prob_tokens_ctg_1_train_set)
np.savetxt('result/Trained Data/Train_set/' + SUBJECT + '_prob_tokens_ctg_0_in_train_data', prob_tokens_ctg_0_train_set)
np.savetxt('result/Trained Data/Train_set/' + SUBJECT + '_prob_tokens_all_in_train_data', prob_tokens_all_train_set)

## P of Test Set
### Probability of tokens in trim in train_data
* ### P( Token | ctg_0 )

* Start from Sparse Matrix of test_set
* Trun the Sparse Matrix to Full Matrix
* Get the number of word of material by summing each row of Full Matrix
* Get the frequency of each word by summing each column of Full Matrix
* Get all this result from test set : P ( Token | ctg_1 ), P (Token | ctg_0 ), P ( Token ) 
 

### Full matrix of test data
#### Import test data Sparse Matrix

In [None]:
TEST_DATA = 'result/Trained Data/' + SUBJECT + '_sparse_test_data'
sparse_test_data = np.loadtxt(TEST_DATA, delimiter = ' ', dtype = int )

#### Create a empty Full Matrix
* Column and row of Full Matrix

In [None]:
column_names = ['MATERIAL_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
index_names = np.unique(sparse_test_data[:, 0]) # 1 material_id only for 1 row, without duplicated

#### The custom function make_full_matrix
* empty dataFrame for Full matrix 
* Filled the values from sparse matrix by column match to WORD_ID, the row match to MATERIAL_ID

In [None]:
%%time
full_test_data = make_full_matrix(sparse_test_data, VOCAB_SIZE) 

In [None]:
full_test_data

### Probability in test data
#### Ctg_1

In [None]:
full_test_data.CATEGORY.size

In [None]:
full_test_data.CATEGORY.sum()

In [None]:
prob_ctg_1_test_data = full_test_data.CATEGORY.sum() / full_test_data.CATEGORY.size
print('Probability of ctg_1 in test_set', prob_ctg_1_test_data)

### Full Features of test_set

In [None]:
full_test_features = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']

In [None]:
full_test_features

#### Number of tokens each material has in test_data

In [None]:
material_test_data = full_test_features.sum(axis = 1)
material_test_data[:5]

* Total number of words in test_data

In [None]:
total_words_test_data = material_test_data.sum()
total_words_test_data

#### Token of ctg_1 in test_data

In [None]:
ctg_1_in_test_data = material_test_data[full_test_data.CATEGORY == 1]
ctg_1_in_test_data.shape

In [None]:
ctg_1_token_in_test_data = ctg_1_in_test_data.sum()
ctg_1_token_in_test_data

#### Token of ctg_0 in test_data

In [None]:
ctg_0_in_test_data = material_test_data[full_test_data.CATEGORY == 0]
ctg_0_in_test_data.shape

In [None]:
ctg_0_token_in_test_data = ctg_0_in_test_data.sum()
ctg_0_token_in_test_data

#### Average number of words in fabric and trim in test_data

In [None]:
print('Average nr of words in ctg_1 {:.0f}'.format(ctg_1_token_in_test_data / ctg_1_in_test_data.shape[0]))
print('Average nr of words in ctg_0 {:.0f}'.format(ctg_0_token_in_test_data / ctg_0_in_test_data.shape[0]))

### Frequency of tokens in test_data
* Full feature of ctg_1 in test_data

In [None]:
ctg_1_full_test_features = full_test_features.loc[full_test_data.CATEGORY == 1]
ctg_1_full_test_features

* Full feature of ctg_0 in test_data

In [None]:
ctg_0_full_test_features = full_test_features.loc[full_test_data.CATEGORY == 0]
ctg_0_full_test_features

#### Frequency of tokens in ctg_1 in test_data

In [None]:
summed_ctg_1_tokens_in_test_data = ctg_1_full_test_features.sum(axis = 0) +1 
summed_ctg_1_tokens_in_test_data

#### Frequency of tokens in ctg_0 in test_data

In [None]:
summed_ctg_0_tokens_in_test_data = ctg_0_full_test_features.sum(axis = 0) +1 
summed_ctg_0_tokens_in_test_data

## P of Test Set
* ### P( Token | ctg_1 )

In [None]:
prob_tokens_ctg_1_test_set = summed_ctg_1_tokens_in_test_data / (ctg_1_token_in_test_data + VOCAB_SIZE)

In [None]:
prob_tokens_ctg_1_test_set

* ### P( Token | ctg_0 )

In [None]:
prob_tokens_ctg_0_test_set = summed_ctg_0_tokens_in_test_data / (ctg_0_token_in_test_data + VOCAB_SIZE)

In [None]:
prob_tokens_ctg_0_test_set

* ### P( Token )

In [None]:
prob_tokens_all_test_set = full_test_features.sum(axis = 0) / total_words_test_data

In [None]:
prob_tokens_all_test_set

In [None]:
prob_tokens_all_test_set.sum()

### Save the test Model
* P( Token | ctg_1 )
* P( Token | ctg_0 )
* P ( Token )

In [None]:
np.savetxt('result/Trained Data/Test_set/' + SUBJECT + '_prob_tokens_ctg_1_in_test_data' ,prob_tokens_ctg_1_test_set)
np.savetxt('result/Trained Data/Test_set/' + SUBJECT + '_prob_tokens_ctg_0_in_test_data', prob_tokens_ctg_0_test_set)
np.savetxt('result/Trained Data/Test_set/' + SUBJECT + '_prob_tokens_all_in_test_data', prob_tokens_all_test_set)

# Joint probability
* Get the the full feature of test_set
* Get full target of test_set
* Take each cell in Full features of test_set times the probability of fabric in train_set

### Full feature of test_set

In [None]:
full_test_features = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']
full_test_features

### Full target of test_set

In [None]:
full_test_target = full_test_data.CATEGORY
full_test_target

### Probability of ctg_1 of Test_set
* The frequancy of tokens in test_set x The probability of tokens in train_set

In [None]:
prob_tokens_ctg_1_train_set

In [None]:
joint_pty = full_test_features.dot(prob_tokens_ctg_1_train_set)
joint_pty

# Prior
* Prior is the percentage of event that we get from the datas.
* If we want to count the fabric, Prior in this case is the probability of fabric.
* numbers of fabric / total material.

P( ctg_1 | documents )  = 
P( documentsl | ctg_1 ) * P( ctg_1) / P( document )


# Joint probability in log format
* Ctg_1 joint prob in log format

In [None]:
joint_log_ctg_1 = full_test_features.dot(np.log(prob_tokens_ctg_1_train_set) - np.log(prob_tokens_all_train_set)) + np.log(prob_ctg_1) 

In [None]:
joint_log_ctg_1[:5]

* Ctg_0 joint prob in log format

In [None]:
joint_log_ctg_0 = full_test_features.dot(np.log(prob_tokens_ctg_0_train_set) - np.log(prob_tokens_all_train_set)) + np.log(1 - prob_ctg_1) 

In [None]:
joint_log_ctg_0[:5]

## Simplify Joint probability in log format
* Only in the comparison situation can take away the snippet, in other condition, it will lead to a wrong result.

In [None]:
joint_log_ctg_1 = full_test_features.dot(np.log(prob_tokens_ctg_1_train_set)) + np.log(prob_ctg_1)
joint_log_ctg_0 = full_test_features.dot(np.log(prob_tokens_ctg_0_train_set)) + np.log(1-prob_ctg_1)

# Prediction

In [None]:
prediction = joint_log_ctg_1 > joint_log_ctg_0

* Check if the prediction got wrong.
* If the joint_log_fabric bigger than joint_log_trim, show true. times 1 is for the boolean tured into numbrer 0 or 1.
* If the material shows 1 means it is have more possibility as a fabric.
* Then we compare the category list of the test_set to see if the prediction is same as the category of test_set

In [None]:
joint_log_ctg_1.min()

In [None]:
prediction[-5:]*1

In [None]:
full_test_target[-5:]

### Check the accuracy of this model

In [None]:
correct_materials = (full_test_target == prediction).sum()
print('Does classified correctly', correct_materials)
num_materials_wrong = full_test_features.shape[0] - correct_materials
print('Does classfied incorrectly', num_materials_wrong)

### Accuracy %

In [None]:
correct_materials / len(full_test_features)

# Decision Boundary
## Visualizing the results

In [None]:
yaxis_label = 'P(X | ' + SUBJECT + ')'
xaxis_label = 'P(X | Others)'

linedata = np.linspace(start = -14000, stop = 1, num = 1000)

#Chart Styling
sns.set_style('whitegrid')
labels = 'Actual Category'

summary_df = pd.DataFrame({xaxis_label: joint_log_ctg_1, 
                           yaxis_label: joint_log_ctg_0, 
                           labels: full_test_target})

## The Decision Boundary
* Since we transfored both joint probability, fabric and trim, into log format, so the value will be minus.

In [None]:
sns.lmplot(x = xaxis_label, y = yaxis_label, 
           data =summary_df, height = 6.5, 
           fit_reg=False, legend=False, scatter_kws={'alpha': 0.3, 's': 25}, 
           hue = labels, markers = ['o', 'x'], palette = 'Set2' )

plt.xlim([joint_log_ctg_1.min() - 10, 1])
plt.ylim([joint_log_ctg_1.min() - 10, 1])

plt.plot(linedata, linedata, color = 'black')

plt.legend(('Decision Boundary', SUBJECT , 'Other material'), 
           loc='lower right', fontsize = 14)

plt.show()

# False Positives and False Negatives
* The function up.unique() shows the nubmer of value 0 is 2347, the value 1 is 650.

In [None]:
np.unique(prediction, return_counts = True)

### True positives
* It shows, the document is 1, the True, and model also predicts it as 1, True.
* In this case, the material is fabric in CATEGORY, and the prediction also say it is fabric.

In [None]:
true_pos = (y_test == 1)&(prediction == 1)

In [None]:
true_pos.sum()

### True negatives
* It shows, the document is 0, the False, and the model also predicts it as 0, False.
* In this case, the material is actually trim in CATEGORY, and the prediction also say it is trim.

In [None]:
true_neg = (y_test == 0)&(prediction ==0)

In [None]:
true_neg.sum()

### False positives
* It shows, the document is 0, the False, and model predicts it as 1, True.
* In this case, the material is trim in CATEGORY, and yet prediction says it is fabric.

In [None]:
false_pos = (y_test == 0)&(prediction ==1)

In [None]:
false_pos.sum()

### False negtives
* It shows, the document is 1, the True, and model predicts it as 0, False.
* In this case, the material is fabric in CATEGORY, and yet prediction says it is trim.

In [None]:
false_neg = (y_test == 1)&(prediction == 0)

In [None]:
false_neg.sum()

In [None]:
(y_test == 1).sum()

# Recall Score
* It tells how much the model will omit the document. More the value close to 1, less the model will omit to fit document.
* It tells the reliability of the model, in other words, how much the model resemble to the relevant documents.
* More the value close to 1, less the model will omit any document.

In [None]:
recall_score = true_pos.sum() / (y_test == 1).sum()
print('Recall score is {:.2%}'. format(recall_score))

# Precision Score
* It tells how precisely the model predicting.

In [None]:
precision_score = true_pos.sum() / (true_pos.sum() + false_pos.sum())
print('Precision score is {:.3}'.format(precision_score))

# F-Score or F1 Score
* Closer the value to 1, better the model is.

In [None]:
f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score)
print('F Score is {:.2}'.format(f1_score))