# Import nltk tools

In [224]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split


# Word processing

In [225]:
data = pd.read_csv('Matrix/f_matrix.csv')

* Name the index "MATERIAL_ID".
* The matrix must already have a unique ID for every row before it is imported here; that ID shows up as the column "Unnamed: 0".

In [226]:
data.head()

Unnamed: 0.1,Unnamed: 0,material,fabric
0,0,", lm792, shell fabric at hood, above chest, lo...",1
1,1,", sotl085-pu01, shell fabric at hood, above ch...",1
2,2,", lm792, shell fabric at mid chest panels, , , ,",1
3,3,", sotl085-pu01, shell fabric at mid chest pane...",1
4,4,", ym11-3527, stretch mesh , , , , back pocket bag",1


* Rename the column "Unnamed: 0" to "MATERIAL_ID"

In [227]:
data.rename( columns = {'Unnamed: 0': 'MATERIAL_ID'}, inplace = True)

In [228]:
data.head()

Unnamed: 0,MATERIAL_ID,material,fabric
0,0,", lm792, shell fabric at hood, above chest, lo...",1
1,1,", sotl085-pu01, shell fabric at hood, above ch...",1
2,2,", lm792, shell fabric at mid chest panels, , , ,",1
3,3,", sotl085-pu01, shell fabric at mid chest pane...",1
4,4,", ym11-3527, stretch mesh , , , , back pocket bag",1


* Set the "MATERIAL_ID" column (the original row ID) as the index

In [229]:
data.set_index('MATERIAL_ID', inplace = True)

In [230]:
data.tail()

Unnamed: 0_level_0,material,fabric
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
9456,", brushed tricot, , 100% polyester, , , hand p...",1
9457,", brushed tricot, , 100% polyester, , , chin g...",1
9458,", regular mesh, , 100% polyester, , , underarm...",1
9459,", regular mesh , , 100% polyester, , , goggle ...",1
9460,", stretch lycra, , content, , , sleeve gaiters",1


## Nltk Function

In [231]:
def nltk_convert(col_of_df):
    '''
    Tokenize one text value and keep only its meaningful words.

    Despite the parameter name, the function works on a single string (one
    cell of a DataFrame column); apply it to a whole column with
    ``Series.apply``.

    Parameters
    ----------
    col_of_df : str
        The text to tokenize.

    Returns
    -------
    list of str
        The lower-cased tokens, in their original order, that are purely
        alphabetic and are not English stop words. Tokens are not stemmed.
    '''
    stop_words = set(stopwords.words('english'))

    # word_tokenize only splits the text, it does not change the case.
    # Lower-case every token so that capitalised stop words ('The', 'And')
    # are matched by the lower-case stop word list.
    tokens = (token.lower() for token in word_tokenize(col_of_df))

    # isalpha() drops punctuation, numbers and mixed codes such as 'lm792'.
    return [token for token in tokens
            if token not in stop_words and token.isalpha()]

* Test the function, with row 0, in column 'material'

In [232]:
nltk_convert(data.at[0, 'material'])

['shell',
 'fabric',
 'hood',
 'chest',
 'lower',
 'front',
 'body',
 'back',
 'body',
 'sleeves',
 'facing',
 'fabric',
 'windflap',
 'hood',
 'shockcord',
 'tunnel',
 'self',
 'fabric',
 'seam',
 'binding',
 'needed',
 'shockcord',
 'tunnel']

## Applying to the matrix
* The variable "nested_list" will hold the result.
* At this point it is a pandas Series (one list of words per material).

In [233]:
%%time
nested_list = data.material.apply(nltk_convert)

CPU times: user 2.47 s, sys: 212 ms, total: 2.68 s
Wall time: 2.69 s


In [234]:
type(nested_list)

pandas.core.series.Series

## True and False categories
* Extract the index labels of the fabric materials and of the non-fabric (trim) materials

In [235]:
# MATERIAL_ID labels of the rows flagged as fabric (1) and as trim (0),
# selected by masking the index directly.
doc_ids_fabric = data.index[(data['fabric'] == 1).values]
doc_ids_trim = data.index[(data['fabric'] == 0).values]

In [236]:
doc_ids_trim

Int64Index([   6,    7,    8,    9,   10,   11,   12,   13,   14,   15,
            ...
            9394, 9395, 9396, 9397, 9398, 9399, 9400, 9401, 9402, 9403],
           dtype='int64', name='MATERIAL_ID', length=7411)

### Get materials back by index
* Extract the word lists of the 'material' column (held in nested_list) for the index labels in doc_ids_fabric

In [237]:
nested_list_fabric = nested_list.loc[doc_ids_fabric]

In [246]:
nested_list_fabric

MATERIAL_ID
0       [shell, fabric, hood, chest, lower, front, bod...
1       [shell, fabric, hood, chest, lower, front, bod...
2                     [shell, fabric, mid, chest, panels]
3                     [shell, fabric, mid, chest, panels]
4                      [stretch, mesh, back, pocket, bag]
                              ...                        
9456     [brushed, tricot, polyester, hand, pocket, bags]
9457            [brushed, tricot, polyester, chin, guard]
9458          [regular, mesh, polyester, underarm, vents]
9459           [regular, mesh, polyester, goggle, pocket]
9460           [stretch, lycra, content, sleeve, gaiters]
Name: material, Length: 2050, dtype: object

* by the index of doc_ids_trim

In [239]:
nested_list_trim = nested_list.loc[doc_ids_trim]

In [244]:
nested_list_trim.shape[0]

7411

## Number of Distinct words
* The fabric

In [241]:
flat_list_fabric =[item for sublist in nested_list_fabric for item in sublist]

In [242]:
fabric_words = pd.Series(flat_list_fabric).value_counts()

In [252]:
fabric_words[:10]

pocket       438
lining       407
body         402
polyester    373
fabric       305
shell        301
nylon        284
supplied     252
hood         234
main         226
dtype: int64

* The Trims

In [178]:
flat_list_trim = [item for sublist in nested_list_trim for item in sublist]

In [179]:
trim_words = pd.Series(flat_list_trim).value_counts()

In [180]:
trim_words.head()

zipper    1357
pocket    1261
label     1221
left      1032
front      969
dtype: int64

# Generate Vocabulary
## Top 10 words in Matrix (fabric and trim)

* This function is the same as nltk_convert except for the last step: instead of appending each word as it is, it appends the stem returned by stemmer.stem.

In [254]:
def stemmered_nltk_convert(col_of_df):
    '''
    Tokenize one text value and return the stems of its meaningful words.

    Despite the parameter name, the function works on a single string (one
    cell of a DataFrame column); apply it to a whole column with
    ``Series.apply``.

    Parameters
    ----------
    col_of_df : str
        The text to tokenize.

    Returns
    -------
    list of str
        The Snowball (English) stems, in their original order, of the tokens
        that are purely alphabetic and are not English stop words.
    '''
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    # word_tokenize only splits the text, it does not change the case.
    # Lower-case every token before the stop word test so that capitalised
    # stop words ('The', 'And') are filtered out as well.
    tokens = (token.lower() for token in word_tokenize(col_of_df))

    # isalpha() drops punctuation, numbers and mixed codes such as 'lm792'.
    return [stemmer.stem(token) for token in tokens
            if token not in stop_words and token.isalpha()]

In [256]:
%%time
stemmed_nested_list = data.material.apply(stemmered_nltk_convert)

CPU times: user 3.57 s, sys: 251 ms, total: 3.82 s
Wall time: 3.87 s


* Get flat list for all words

In [259]:
flat_stemmed_nested_list = [item for sublist in stemmed_nested_list for item in sublist]

* Turn it to a series

In [260]:
unique_words = pd.Series(flat_stemmed_nested_list).value_counts()

In [264]:
unique_words

pocket               2144
zipper               1638
label                1435
front                1151
left                 1053
                     ... 
luk                     1
dip                     1
defenderdiscoveri       1
winter                  1
type                    1
Length: 994, dtype: int64

In [265]:
unique_words.head()

pocket    2144
zipper    1638
label     1435
front     1151
left      1053
dtype: int64

* Set the size of the vocabulary
* The best size has to be found by experiment. In this case, following the e-mail processing lecture, I take about 10% of the distinct words (994 distinct stems, so 100 words).

In [373]:
VOCAB_SIZE = 100

In [374]:
frequent_words = unique_words[0:VOCAB_SIZE]

In [375]:
print('Most common words: \n', frequent_words[:10])

Most common words: 
 pocket    2144
zipper    1638
label     1435
front     1151
left      1053
back      1018
side       957
seam       950
hood       861
hand       756
dtype: int64


## Create Vocabulary with WORD_ID

In [376]:
# WORD_ID is simply the rank of the word: 0 is the most frequent stem.
word_ids = list(range(0, VOCAB_SIZE))
# One row per vocabulary word; frequent_words is indexed by the word itself,
# so its index supplies the VOCAB_WORD values.
# NOTE(review): this needs frequent_words to hold exactly VOCAB_SIZE words,
# otherwise the index length does not match - confirm for small data sets.
vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values}, index = word_ids)
vocab.index.name = 'WORD_ID'
vocab.head()

Unnamed: 0_level_0,VOCAB_WORD
WORD_ID,Unnamed: 1_level_1
0,pocket
1,zipper
2,label
3,front
4,left


* Save the vocabulary

In [377]:
vocab.to_csv('Matrix/garment_vocabulary.csv')

* Test whether any given word is in the vocabulary

In [378]:
'steve' in set(vocab.VOCAB_WORD)

False

## Find the material with the most words

In [379]:
clean_material_lengths = [len(sublist) for sublist in stemmed_nested_list]
print('Nr wrods in the longest material:', max(clean_material_lengths))

Nr wrods in the longest material: 27


In [380]:
print('Material position in the list', np.argmax(clean_material_lengths))

Material position in the list 645


In [381]:
# clean_material_lengths was built from stemmed_nested_list, so np.argmax
# gives a position in that Series. Use .iloc because the Series is labelled
# by MATERIAL_ID, not by position.
stemmed_nested_list.iloc[np.argmax(clean_material_lengths)]

['libolon',
 'shell',
 'pes',
 'tpu',
 'cm',
 'hood',
 'collar',
 'bodi',
 'bottom',
 'part',
 'sleev',
 'airvent',
 'flap',
 'face',
 'hood',
 'collar',
 'bodi',
 'sleev',
 'hem',
 'chest',
 'pocket',
 'storm',
 'flap',
 'cf',
 'zip',
 'snowskirt',
 'garag']

# Generate Features & Sparse Matrix
### Creating a DataFrame with one word per column

In [382]:
type(stemmed_nested_list)

pandas.core.series.Series

In [383]:
type(stemmed_nested_list.tolist())

list

In [384]:
# One row per material, one column per word position; shorter word lists are
# padded with missing values. Keep the MATERIAL_ID index of
# stemmed_nested_list so each row stays tied to its material: the labels
# (data.fabric) are looked up by this index later, which would silently
# mismatch with a fresh 0..n-1 index whenever the ids are not 0..n-1.
word_col_df = pd.DataFrame(stemmed_nested_list.tolist(),
                           index = stemmed_nested_list.index)
word_col_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,shell,fabric,hood,chest,lower,front,bodi,back,bodi,sleev,...,fabric,seam,bind,need,shockcord,tunnel,,,,
1,shell,fabric,hood,chest,lower,front,bodi,back,bodi,sleev,...,fabric,seam,bind,need,shockcord,tunnel,,,,
2,shell,fabric,mid,chest,panel,,,,,,...,,,,,,,,,,
3,shell,fabric,mid,chest,panel,,,,,,...,,,,,,,,,,
4,stretch,mesh,back,pocket,bag,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9456,brush,tricot,polyest,hand,pocket,bag,,,,,...,,,,,,,,,,
9457,brush,tricot,polyest,chin,guard,,,,,,...,,,,,,,,,,
9458,regular,mesh,polyest,underarm,vent,,,,,,...,,,,,,,,,,
9459,regular,mesh,polyest,goggl,pocket,,,,,,...,,,,,,,,,,


## Training Set and Test Set
* Must import the method "train_test_split()" from sklearn

In [385]:
# Fix the seed so that the split, and therefore the saved training/test files
# and the trained probabilities, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(word_col_df, data.fabric,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [386]:
print('Nr of traning samples', X_train.shape[0])
print('Fraction of traning set', X_train.shape[0] / word_col_df.shape[0])

Nr of traning samples 6622
Fraction of traning set 0.6999260120494663


In [387]:
X_train.index.name = X_test.index.name = 'MATERIAL_ID'
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7376,cm,x,hand,pocket,flap,,,,,,...,,,,,,,,,,
7786,leather,back,right,belt,loop,,,,,,...,,,,,,,,,,
168,care,content,label,interior,left,side,seam,,,,...,,,,,,,,,,
9231,x,velcro,round,corner,chin,,,,,,...,,,,,,,,,,
5563,solid,nylon,main,bodi,,,,,,,...,,,,,,,,,,


In [388]:
y_train.head()

MATERIAL_ID
7376    0
7786    1
168     0
9231    0
5563    1
Name: fabric, dtype: int64

## Sparse Matrix for the training data
* Get vocabulary turned as an Index:

In [389]:
word_index = pd.Index(vocab.VOCAB_WORD)
type(word_index[3])

str

In [390]:
y_train.shape[0]

6622

### Sparse Matrix Function

In [407]:
def make_sparse_matrix(df, indexed_words, labels):
    """
    Build a long-format ("sparse") table of vocabulary word occurrences.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document and one word per cell; the index holds the
        document ids. Cells that are not vocabulary words (including missing
        values) are ignored.
    indexed_words : pandas.Index
        The vocabulary; the position of a word in it is its WORD_ID.
    labels : pandas.Series
        Category of every document, indexed by document id.

    Returns
    -------
    pandas.DataFrame
        Columns LABEL, MATERIAL_ID, OCCURENCE and WORD_ID, with one row per
        vocabulary word found in ``df`` (OCCURENCE is always 1; repeated
        words give repeated rows). When nothing matches, the frame is empty
        but still has these four columns.
    """
    columns = ['LABEL', 'MATERIAL_ID', 'OCCURENCE', 'WORD_ID']
    nr_rows, nr_cols = df.shape
    word_set = set(indexed_words)  # set membership is O(1) per cell
    dict_list = []

    for i in range(nr_rows):
        # Vocabulary words of this document, in column order.
        row_words = [word for word in (df.iat[i, j] for j in range(nr_cols))
                     if word in word_set]
        if not row_words:
            continue

        # Id and label are the same for every word of the row: look them up once.
        doc_id = df.index[i]
        category = labels.at[doc_id]

        for word in row_words:
            dict_list.append({'LABEL': category, 'MATERIAL_ID': doc_id,
                              'OCCURENCE': 1,
                              'WORD_ID': indexed_words.get_loc(word)})

    # Naming the columns keeps the schema stable even when dict_list is empty.
    return pd.DataFrame(dict_list, columns = columns)

## Training data
### Applying to the training set

In [443]:
%%time
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

CPU times: user 1.34 s, sys: 21 ms, total: 1.36 s
Wall time: 1.67 s


In [444]:
sparse_train_df

Unnamed: 0,LABEL,MATERIAL_ID,OCCURENCE,WORD_ID
0,0,7376,1,38
1,0,7376,1,24
2,0,7376,1,9
3,0,7376,1,0
4,0,7376,1,41
...,...,...,...,...
29332,0,3246,1,28
29333,0,3246,1,26
29334,0,3246,1,14
29335,0,3653,1,12


In [410]:
sparse_train_df.shape

(29337, 4)

### Group by MATERIAL_ID, WORD_ID and LABEL

In [490]:
train_grouped = sparse_train_df.groupby(['MATERIAL_ID', 'WORD_ID', 'LABEL']).sum()
train_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURENCE
MATERIAL_ID,WORD_ID,LABEL,Unnamed: 3_level_1
0,3,1,1
0,5,1,1
0,7,1,1
0,8,1,2
0,17,1,1
...,...,...,...
9459,81,1,1
9459,84,1,1
9460,17,1,1
9460,47,1,1


* Reset the index so that MATERIAL_ID, WORD_ID and LABEL become regular columns and every row carries its own MATERIAL_ID

In [491]:
train_grouped = train_grouped.reset_index()

In [492]:
train_grouped

Unnamed: 0,MATERIAL_ID,WORD_ID,LABEL,OCCURENCE
0,0,3,1,1
1,0,5,1,1
2,0,7,1,1
3,0,8,1,2
4,0,17,1,1
...,...,...,...,...
27224,9459,81,1,1
27225,9459,84,1,1
27226,9460,17,1,1
27227,9460,47,1,1


### Save Training Data
* We save it as a plain-text file that contains only numbers: the IDs, the label and the occurrence counts, as shown above.

In [486]:
np.savetxt('Training Data/f_sparse_traning_data', train_grouped, fmt = '%d')

## Test Data
* It is created in basically the same way as the training data.

In [418]:
X_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3235,trim,puller,rubberref,dragon,time,hand,pocket,,,,...,,,,,,,,,,
2550,line,velveti,tricot,suppli,soli,insid,wiastband,inner,front,pocket,...,,,,,,,,,,
6791,seam,,,,,,,,,,...,,,,,,,,,,
8912,metal,eyelet,hem,adjust,,,,,,,...,,,,,,,,,,
582,hang,tag,head,tag,inlay,inlay,insid,main,,,...,,,,,,,,,,


In [417]:
y_test.head()

MATERIAL_ID
3235    0
2550    1
6791    0
8912    0
582     0
Name: fabric, dtype: int64

In [422]:
%%time
sparse_test_df = make_sparse_matrix(X_test, word_index, y_test)

CPU times: user 623 ms, sys: 10.5 ms, total: 634 ms
Wall time: 909 ms


In [426]:
test_grouped = sparse_test_df.groupby(['MATERIAL_ID', 'WORD_ID', 'LABEL']).sum().reset_index()
test_grouped.head()

Unnamed: 0,MATERIAL_ID,WORD_ID,LABEL,OCCURENCE
0,5,44,1,1
1,5,72,1,1
2,7,0,0,1
3,7,1,0,1
4,7,5,0,1


In [487]:
np.savetxt('Training Data/f_sparse_test_data', test_grouped, fmt = '%d')

# Full Matrix
* Create an empty DataFrame first

### Columns of Full Matrix
* The first column of the empty DataFrame is the document ID ('DOC_ID' in the lecture; in this case I use 'MATERIAL_ID')
* The second column is 'CATEGORY'
* The remaining columns are the WORD_IDs, numbered from 0 to the size of the vocabulary minus 1.

In [436]:
column_names = ['MATERIAL_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))

### Rows of Full Matrix
* The rows consist of the MATERIAL_IDs in the training data
* The NumPy function np.unique() returns each ID only once, even if it occurs many times in the training data.
* Import the training data we saved as a text file

In [473]:
# Must be the file written by the np.savetxt(..., train_grouped, ...) cell
# above; no cell in this notebook writes 'Training Data/f_traning_data'.
TRANING_DATA = 'Training Data/f_sparse_traning_data'

* Open it with NumPy

In [474]:
sparse_train_data = np.loadtxt(TRANING_DATA, delimiter = ' ', dtype=int)

In [475]:
index_names = np.unique(sparse_train_data[:, 0])

### Form the Data Frame
* Create an empty DataFrame with column_names as columns and index_names as index
* Fill every cell with the value 0

In [476]:
# Build the zero-filled frame in one step. Creating an empty (object dtype)
# frame and calling fillna(inplace=True) relies on pandas downcasting the
# result to integers, which newer pandas versions no longer do silently.
full_train_data = pd.DataFrame(0, index = index_names, columns = column_names)
full_train_data.shape

(6405, 102)

## Make a Full matrix function

In [478]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx = 0, word_idx = 1, cat_idx = 2, freq_idx = 3):
    """
    Expand a sparse (long-format) occurrence array into a full document x word table.

    Parameters
    ----------
    sparse_matrix : numpy.ndarray
        2-D integer array with one row per (document, word) pair.
    nr_words : int
        Size of the vocabulary; the word columns are named 0 .. nr_words - 1.
    doc_idx, word_idx, cat_idx, freq_idx : int
        Positions of the document id, word id, category and occurrence count
        columns in ``sparse_matrix``.

    Returns
    -------
    pandas.DataFrame
        Indexed by MATERIAL_ID (every document id found in ``sparse_matrix``),
        with a CATEGORY column followed by one occurrence-count column per
        word id. Words a document does not contain stay 0.
    """
    # Use the nr_words argument (not a notebook global) for the word columns.
    column_names = ['MATERIAL_ID'] + ['CATEGORY'] + list(range(0, nr_words))
    # Read the document ids from doc_idx so a non-default column layout works.
    doc_id_names = np.unique(sparse_matrix[:, doc_idx])
    # Create the frame already filled with integer zeros.
    full_matrix = pd.DataFrame(0, index = doc_id_names, columns = column_names)

    for i in range(sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]

        # The id and label are rewritten for every word of a document; the
        # values are identical each time, so this is harmless.
        full_matrix.at[doc_nr, 'MATERIAL_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurrence

    full_matrix.set_index('MATERIAL_ID', inplace = True)
    return full_matrix

In [479]:
%%time
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

CPU times: user 672 ms, sys: 21.8 ms, total: 694 ms
Wall time: 709 ms


In [480]:
full_train_data

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,1,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9456,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9457,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9458,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9459,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculating the probability of fabric

In [481]:
full_train_data.CATEGORY.size

6405

In [482]:
full_train_data.CATEGORY.sum()

1379

In [483]:
# Share of the rows of full_train_data whose CATEGORY is 1 (fabric); summing
# works as a count because CATEGORY only holds 0 and 1 here.
# NOTE(review): full_train_data only has rows for materials that occur in the
# sparse training file (6405 here versus 6622 training samples), so materials
# without any vocabulary word are left out of this estimate - confirm that
# this is intended.
prob_fabric = full_train_data.CATEGORY.sum() / full_train_data.CATEGORY.size
print('Probability of fabric is', prob_fabric)

Probability of fabric is 0.21530054644808744


# Count each material in train_data have how many tokens
## Full train Features
* This is simple: the full train features are the whole train_data DataFrame without the column 'CATEGORY'.
* Create a new DataFrame that does not contain the column 'CATEGORY'

In [484]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']

In [485]:
full_train_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,0,1,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Number of tokens each material has in train_data
* We added up all values in row direction, to get sum of each row 

In [521]:
material_train_data = full_train_features.sum(axis = 1)
material_train_data.shape # Show how many materials in train_data

(6405,)

In [520]:
material_train_data # Show how many words each material in train_data has

MATERIAL_ID
0       15
1       15
2        3
3        3
4        4
        ..
9456     5
9457     2
9458     4
9459     4
9460     3
Length: 6405, dtype: int64

* Total number of words in train_data

In [522]:
total_words_train_data = material_train_data.sum()
total_words_train_data

29337

#### Token of fabric in train_data
* Number of vocabulary-word occurrences in category 1 (fabric) in train_data
* Each such word occurrence is called a "token"

In [536]:
f_in_train_data = material_train_data[full_train_data.CATEGORY == 1]
f_in_train_data.shape
# 1379 materials in train_data is fabric

(1379,)

In [555]:
f_token_in_train_data = f_in_train_data.sum()
f_token_in_train_data
# The total words in Category 1, the fabric, in the train_data

5958

#### Token of trim in train_data
* Number of particular words of category 0, the trim, in train_data

In [527]:
t_in_train_data = material_train_data[full_train_data.CATEGORY == 0]
t_in_train_data.shape 
# Total, 5026 materials in train_data is trim

(5026,)

In [528]:
t_token_in_train_data = t_in_train_data.sum()
t_token_in_train_data 
# Total words in Category 0, the trim, in the train_data

23379

### Average number of words in fabric and trim
* In this case, CATEGORY 1 is fabric, 0 is trim

In [530]:
print('Average nr of words in fabric {:.0f}'.format(f_token_in_train_data / f_in_train_data.shape[0]))
print('Average nr of words in trim {:.0f}'.format(t_token_in_train_data / t_in_train_data.shape[0]))

Average nr of words in fabric 4
Average nr of words in trim 5


In [533]:
# Sanity check: every material in the training data must be counted either as
# fabric or as trim, so the result should equal 0.
material_train_data.shape[0] - f_in_train_data.shape[0] - t_in_train_data.shape[0]

0

# Count each token's frequency in both Category
## Full train Features of Fabric
* This is simple: the full train features of fabric are the rows of train_data with 'CATEGORY' 1, without the column 'CATEGORY'.

In [537]:
f_full_train_features = full_train_features.loc[full_train_data.CATEGORY == 1]

In [558]:
f_full_train_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,0,1,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9456,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9459,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Full train feature of trim

In [547]:
t_full_train_features = full_train_features.loc[full_train_data.CATEGORY == 0]

In [548]:
t_full_train_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,1,1,0,0,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,2,0,0,2,0,0
14,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9398,1,1,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9400,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9401,0,0,1,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9402,0,0,1,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Frequency of tokens in fabric in train_data
* We add up all values in the column direction to get the sum of each column.
* Each sum is the number of times that word occurs in category 1, the fabric.
* Some words in the vocabulary may never appear in this category. Their count would be 0, which gives a probability of 0 and causes problems later (a single 0 wipes out a product of probabilities, and dividing by 0 or taking log(0) raises an error). So we add 1 to the count of every token (Laplace smoothing).

In [551]:
summed_f_tokens_in_train_data = f_full_train_features.sum(axis = 0) + 1

In [552]:
summed_f_tokens_in_train_data

0     387
1      12
2       8
3     131
4      16
     ... 
95     16
96    116
97      1
98      2
99      2
Length: 100, dtype: int64

### Frequency of tokens in trim in train_data
* It is computed in the same way as for fabric

In [553]:
summed_t_tokens_in_train_data = t_full_train_features.sum(axis = 0) + 1

In [554]:
summed_t_tokens_in_train_data

0     1105
1     1123
2      957
3      686
4      710
      ... 
95      99
96       5
97     114
98     118
99     123
Length: 100, dtype: int64

# Probability of tokens
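* Why is the frequency of each token divided by the total number of tokens plus the size of the vocabulary, and not just by the total number of tokens? Because we added 1 to the count of every vocabulary word, the counts of a category now add up to (total tokens + VOCAB_SIZE). Dividing by that sum keeps the probabilities of each category adding up to 1.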

### Probability of tokens in fabric in train_data

In [559]:
prob_tokens_fabric = summed_f_tokens_in_train_data / (f_token_in_train_data + VOCAB_SIZE)

In [567]:
prob_tokens_fabric

0     0.063882
1     0.001981
2     0.001321
3     0.021624
4     0.002641
        ...   
95    0.002641
96    0.019148
97    0.000165
98    0.000330
99    0.000330
Length: 100, dtype: float64

### Probability of tokens in trim in train_data

In [561]:
prob_tokens_trim = summed_t_tokens_in_train_data / (t_token_in_train_data + VOCAB_SIZE)

In [566]:
prob_tokens_trim

0     0.047063
1     0.047830
2     0.040760
3     0.029218
4     0.030240
        ...   
95    0.004217
96    0.000213
97    0.004855
98    0.005026
99    0.005239
Length: 100, dtype: float64

* Check that the numbers are correct.
* The probabilities should sum to 1.

In [565]:
prob_tokens_trim.sum()

1.0

### Probability of tokens in train_data

In [568]:
prob_tokens_all = full_train_features.sum(axis = 0) / total_words_train_data

In [570]:
prob_tokens_all

0     0.050789
1     0.038620
2     0.032825
3     0.027781
4     0.024679
        ...   
95    0.003852
96    0.004056
97    0.003852
98    0.004022
99    0.004193
Length: 100, dtype: float64

In [571]:
prob_tokens_all.sum()

0.9999999999999999

# Save the trained Model

In [573]:
# Write the three per-token probability Series as plain-text files, one value
# per line in WORD_ID order (fabric, trim, and all training tokens).
# NOTE(review): the class prior prob_fabric is not written here - confirm that
# the notebook which loads these files recomputes or stores it elsewhere.
np.savetxt('Training Data/prob_tokens_f_in_train_data' ,prob_tokens_fabric)
np.savetxt('Training Data/prob_tokens_t_in_train_data', prob_tokens_trim)
np.savetxt('Training Data/prob_tokens_all_in_train_data', prob_tokens_all)