# Test of the code copied from 020 and 040 into the project "G-Pro_M-List generator API"


## code

In [1]:

import os

import numpy as np
import pandas as pd

# NLTK
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Switch area
# Name of the trained-data set; it is spliced into the default file paths of
# material_list_generator().
SUBJECT = 'material_garment'
# Vocabulary size (number of tokens).
# NOTE(review): presumably has to match the size used when the trained data
# was produced - confirm.
VOCAB_SIZE = 900
# Column categories; used as the default value of LIST in
# fill_out_col_category().
COL_LIST =['item', 'description', 'color_way', 'position', 'spec', 'fabric']

## delete_col()
### Delete the columns of a table that contain no values

In [2]:
def delete_col(m_list):
    """
    Drop every column of a table that holds no values at all.

    Param1:
    m_list: DataFrame, the table to clean. It is not modified.

    Returns a new DataFrame that keeps, in their original order, only the
    columns containing at least one non-null cell.
    """
    # count() gives the number of non-null cells per column. Selecting with a
    # positional boolean mask (instead of testing and dropping label by label)
    # also works when several columns share the same label.
    has_values = (m_list.count() > 0).to_numpy()
    return m_list.loc[:, has_values]

## stemmered_nltk_convert()
### NLTK stemming function

In [3]:
def stemmered_nltk_convert(col_of_df):
    '''
    Tokenize one piece of text and return its stemmed tokens.

    col_of_df: the text to process. It is handed straight to word_tokenize;
    in this file the function is applied element by element to a Series.

    Returns a list holding the Snowball (English) stem of every token that
    is not an English stop word and is purely alphabetic, in original order.
    '''
    english_stops = set(stopwords.words('english'))
    snowball = SnowballStemmer('english')

    # Keep the alphabetic tokens that are not stop words, and stem them.
    # NOTE(review): tokens are not lower-cased before the stop-word test, so
    # a capitalised stop word presumably passes the filter - confirm this
    # matches how the training data was tokenized.
    return [
        snowball.stem(token)
        for token in word_tokenize(col_of_df)
        if token not in english_stops and token.isalpha()
    ]


## turn_series()
### Walk through the BOM
### Copied from 040_21 - Column-Classify
* Parse an xlsm BOM and join the cells of each column into one string; the strings together form a single column (a Series).
* Pass each string to the classify function.
* Return the index of the columns classified as True.

In [4]:
def turn_series(bom):
    '''
    Collapse every column of a table into a single string.

    bom: DataFrame,
    the table to walk through.

    Returns a Series with one element per column of bom, in column order and
    with a default 0..n-1 index. Each element is the text of all cells of
    that column, every cell preceded by ', ' (so the string starts with
    ', ').
    '''
    # Columns are read by position so that repeated row or column labels
    # cannot confuse the lookup, and each string is assembled with a single
    # join instead of growing it cell by cell.
    database = [
        ''.join(', ' + str(cell) for cell in bom.iloc[:, position])
        for position in range(bom.shape[1])
    ]

    return pd.Series(database)

## make_sparse_matrix()
### Sparse Matrix Function
* Create a sparse matrix for the data we want to predict.
* Compared with the version used by the Classification Model on the training data, this function doesn't need CATEGORY.

In [5]:
def make_sparse_matrix(df, vocabulary):
    """
    Build the sparse token matrix for the documents we want to predict.

    Param1:
    df: DataFrame with one document per row and one token per cell. Cells
    whose value is not in the vocabulary are skipped.
    Param2:
    vocabulary: the vocabulary generated during training; its VOCAB_WORD
    column lists the known tokens.

    Returns a DataFrame with one row per vocabulary token found and the
    columns MATERIAL_ID (index label of the document's row in df),
    OCCURENCE (always 1) and WORD_ID (position of the token in VOCAB_WORD).
    The columns are present even when no token is found, so the result can
    always be grouped by them.
    """
    columns = ['MATERIAL_ID', 'OCCURENCE', 'WORD_ID']

    # Map every token to its position in the vocabulary once, so each cell
    # costs a single dictionary lookup. If a token were listed twice, its
    # first position is kept.
    word_ids = {}
    for position, vocab_word in enumerate(vocabulary.VOCAB_WORD):
        word_ids.setdefault(vocab_word, position)

    dict_list = []
    for doc_id, cells in zip(df.index, df.itertuples(index=False, name=None)):
        for word in cells:
            if word in word_ids:
                dict_list.append({'MATERIAL_ID': doc_id,
                                  'OCCURENCE': 1,
                                  'WORD_ID': word_ids[word]})

    # Passing the column names explicitly keeps the schema stable for an
    # empty result.
    return pd.DataFrame(dict_list, columns=columns)

# make_full_feature()
## Full Matrix
* Since we only want to predict the data, we create the full feature matrix directly.

In [6]:
def make_full_feature(sparse_matrix, nr_words, doc_idx=0, word_idx=1, freq_idx=2):
    """
    Expand a sparse (document, token, count) array into a full matrix.

    Param1:
    sparse_matrix: 2-D array, one row per (document, token) pair.
    Param2:
    nr_words: number of token columns of the result (columns 0..nr_words-1).
    Param3-5:
    doc_idx, word_idx, freq_idx: positions, inside a sparse row, of the
    document id, the token id and the occurrence count.

    Returns a DataFrame indexed by the sorted unique document ids (index
    name 'MATERIAL_ID'), holding the occurrence count of every token and 0
    where a token does not occur. An empty input gives a frame without rows.
    """
    sparse_matrix = np.asarray(sparse_matrix)

    # An empty array may have no columns to slice, so handle it apart.
    if sparse_matrix.size == 0:
        doc_id_names = []
    else:
        doc_id_names = np.unique(sparse_matrix[:, doc_idx])

    # Start from an all-zero integer table; the width follows nr_words, the
    # size of the vocabulary the caller loaded.
    full_matrix = pd.DataFrame(
        np.zeros((len(doc_id_names), nr_words), dtype=np.int64),
        index=pd.Index(doc_id_names, name='MATERIAL_ID'),
        columns=list(range(nr_words)))

    for sparse_row in sparse_matrix:
        full_matrix.at[sparse_row[doc_idx], sparse_row[word_idx]] = sparse_row[freq_idx]

    return full_matrix

# Main Function -----------------------------------------------------
* The functions in this area are not root functions: each of them uses other functions as part of its work.

# M-List_generator
* From 040_21 - Column_Classify
* Pick the material rows out of an xlsx BOM and form a DataFrame from them.

In [7]:
def material_list_generator(JSONbom,
                            VOCAB='DataSource/Trained Data/' + SUBJECT + '_vocabulary.csv',
                            TRAIN_DATA_1='DataSource/Trained Data/' +
                            SUBJECT + '_prob_tokens_ctg_1_in_train_data',
                            TRAIN_DATA_0='DataSource/Trained Data/' +
                            SUBJECT + '_prob_tokens_ctg_0_in_train_data',
                            TRAIN_DATA_ALL='DataSource/Trained Data/' +
                            SUBJECT + '_prob_tokens_all_in_train_data',
                            PROB_1_TRAIN_DATA='DataSource/Trained Data/' + SUBJECT + '_prob_ctg_1_in_train_data'):
    '''
    Select the entries of a BOM that the trained model classifies as True,
    save them to 'result/classified_M-List.csv' and return them.

    Param_1
    JSONbom: the BOM in JSON format, anything pandas.read_json accepts
    (this notebook passes a file path).

    Param_2
    VOCAB: the path of the vocabulary (csv),
    the token list with WORD_ID.

    Param_3
    TRAIN_DATA_1: the path of the trained data of category True,
    the probability of each token in category True.

    Param_4
    TRAIN_DATA_0: the path of the trained data of category False,
    the probability of each token in category False.

    Param_5
    TRAIN_DATA_ALL: the path of the trained data of both categories,
    the probability of each token in all documents.

    Param_6
    PROB_1_TRAIN_DATA: the path of a csv whose cell
    [0, 'prob_ctg_1_train_set'] holds the share of documents in category
    True (number of documents in category True / number of all documents).

    Returns
    A DataFrame with the selected rows of the BOM. It has no rows when no
    token of the BOM is in the vocabulary.
    '''
    # read the vocabulary
    vocab = pd.read_csv(VOCAB, index_col=0)
    # read the trained data
    train_data_1 = np.loadtxt(TRAIN_DATA_1)
    train_data_0 = np.loadtxt(TRAIN_DATA_0)
    train_data_all = np.loadtxt(TRAIN_DATA_ALL)
    prob_ctg_1 = pd.read_csv(PROB_1_TRAIN_DATA, index_col=0)
    prob_ctg_1_train_data = prob_ctg_1.loc[0, 'prob_ctg_1_train_set']

    # Read JSON
    bom = pd.read_json(JSONbom)

    # Delete the columns without any value
    col_deleted_bom = delete_col(bom)

    # Join the cells of each column into one string, one string per column.
    # NOTE(review): the documents built here are the COLUMNS of the BOM, yet
    # the predicted ids are used below as ROW labels of the BOM. This looks
    # unintended for a row classifier - confirm against the original
    # notebook before relying on the result.
    new_bom = turn_series(col_deleted_bom)

    # Tokenize and stem every document
    stemmed_bom = new_bom.apply(stemmered_nltk_convert)

    # Convert the stemmed series into a DataFrame, one token per cell
    word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())

    # Sparse matrix of the data we want to predict (no CATEGORY needed)
    sparse_predict_df = make_sparse_matrix(word_col_df, vocab)

    if sparse_predict_df.empty:
        # No token of the BOM is in the vocabulary: nothing can be
        # classified, so the selection is empty.
        material_list = bom.iloc[0:0]
    else:
        # Sum the occurrences per (MATERIAL_ID, WORD_ID) pair and convert
        # the result into a numpy array
        sparse_predict_df_grouped = sparse_predict_df.groupby(
            ['MATERIAL_ID', 'WORD_ID']).sum().reset_index()
        sparse_predict_data = sparse_predict_df_grouped.to_numpy()

        # Full matrix
        predict_full_feature = make_full_feature(
            sparse_predict_data, vocab.shape[0])

        # Joint probability in log format
        log_all = np.log(train_data_all)
        joint_log_ctg_1 = predict_full_feature.dot(
            np.log(train_data_1) - log_all) + np.log(prob_ctg_1_train_data)
        joint_log_ctg_0 = predict_full_feature.dot(
            np.log(train_data_0) - log_all) + np.log(1 - prob_ctg_1_train_data)
        # Prediction
        prediction_log = joint_log_ctg_1 > joint_log_ctg_0

        # Ids predicted as True
        row_list = prediction_log[prediction_log].index

        # Get the material from the original bom by the ids in row_list
        material_list = bom.loc[row_list, :]

    # Make sure the output folder exists before writing into it
    os.makedirs('result', exist_ok=True)
    material_list.to_csv('result/classified_M-List.csv')

    return material_list

# Material_Classifor Part-1
## From 040_21 - Column_Classify
* Classify the materials in the M-List.
* Materials run in the row direction in an M-List.
* Return a list of the indexes of the rows classified as "true".

In [8]:
def material_classifor(bom, SUBJECT):
    '''
    Classify the columns of a table with the trained data of one subject.

    Param_1
    bom: DataFrame,
    the table to analyze (for example an M-List that was read into a
    DataFrame).

    Param_2
    SUBJECT: String,
    It defines what to analyze, for example, if SUBJECT == 'fabric',
    then the func will use the trained data set of fabric to analyze the
    documents. The files below are read from 'DataSource/Trained Data/':

    SUBJECT + '_vocabulary.csv'
    The vocabulary, the token list with WORD_ID.

    SUBJECT + '_prob_tokens_ctg_1_in_train_data'
    The probability of each token in category True.

    SUBJECT + '_prob_tokens_ctg_0_in_train_data'
    The probability of each token in category False.

    SUBJECT + '_prob_tokens_all_in_train_data'
    The probability of each token in all documents.

    SUBJECT + '_prob_ctg_1_in_train_data'
    A csv whose cell [0, 'prob_ctg_1_train_set'] holds the share of
    documents in category True (number of documents in category True /
    number of all documents).

    Returns
    An Index named 'MATERIAL_ID' with the ids classified as True. The ids
    are the positions of the columns of the table after its empty columns
    were removed by delete_col(). The Index is also printed.
    '''
    trained_prefix = 'DataSource/Trained Data/' + SUBJECT

    # read the vocabulary
    vocab = pd.read_csv(trained_prefix + '_vocabulary.csv', index_col=0)
    # read the trained data
    train_data_1 = np.loadtxt(trained_prefix + '_prob_tokens_ctg_1_in_train_data')
    train_data_0 = np.loadtxt(trained_prefix + '_prob_tokens_ctg_0_in_train_data')
    train_data_all = np.loadtxt(trained_prefix + '_prob_tokens_all_in_train_data')
    prob_ctg_1 = pd.read_csv(trained_prefix + '_prob_ctg_1_in_train_data', index_col=0)
    prob_ctg_1_train_data = prob_ctg_1.loc[0, 'prob_ctg_1_train_set']

    # Delete the columns without any value
    col_deleted_bom = delete_col(bom)

    # Join the cells of each column into one string, one string per column
    new_bom = turn_series(col_deleted_bom)

    # Tokenize and stem every document
    stemmed_bom = new_bom.apply(stemmered_nltk_convert)

    # Convert the stemmed series into a DataFrame, one token per cell
    word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())

    # Sparse matrix of the data we want to predict (no CATEGORY needed)
    sparse_predict_df = make_sparse_matrix(word_col_df, vocab)

    if sparse_predict_df.empty:
        # No token of the table is in the vocabulary: nothing is classified
        # as True.
        row_list = pd.Index([], dtype='int64', name='MATERIAL_ID')
    else:
        # Sum the occurrences per (MATERIAL_ID, WORD_ID) pair and convert
        # the result into a numpy array
        sparse_predict_df_grouped = sparse_predict_df.groupby(
            ['MATERIAL_ID', 'WORD_ID']).sum().reset_index()
        sparse_predict_data = sparse_predict_df_grouped.to_numpy()

        # Full matrix
        predict_full_feature = make_full_feature(sparse_predict_data, vocab.shape[0])

        # Joint probability in log format
        log_all = np.log(train_data_all)
        joint_log_ctg_1 = predict_full_feature.dot(
            np.log(train_data_1) - log_all) + np.log(prob_ctg_1_train_data)
        joint_log_ctg_0 = predict_full_feature.dot(
            np.log(train_data_0) - log_all) + np.log(1 - prob_ctg_1_train_data)
        # Prediction
        prediction_log = joint_log_ctg_1 > joint_log_ctg_0

        # Ids predicted as True
        row_list = prediction_log[prediction_log].index

    print(row_list)
    # 2020/03/11
    # The difference with the func "M-List_generator" is that this func only
    # returns the ids classified as True; picking the data out of the table
    # is left to the caller.

    return row_list

# Material classify Part-2
## Loop through the material categories.
* Analyze a categorized M-List with several sets of trained data and vocabulary.
* Each set of trained data and vocabulary represents 1 category of material, such as item, description, spec.
* This func feeds the func "material_classifor" each set of trained data and vocabulary in order.

In [9]:
def fill_out_col_category(M_list, Style_name, LIST=COL_LIST):
    '''
    Name the columns of an M-List after the category each one is classified
    as, save the result as csv and return it.

    # Arguments
    M_list: the M-List in JSON format, anything pandas.read_json accepts
    (this notebook passes a file path). It is read with encoding
    'ISO-8859-1' and turned into a DataFrame.

    Style_name: String,
    the name used for the output file,
    'result/Col_Classify_M-List/<Style_name>_COL_classified_M-List.csv'.

    LIST: List,
    The categories of material. Each one decides which trained data set is
    used by material_classifor(); they are applied in order, so a later
    category overwrites an earlier one on the same column.

    # Returns
        The M-List as a DataFrame whose column names are the categories,
        such as 'item', 'description', 'spec', etc. (NaN for a column that
        got no category), plus the two columns 'MATERIAL_ID' and
        'CATEGORY'.
    '''
    bom = pd.read_json(M_list, encoding='ISO-8859-1')

    # material_classifor() numbers the columns AFTER the empty ones were
    # removed by delete_col(); translate those numbers back to positions in
    # the full table so a category never lands on the wrong column.
    kept_positions = bom.columns.get_indexer(delete_col(bom).columns)

    # Add an empty row "TITLE" at the bottom. reindex replaces
    # DataFrame.append, which no longer exists in pandas 2.0; the cast to
    # object lets the row hold category names in every column.
    new_bom = bom.reindex(list(bom.index) + ['TITLE']).astype(object)

    new_bom.loc['TITLE', 'MATERIAL_ID'] = 'MATERIAL_ID'
    new_bom.loc['TITLE', 'CATEGORY'] = 'CATEGORY'

    for cate in LIST:
        ROW_LIST = material_classifor(bom, cate)
        print(ROW_LIST.values)

        # Write the category into the row "TITLE" of every classified
        # column. "TITLE" is the last row, hence the integer position -1;
        # iat handles one cell at a time, hence the loop.
        for index in ROW_LIST:
            new_bom.iat[-1, kept_positions[index]] = cate

    # Set the row "TITLE" as the columns of the new_bom
    new_bom.columns = new_bom.iloc[-1]
    # Delete the last row, which is the row "TITLE"
    new_bom.drop(new_bom.index[-1], inplace=True)

    # Make sure the output folder exists before writing into it
    os.makedirs('result/Col_Classify_M-List', exist_ok=True)
    new_bom.to_csv('result/Col_Classify_M-List/' + Style_name + '_COL_classified_M-List.csv')
    return new_bom

In [10]:

# Path of the sample BOM (JSON) used by the two test calls below.
# print(JSON_BOM)
JSON_BOM ='DataSource/JSON_original_bom/BOM.json'


In [11]:
# Run the M-List generator on the sample BOM; the result is also written to
# 'result/classified_M-List.csv'.
material_list_generator(JSON_BOM)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
MATERIAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,Designer: JG,,,,
2,,,,,,Season: Winter 2017/18,,,,
3,,,Insulation:,"60gm Insulation at body sleeve,hood",,Collection: Freedom,,,,


In [12]:
# Classify the columns of the sample BOM with every category of COL_LIST; the
# result is also written under 'result/Col_Classify_M-List/'.
fill_out_col_category(JSON_BOM, 'Test_JSON')



Int64Index([], dtype='int64', name='MATERIAL_ID')
[]
Int64Index([], dtype='int64', name='MATERIAL_ID')
[]
Int64Index([5, 6, 7, 8], dtype='int64', name='MATERIAL_ID')
[5 6 7 8]
Int64Index([3], dtype='int64', name='MATERIAL_ID')
[3]
Int64Index([], dtype='int64', name='MATERIAL_ID')
[]
Int64Index([], dtype='int64', name='MATERIAL_ID')
[]


TITLE,NaN,NaN.1,NaN.2,position,NaN.3,color_way,color_way.1,color_way.2,color_way.3,NaN.4,MATERIAL_ID,CATEGORY
0,AIRBLASTER,,,,,Style Number: AB18MJ2_081,,,,,,
1,,,,,,Designer: JG,,,,,,
2,,,,,,Season: Winter 2017/18,,,,,,
3,,,Insulation:,"60gm Insulation at body sleeve,hood",,Collection: Freedom,,,,,,
4,Style Name:,,Seams:,Critically Taped,,Vendor: Soluna,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
56,3,ZIPPER DIRECTION:,"CF has right hand slider, pocket zippers close...",,,,,,,,,
57,4,BARTACK:,At stress points,,,,,,,,,
58,5,HEAT CUT:,"Elastic, webbing, taffeta",,,,,,,,,
59,6,GROMMETS/SNAPS:,Should have pellon backing,,,,,,,,,
