# 100 % same as the "mListFuncs.py" in Project G-Pro_M-List generator

## code

In [1]:
import pandas as pd
import numpy as np

# NLTK
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Switch area: module-level settings shared by the functions below.
# Number of integer word-ID columns (0 .. VOCAB_SIZE - 1) that
# make_full_feature() creates in the full feature matrix.
VOCAB_SIZE = 900
# Default list of material categories that row_category() loops over;
# each name is passed to classifer() as SUBJECT.
CATEGORY_LIST = ['fabric', 'insulation', 'interfacing', 'zipper', 'label']
# Default list of column labels that col_category() loops over;
# each name is passed to classifer() as SUBJECT.
COL_LIST = ['item', 'description', 'color_way', 'position', 'spec']

## delete_col()
### Delete the empty columns (columns that contain no values) of a table

In [2]:
def delete_col(m_list):
    '''
    Drop every column of a table that holds no data at all.

    Param_1
    m_list : DataFrame, the table to clean.

    Returns
    A new DataFrame without the columns whose cells are all missing
    (None / NaN). The DataFrame passed in is left untouched.
    '''
    # dropna(how='all') uses the same rule as "count() == 0" (no
    # non-missing cell in the column) but does it in one pass, without
    # rebinding the frame while iterating over it. It also works when
    # column labels are duplicated, where m_list[col] is a DataFrame and
    # "m_list[col].count() == 0" cannot be used as an `if` condition.
    return m_list.dropna(axis=1, how='all')

## stemmered_nltk_convert()
###  Nltk stemmered Function

In [3]:
def stemmered_nltk_convert(col_of_df):
    '''
    Tokenize one text value and return its stemmed tokens as a list.

    Param_1
    col_of_df : the text to convert. classifer() applies this function
    to a Series element by element, so each call receives one cell.

    Returns
    A list with the Snowball stem of every kept token, in the order the
    tokens appear in the text.
    '''
    # English stop words and the English Snowball stemmer
    english_stops = set(stopwords.words('english'))
    snowball = SnowballStemmer('english')

    # word_tokenize() only splits the text up; the tokens are compared
    # with the stop-word list exactly as tokenized (nothing is
    # lower-cased here). A token is kept when it is purely alphabetic
    # and is not a stop word; punctuation and numbers are dropped.
    return [
        snowball.stem(token)
        for token in word_tokenize(col_of_df)
        if token.isalpha() and token not in english_stops
    ]

## turn_row_series()
### Walk through bom
### Copied from 020_02_01 - M_list; the original function name is turn_series()
* Parse the BOM (read from an xlsm file) and join all the cells of each row into one cell; together these cells form a single column.
* Pass each cell to the classify function.
* Return the index of every row that is classified as True.
* Used by the material classifier and the M-List generator, because a material is usually entered as one row of the sheet.

In [2]:
def turn_row_series(bom):
    '''
    Collapse every row of a table into one text cell.

    Param_1
    bom : DataFrame

    Returns
    A Series with one string per row of bom, in row order and with a
    fresh 0..n-1 index. Each string is the row's cells converted with
    str() and each preceded by ', ' (so the text starts with ', ').
    '''
    joined_rows = [
        ''.join(', ' + str(bom.at[row_label, col_label]) for col_label in bom)
        for row_label in bom.index
    ]
    return pd.Series(joined_rows)


## turn_col_series()
### Walk through bom
### Copied from 040_21 - Column-Classify
* Parse the BOM (read from an xlsm file) and join all the cells of each column into one cell; together these cells form a single column.
* Pass each cell to the classify function.
* Return the index of every column that is classified as True.
* Used by the column classifier, because information such as item, position, ref_no, supplier, etc. is usually entered as a column in the BOM sheet.

In [1]:
def turn_col_series(bom):
    '''
    Collapse every column of a table into one text cell.

    Param_1
    bom : DataFrame

    Returns
    A Series with one string per column of bom, in column order and
    with a fresh 0..n-1 index. Each string is the column's cells
    converted with str() and each preceded by ', '.
    '''
    joined_cols = [
        ''.join(', ' + str(bom.at[row_label, col_label]) for row_label in bom.index)
        for col_label in bom
    ]
    return pd.Series(joined_cols)

## make_sparse_matrix()
### Sparse Matrix Function
* Create a sparse matrix for the data we want to predict.
* The difference between this function and the one in the Classification Model for train data is that this function doesn't need CATEGORY.

In [5]:
def make_sparse_matrix(df, vocabulary):
    """
    Build the sparse (one row per token hit) matrix of the data to predict.

    Param1:
    The data we want to sparse, which must be in format of DataFrame.
    Each cell is read as one token.
    Param2:
    The vocabulary, it is generated when we training datas. Its column
    VOCAB_WORD is read; the position of a word in that column is used
    as its WORD_ID.

    Returns a sparse matrix as dataframe with the columns MATERIAL_ID
    (the index label of the row the token came from), OCCURENCE
    (always 1) and WORD_ID, one row per cell whose token is in the
    vocabulary. When no token matches, the frame is empty but still
    has these three columns, so callers can group by them.
    """
    column_order = ['MATERIAL_ID', 'OCCURENCE', 'WORD_ID']

    # Word -> position lookup, built once. setdefault keeps the first
    # position if a word should be listed twice in the vocabulary.
    word_to_id = {}
    for position, vocab_word in enumerate(vocabulary.VOCAB_WORD):
        word_to_id.setdefault(vocab_word, position)

    nr_cols = df.shape[1]
    dict_list = []

    for i, doc_id in enumerate(df.index):
        for j in range(nr_cols):
            # Cells that are padding (None) or unknown words give None.
            word_id = word_to_id.get(df.iat[i, j])
            if word_id is not None:
                dict_list.append({'MATERIAL_ID': doc_id,
                                  'OCCURENCE': 1, 'WORD_ID': word_id})

    # Passing the columns explicitly keeps the schema even when
    # dict_list is empty.
    return pd.DataFrame(dict_list, columns=column_order)

# make_full_feature()
## Full Matrix
* Since we only want to predict the data, we create the full feature matrix directly.

In [6]:
def make_full_feature(sparse_matrix, nr_words, doc_idx=0, word_idx=1, freq_idx=2):
    '''
    Expand a sparse matrix into the full feature matrix used to predict.

    Param_1
    sparse_matrix : 2-D numpy array, one row per (document, word) pair.

    Param_2
    nr_words : Int, not used by the body; the number of word columns is
    taken from the module constant VOCAB_SIZE.
    NOTE(review): presumably nr_words was meant to set the width;
    confirm that the trained data always has VOCAB_SIZE entries before
    switching to it.

    Param_3, Param_4, Param_5
    doc_idx, word_idx, freq_idx : Int, the positions inside a row of
    sparse_matrix that hold the document id, the word id and the
    occurrence count.

    Returns
    A DataFrame indexed by MATERIAL_ID with one column per word id
    (0 .. VOCAB_SIZE - 1); a cell holds the occurrence count of that
    word in that document, 0 when the pair is absent.
    '''
    column_names = ['MATERIAL_ID'] + list(range(0, VOCAB_SIZE))
    # Read the document ids from doc_idx (not a hard-coded column 0) so
    # the parameter is honoured here as well as in the loop below.
    doc_id_names = np.unique(sparse_matrix[:, doc_idx])
    # Create the frame already filled with 0 instead of creating an
    # all-NaN object frame and calling fillna(0) on it, which depends on
    # pandas silently downcasting the result to integers.
    full_matrix = pd.DataFrame(0, index=doc_id_names, columns=column_names)

    for entry in sparse_matrix:
        doc_nr = entry[doc_idx]
        word_id = entry[word_idx]
        occurrence = entry[freq_idx]

        full_matrix.at[doc_nr, 'MATERIAL_ID'] = doc_nr
        full_matrix.at[doc_nr, word_id] = occurrence

    full_matrix.set_index('MATERIAL_ID', inplace=True)
    return full_matrix

# Main Function -----------------------------------------------------
* The functions in this area are not root functions: each one uses some of the other functions as part of its work.

## Part -1
## classifer()
* It merges material_classfor() from 20, 30, and 40.
* It can classify by row (to build an M-List or to classify the category of each material) or by column (to classify the columns).

In [7]:
def classifer(bom, switcher, SUBJECT):
    '''
    Classify the rows or the columns of a bom with trained data.

    Param_1
    Bom : DataFrame, bom turned into dataFrame before input to this function

    Param_2
    Switcher : Int, Define what list you want to make.
    1 = Step_1 = M_List : turn a bom into M_List
    2 = Step_2 = Category classify : classify the category of each material, usually in row
    3 = Step_3 = Column classify : classify the column name, usually in column

    Param_3
    SUBJECT : String, decides which trained data to use. It is replaced
    by 'material_garment' when switcher is 1.

    Files read (all named DATA_SOURCE + SUBJECT + suffix)
    _vocabulary.csv : the vocabulary, token list with WORD_ID
    _prob_tokens_ctg_1_in_train_data : probability of each token in category True
    _prob_tokens_ctg_0_in_train_data : probability of each token in category False
    _prob_tokens_all_in_train_data : probability of each token in all documents
    _prob_ctg_1_in_train_data : number of documents in category True / number of all documents

    Returns
    switcher 1 : the rows of bom predicted as material, as a DataFrame;
    it is also saved as 'result/M-List.csv'.
    switcher 2 : an Index with the integer positions of the rows of bom
    predicted as True.
    switcher 3 : an Index with the integer positions of the columns of
    bom predicted as True.

    Raises
    ValueError when switcher is not 1, 2 or 3.
    '''
    data_sources = {
        1: 'DataSource/Trained-Data_M-List/',
        2: 'DataSource/Trained-Data_ROW-classify/',
        3: 'DataSource/Trained-Data_COL-classify/',
    }
    # Fail with a clear message instead of an unbound DATA_SOURCE below.
    if switcher not in data_sources:
        raise ValueError('switcher must be 1, 2 or 3, got ' + repr(switcher))
    DATA_SOURCE = data_sources[switcher]
    if switcher == 1:
        SUBJECT = 'material_garment'

    VOCAB = DATA_SOURCE + SUBJECT + '_vocabulary.csv'
    TRAIN_DATA_1 = DATA_SOURCE + SUBJECT + '_prob_tokens_ctg_1_in_train_data'
    TRAIN_DATA_0 = DATA_SOURCE + SUBJECT + '_prob_tokens_ctg_0_in_train_data'
    TRAIN_DATA_ALL = DATA_SOURCE + SUBJECT + '_prob_tokens_all_in_train_data'
    PROB_1_TRAIN_DATA = DATA_SOURCE + SUBJECT + '_prob_ctg_1_in_train_data'

    # read the vocabulary
    vocab = pd.read_csv(VOCAB, index_col=0)
    # read the trained_datas
    train_data_1 = np.loadtxt(TRAIN_DATA_1)
    train_data_0 = np.loadtxt(TRAIN_DATA_0)
    train_data_all = np.loadtxt(TRAIN_DATA_ALL)
    prob_ctg_1 = pd.read_csv(PROB_1_TRAIN_DATA, index_col=0)
    prob_ctg_1_train_data = prob_ctg_1.loc[0, 'prob_ctg_1_train_set']

    # Drop the columns that hold no data
    col_deleted_bom = delete_col(bom)

    # Switcher
    if switcher < 3:
        # For M_List or material classify, join the cells of each row into 1 cell
        new_bom = turn_row_series(col_deleted_bom)
    else:
        # For column classify, join the cells of each column into 1 cell
        new_bom = turn_col_series(col_deleted_bom)

    # nltk_convert
    stemmed_bom = new_bom.apply(stemmered_nltk_convert)

    # Convert the stemmed series into df, 1 token gets 1 cell
    word_col_df = pd.DataFrame.from_records(stemmed_bom.tolist())

    # Sparse Matrix for the data we want to predict (no CATEGORY needed)
    sparse_predict_df = make_sparse_matrix(word_col_df, vocab)

    if sparse_predict_df.empty:
        # No token of the bom is in the vocabulary, so nothing can be
        # predicted as True; skip the matrix steps, which need at least
        # one (MATERIAL_ID, WORD_ID) pair.
        row_list = pd.Index([], dtype='int64')
    else:
        # Sum the occurrences per (MATERIAL_ID, WORD_ID) pair, then
        # convert the result into a numpy array.
        sparse_predict_data = sparse_predict_df.groupby(
            ['MATERIAL_ID', 'WORD_ID']).sum().reset_index().to_numpy()

        # Full Matrix
        predict_full_feature = make_full_feature(
            sparse_predict_data, vocab.shape[0])

        # Joint probability in log format
        joint_log_ctg_1 = predict_full_feature.dot(
            np.log(train_data_1) - np.log(train_data_all)) + np.log(prob_ctg_1_train_data)
        joint_log_ctg_0 = predict_full_feature.dot(
            np.log(train_data_0) - np.log(train_data_all)) + np.log(1 - prob_ctg_1_train_data)
        # Prediction
        prediction_log = joint_log_ctg_1 > joint_log_ctg_0

        # Positions (in the joined series) of the entries predicted True
        row_list = prediction_log[prediction_log].index

    if switcher == 3:
        # The positions found refer to the columns left after the empty
        # columns were dropped. Translate them back to positions in bom,
        # because the caller addresses the columns of bom by position.
        kept_positions = np.flatnonzero(bom.count().to_numpy() > 0)
        row_list = pd.Index(kept_positions[np.asarray(row_list, dtype=int)])

    if switcher == 1:
        # For M_List, the job is finished, return it and save it as a CSV.
        # row_list holds row positions, not index labels, so select with
        # iloc; this is the same as loc only when bom has a 0..n-1 index.
        material_list = bom.iloc[list(row_list), :]
        material_list.to_csv('result/M-List.csv')
        return material_list
    else:
        # For material category or column classify, return the positions,
        # for the next function to finish its job
        return row_list


## Material classify Part-2
 * Analyze an M-List with several sets of trained data and vocabulary.
 * Each set of trained data and vocabulary represents one category of material, such as fabric, zipper, or label.
 * This function feeds each set of trained data and vocabulary to the function "classifer", one after another.

In [9]:
def row_category(bomPath, Style_name, LIST=CATEGORY_LIST):
    '''
    Label every row of an M-List with a material category.

    Param_1
    bomPath : Path to M_list in CSV format

    Param_2
    Style_name: String
    The name of the file; the result is saved as
    'result/' + Style_name + '_ROW_classified_M-List.csv'.

    Param_3
    LIST: An array in type List.
    The content is the categories of material.
    That decides which train_set data to be used.

    Returns
    The M-List as a DataFrame with a new column CATEGORY in position 1.
    A row keeps 'other' unless a category claims it; when several
    categories claim the same row, the one later in LIST wins.
    '''
    # Set index_col=None, In this part we have to make new index for each row.
    bom = pd.read_csv(bomPath, index_col=None, encoding='ISO-8859-1')
    bom.insert(1, 'CATEGORY', 'other', True)

    # NOTE(review): bom is passed on with its CATEGORY column, so the
    # labels written so far are part of the row text that the next
    # category is classified on - confirm this is intended.
    for cate in LIST:
        ROW_LIST = classifer(bom, 2, cate)
        # ROW_LIST holds many rows; .at only addresses a single cell, so
        # the assignment has to go through .loc.
        bom.loc[ROW_LIST, 'CATEGORY'] = cate

    bom.to_csv('result/' + Style_name + '_ROW_classified_M-List.csv')
    return bom

## Part-3
## From 040_21 - Column_Classify
## Loop through materials designated as category.
* Analyze a categorized M-List with several sets of trained data and vocabulary.
* Each set of trained data and vocabulary represents one category of column, such as item, description, or spec.
* This function feeds each set of trained data and vocabulary to the function "classifer", one after another.

In [None]:
def col_category(M_list, Style_name, LIST=COL_LIST):
    '''
    Give every column of a row-classified M-List a classified name.

    # Arguments
    Param_1
    M_list : Path to the M_list in CSV format; its first column is read
    as the index.

    Param_2
    Style_name: String,
    the name of the file; the result is saved as
    'result/' + Style_name + '_COL_ROW_classified_M-List.csv'.

    Param_3
    LIST: List,
    An array in type List. The content is the categories of column. That decides which train_set data to be used.

    # Returns
        The M_List as a DataFrame (also saved as csv) whose column names
        are the classified labels, such as 'item', 'description',
        'spec', etc. A column claimed by 'color_way' is named
        'colorway_<n>'; a column that no label claimed is named
        'undefined_<n>'.
    '''
    # Set index_col=0, Since in the last part, we've created the index, here set the column with index 0 as the index here.
    bom = pd.read_csv(M_list, index_col=0, encoding='ISO-8859-1')

    # Add an empty row 'TITLE' at the bottom to collect the new names.
    # reindex replaces DataFrame.append, which pandas 2.0 removed; it
    # assumes the index labels read from the file are unique.
    # The frame is then cast to object so that a text title can be
    # written into a column that holds numbers.
    new_bom = bom.reindex(list(bom.index) + ['TITLE']).astype(object)

    new_bom.loc['TITLE', 'MATERIAL_ID'] = 'MATERIAL_ID'
    new_bom.loc['TITLE', 'CATEGORY'] = 'CATEGORY'

    for cate in LIST:
        ROW_LIST = classifer(bom, 3, cate)
        print(ROW_LIST.values)

        # ROW_LIST holds integer column positions. .iat handles one cell
        # at a time, so loop; the row 'TITLE' is the last one, hence -1.
        for num, index in enumerate(ROW_LIST, start=1):
            # since papa parse will ignore the duplicated column, we add a number to make each column name unique, even when it has the same name like "color_way"
            if cate == 'color_way':  # Because my model and trained data is named 'color_way'
                new_bom.iat[-1, index] = 'colorway_' + str(num)
            else:
                new_bom.iat[-1, index] = cate

    # Name every column that is still untitled 'undefined_<n>'.
    # Anything that is not text is an empty title. Testing for the exact
    # type float would skip the NaN of a numeric column, which pandas
    # can hand back as numpy.float64.
    undefined_count = 0
    for i in range(new_bom.shape[1]):
        if not isinstance(new_bom.iat[-1, i], str):
            undefined_count += 1
            new_bom.iat[-1, i] = 'undefined_' + str(undefined_count)

    # Set the row "TITLE" as the columns of the new_bom
    new_bom.columns = new_bom.iloc[-1]

    # Delete the last row, that must and should be the row "TITLE"
    new_bom.drop(new_bom.index[-1], inplace=True)

    # NOTE(review): the original ended with a bare `new_bom.reset_index`
    # (no call), which does nothing, so the saved file never had its
    # index reset. That is kept; confirm with the papa-parse consumer
    # before actually resetting the index, since it changes the file.
    new_bom.to_csv('result/' + Style_name + '_COL_ROW_classified_M-List.csv')
    return new_bom


## Test area
* Test the functions above.

## Walk through a dir - not fixed yet!
* Check each JSON file in a directory.

In [4]:
def col_category_walkthrough_dir(FROM_DIR, TO_DIR):
    '''
    Run col_category() on every '.JSON' file found under a directory.

    Param_1
    FROM_DIR : String, the directory to walk through, sub-directories
    included.

    Param_2
    TO_DIR : String, not used by the body.

    Returns
    None. The number of files handed to col_category() is printed.
    '''
    # Imported here because the file does not import os at the top; the
    # bare names walk and join were never defined.
    import os

    converted_csv_num = 0
    for root, dirnames, filenames in os.walk(FROM_DIR):
        for file_name in filenames:
            # Only file names ending with '.JSON' are handled.
            # NOTE(review): col_category() reads its input with
            # pd.read_csv - confirm the extension that is meant here.
            if file_name.endswith('.JSON'):
                converted_csv_num = converted_csv_num + 1
                filepath = os.path.join(root, file_name)
                # The style name is the file name without its last 18
                # characters.
                col_category(filepath, file_name[0:-18])
    print('Converted ', converted_csv_num, ' files')