<a id='sec0'></a>

1. Extraction by fraction of appearance fold difference between classes
   - Thresholds for minimum fraction of appearance and fold difference
2. Extraction by exclusive appearance in certain classes
   - Threshold for minimum fraction of appearance (can be lower than the one used in #1)
3. Extraction by per-doc appearance fold difference between classes
   - Thresholds for minimum per-doc appearance and fold difference
4. Extraction by per-doc appearance number difference between classes
   - Thresholds for minimum per-doc appearance and for the difference in the number of appearances between classes

# 1. Extraction by fraction of appearance fold difference between classes
(<a href='#sec0'>Back to top</a>)

In [None]:
def get_FoldDifference(a, b, freq_threshold=0.05):
    '''
    Return the fold difference (a / b) between two frequencies.

    The ratio is only reported when the denominator 'b' is non-zero and the
    numerator 'a' is at least 'freq_threshold'. In every other case the
    function returns 0.

    INPUTS:
    ========
    a : number
        Numerator frequency
    b : number
        Denominator frequency
    freq_threshold : number
        Minimum value 'a' must reach for the ratio to be reported

    OUTPUTS:
    ========
    a / b, or 0 when the conditions above are not met
    '''
    ratio_is_reportable = (b != 0) and (a >= freq_threshold)
    return (a / b) if ratio_is_reportable else 0

def get_nClassWords_byFold(frac_docs, fold_threshold, freq_threshold, print_result=True):
    '''
    Group words into "n-class" buckets by fold difference of their per-class
    frequencies of appearance.

    For each word, the per-class frequencies ('apps') are sorted in descending
    order and the fold difference between each adjacent pair is computed with
    get_FoldDifference. The first pair (n-th vs. (n+1)-th largest) whose fold
    difference reaches 'fold_threshold' makes the word an n-class word, i.e.
    its frequencies in n classes are at least 'fold_threshold' times higher
    than in the remaining classes. Words with no such pair go to 'other_words'.

    - Nine labels are defined, so at most the first eight adjacent pairs are
      examined. Inputs with fewer class columns are handled by examining only
      the pairs that exist.
    - After classification, words of length <= 2 are dropped from
      'one_class_words'.

    INPUTS:
    ========
    frac_docs : pandas DataFrame
        Rows are indexed by word; each row holds the fractions of docs the
        word appears in, one value per class (presumably one column per
        class - confirm against the caller)
    fold_threshold : number
        Minimum fold difference between adjacent sorted frequencies
    freq_threshold : number
        Passed to get_FoldDifference as the minimum frequency the higher value
        of a pair must reach for the pair to count
    print_result : bool
        When True, print the thresholds and the size of every bucket

    OUTPUTS:
    ========
    n_class_words : dictionary
        A dictionary whose keys are n-class_word labels. Values are lists of
        words in each of the n classes of words
    '''
    ncw_labels = ['one_class_words', 'two_class_words', 'three_class_words',
                  'four_class_words', 'five_class_words', 'six_class_words',
                  'seven_class_words', 'eight_class_words', 'other_words']
    other_label = ncw_labels[-1]

    # One empty bucket per label, in label order
    n_class_words = {label: [] for label in ncw_labels}

    for word, row in frac_docs.iterrows():
        # np.sort returns a sorted copy; reversing it gives descending order
        apps = np.sort(np.asarray(row))[::-1]

        label = other_label
        # zip stops at the shortest input, so this walks the adjacent pairs
        # (apps[0], apps[1]), (apps[1], apps[2]), ... for at most the eight
        # n-class labels and never indexes past the end of 'apps'.
        for candidate, higher, lower in zip(ncw_labels[:-1], apps, apps[1:]):
            fold = get_FoldDifference(higher, lower, freq_threshold=freq_threshold)
            if fold >= fold_threshold:
                label = candidate
                break
        n_class_words[label].append(word)

    # NOTE(review): this list is defined but never applied. The filter that
    # actually runs below removes words by length (<= 2 characters), not by
    # membership in this list. Confirm which behavior was intended.
    one_class_remove_list = ['bunkyo', 'commonest', 'commonplac', 'concret', 'consol',
                             'conspicu', 'credenc', 'damage—unlik', 'drew', 'enumer', 'logo', 
                             'graduat','ibaraki', 'joshi', 'kaneda', 'kurumizaka', 'lesson', 
                             'matsui', 'minami', 'minato', 'montreal', 'newyork', 'ontario', 
                             'shirokanedai', 'sinai', 'taipei', 'wake', 'wise', 'yokohama']
    n_class_words['one_class_words'] = [word for word in n_class_words['one_class_words'] if len(word) > 2]

    if print_result:
        print('======== n-class words extractions by fold differecne ========')
        print('Fold Threshold = %f' % fold_threshold)
        print('Frequency Threshold = %f' % freq_threshold)
        total = 0
        for label in ncw_labels:
            count = len(n_class_words[label])
            print('# of words in %s: %d' % (label, count))
            total += count
        print('Total # of words: %d' % total)

    return n_class_words

# 2. Extraction by exclusive appearance in certain classes
(<a href='#sec0'>Back to top</a>)

In [None]:
def get_nClassWords_byApp(frac_docs, freq_threshold, print_result=True):
    '''
    Group words into "n-class" buckets by the number of classes they appear in.

    For each word, the per-class frequencies of appearance ('apps') are read
    and the non-zero entries are counted. A word whose smallest non-zero
    frequency is at least 'freq_threshold' is classified as an n-class word,
    where n is the number of classes with a non-zero frequency. All other
    words go to 'other_words', including:

    - words whose smallest non-zero frequency is below 'freq_threshold',
    - words with no non-zero frequency at all,
    - words that appear in nine or more classes (nine labels are defined and
      the ninth is 'other_words').

    After classification, words of length <= 2 are dropped from
    'one_class_words'.

    INPUTS:
    ========
    frac_docs : pandas DataFrame
        Rows are indexed by word; each row holds the fractions of docs the
        word appears in, one value per class (presumably one column per
        class - confirm against the caller)
    freq_threshold : number
        Minimum value every non-zero frequency of a word must reach
    print_result : bool
        When True, print the threshold and the size of every bucket

    OUTPUTS:
    ========
    n_class_words : dictionary
        A dictionary whose keys are n-class_word labels. Values are lists of
        words in each of the n classes of words
    '''
    ncw_labels = ['one_class_words', 'two_class_words', 'three_class_words',
                  'four_class_words', 'five_class_words', 'six_class_words',
                  'seven_class_words', 'eight_class_words', 'other_words']
    other_label = ncw_labels[-1]

    # One empty bucket per label, in label order
    n_class_words = {label: [] for label in ncw_labels}

    for word, row in frac_docs.iterrows():
        apps = np.asarray(row)
        nonzero_apps = apps[np.nonzero(apps)]
        num_nonzeros = nonzero_apps.size

        # Guard on num_nonzeros first: taking the minimum of an empty array
        # raises ValueError, so all-zero rows go straight to 'other_words'.
        if num_nonzeros and nonzero_apps.min() >= freq_threshold:
            # Clamp so that nine or more non-zero classes map to the last
            # label instead of indexing past the end of the label list.
            label = ncw_labels[min(num_nonzeros, len(ncw_labels)) - 1]
        else:
            label = other_label
        n_class_words[label].append(word)

    # NOTE(review): this list is defined but never applied. The filter that
    # actually runs below removes words by length (<= 2 characters), not by
    # membership in this list. Confirm which behavior was intended.
    one_class_remove_list = ['bunkyo', 'commonest', 'commonplac', 'concret', 'consol',
                             'conspicu', 'credenc', 'damage—unlik', 'drew', 'enumer', 'logo', 
                             'graduat','ibaraki', 'joshi', 'kaneda', 'kurumizaka', 'lesson', 
                             'matsui', 'minami', 'minato', 'montreal', 'newyork', 'ontario', 
                             'shirokanedai', 'sinai', 'taipei', 'wake', 'wise', 'yokohama']
    n_class_words['one_class_words'] = [word for word in n_class_words['one_class_words'] if len(word) > 2]

    if print_result:
        print('======== n-class words extractions by appearances ========')
        print('Frequency Threshold = %f' % freq_threshold)
        total = 0
        for label in ncw_labels:
            count = len(n_class_words[label])
            print('# of words in %s: %d' % (label, count))
            total += count
        print('Total # of words: %d' % total)

    return n_class_words

# 3. Extraction by per-doc appearance fold difference between classes
(<a href='#sec0'>Back to top</a>)

# 4. Extraction by per-doc appearance number difference between classes
(<a href='#sec0'>Back to top</a>)