# Set Random Training Dataset

In [1]:
import os, sys, time, csv, random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)
#printmd('**bold**')

In [3]:
# Locations of the tab-separated corpus files used by this notebook.
# The paths are relative, so they resolve against the notebook's working directory.
data_path                = "../mlabel_corpora/JokeHumorLevel.txt"  # raw corpus, read into df
data_OneHot_path         = "../mlabel_corpora/JokeHumorLevel_OneHot.txt"  # full corpus with one-hot columns
train_OneHot_path        = "../mlabel_corpora/JokeHumorLevel_train_OneHot.txt"  # training split
train_OneHot_random_path = "../mlabel_corpora/JokeHumorLevel_train_OneHot_random.txt"  # training split with shuffled labels
test_OneHot_path         = "../mlabel_corpora/JokeHumorLevel_test_OneHot.txt"  # test split

In [4]:
# Set the global variable df: the joke corpus loaded from the tab-separated file at data_path.
df = pd.read_csv(data_path, delimiter="\t")
#data_raw = df.loc[np.random.choice(data_raw.index, size=2000)]
print(df.shape) # (number of rows, number of columns) of the loaded corpus

(3365, 4)


In [5]:
# Convert the HumorLevel column into one-hot encoded indicator columns. Refer to:
#    https://stackoverflow.com/a/39287161/8583170
# Build one indicator column per distinct HumorLevel value
one_hot = pd.get_dummies(df['HumorLevel'])
# The original HumorLevel column is kept next to its encoding (the drop below is disabled)
#df = df.drop('HumorLevel',axis = 1)
# Join the indicator columns onto df (aligned on the row index)
# NOTE(review): df is rebound here, so re-running this cell without reloading df
# would try to join the indicator columns a second time -- confirm before re-running.
df = df.join(one_hot)
df.head()

Unnamed: 0,ID,Title,Content,HumorLevel,1,2,3,4,5
0,L0001,要求加薪,員工：老闆，您必須幫我加薪，已經有三家公司在找我了！ 老闆：哪三家？ 員工：...,4,0,0,0,1,0
1,L0002,查無此人,某市政府辦公大樓落成，門口缺副對聯。 副市長揮毫 上聯：說實話辦實事一身正氣...,3,0,0,1,0,0
2,L0003,遣散費,中午老闆視察自己的建築工地時，發現有個人在角落玩手機。 老闆：你月薪多少？ ...,4,0,0,0,1,0
3,L0004,職業習慣,一天，一位法官的女友看見兩個蚊子，便叫法官打死。 只見法官只把那個肚子飽飽的蚊子打死...,2,0,1,0,0,0
4,L0005,美女吵架,辦公室中兩位女同事吵起來了。 經理忍無可忍：「太不像話了！現在是什麼情況？你們把原因...,4,0,0,0,1,0


In [6]:
from sklearn.model_selection import train_test_split

# ID=L1850 is the dividing point: before it, Wu Wen-Xuan (吳玟萱); after it, Huang Ting-Yun (黃亭筠);
# both are from the same class year of the Chinese department.
# shuffle=False keeps the corpus order, so train is the first 1691 rows and test is the remainder.
train, test = train_test_split(df, train_size=1691, shuffle=False)
# (temporarily) set global variables: train, test

print(train.shape)
print(test.shape)

(1691, 9)
(1674, 9)


In [7]:
# Persist the full corpus and both splits as tab-separated files.
# DataFrame.to_csv writes to the path itself, so the encoding is pinned to UTF-8
# (the corpus contains Chinese text) and line endings are not translated a second
# time by a text-mode file handle.
df.to_csv(data_OneHot_path, sep='\t', index=False, encoding='utf-8')
train.to_csv(train_OneHot_path, sep='\t', index=False, encoding='utf-8')
test.to_csv(test_OneHot_path, sep='\t', index=False, encoding='utf-8')

## Now randomly assign labels to the training set and save it to file

1. First, determine the original label distribution of the training set.
2. Then reassign the labels so that this distribution is preserved, by applying an algorithm from stackoverflow.com.

In [8]:
# Keep an independent copy of the training frame before its labels are overwritten,
# so the original and the shuffled labels can be compared later. See:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.copy.html
train_org = train.copy()

In [9]:
Total_Samples = len(train.index) # number of training rows; train.shape[0] gives the same value
cat = list(train.columns.values) # every column label of train, in column order
print(Total_Samples, "\n", cat)
# Row counts per HumorLevel value, i.e. the original label distribution of the training set
train.groupby(['HumorLevel']).count()


1691 
 ['ID', 'Title', 'Content', 'HumorLevel', 1, 2, 3, 4, 5]


Unnamed: 0_level_0,ID,Title,Content,1,2,3,4,5
HumorLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,47,47,47,47,47,47,47,47
2,251,251,251,251,251,251,251,251
3,742,742,742,742,742,742,742,742
4,604,604,604,604,604,604,604,604
5,47,47,47,47,47,47,47,47


In [10]:
# Label values of train (every column from position 4 onward) as a 2-D array.
# NOTE(review): .values may hand back a view of train's data rather than a copy, in which
# case in-place edits of matrix would also change train -- confirm if that matters.
matrix = train.iloc[:, 4:].values # get the values of the labels in matrix form
'''
matrix = np.array( # for testing
    [[0, 0, 1], 
     [1, 1, 0], 
     [1, 0, 0]]
)
'''
matrix

array([[0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]], dtype=uint8)

In [11]:
# Randomize matrix keeping row and column totals the same
# https://stackoverflow.com/questions/2133268/randomize-matrix-in-perl-keeping-row-and-column-totals-the-same

def shuffle(array):
    """Shuffle ``array`` in place and return it.

    Walks the positions from the last index down to 0 and swaps each one
    with a randomly drawn position at or below it. One random draw is made
    per element, including the final position 0.
    """
    for upper in reversed(range(len(array))):
        pick = random.randrange(0, upper + 1)
        if pick != upper:
            array[upper], array[pick] = array[pick], array[upper]
    return array

def other_edits(matrix, cell, step, last_j):
    """Recursively search for the follow-up edits that rebalance a moved cell.

    Moving one non-empty cell takes three further edits to keep every row
    and column total unchanged. ``step`` counts the edits made so far and
    ``cell`` is the ``[row, col]`` touched by the previous edit.

    Step 1 fills an empty cell in the same row. Step 2 clears a non-empty
    cell in that new column, in a different row. Step 3 fills the cell in
    that row at ``last_j``, the column of the very first edit.

    ``matrix`` is modified in place, but only along a chain of edits that
    succeeds end to end. Returns True on success, False on a dead end.
    """
    step += 1
    # Three edits already chosen: the chain is complete.
    if step > 3:
        return True

    row, col = cell
    if step == 1:
        # Empty cells elsewhere in the same row, tried in random order.
        candidates = shuffle(
            [[row, c] for c in range(len(matrix[0])) if c != col and not matrix[row][c]]
        )
    elif step == 2:
        # Non-empty cells elsewhere in the same column, tried in random order.
        candidates = shuffle(
            [[r, col] for r in range(len(matrix)) if r != row and matrix[r][col]]
        )
    elif not matrix[row][last_j]:
        # The final edit must land in the column of the initial edit.
        candidates = [[row, last_j]]
    else:
        candidates = []

    # Step 2 clears a cell; steps 1 and 3 fill one.
    new_value = 0 if step == 2 else 1
    for candidate in candidates:
        # Commit this edit only once every later edit has succeeded.
        if other_edits(matrix, candidate, step, last_j):
            matrix[candidate[0]][candidate[1]] = new_value
            return True

    # No candidate led to a complete chain.
    return False

def cells_to_move(matrix):
    """Return ``[row, col]`` for every non-empty cell, in row-major order."""
    return [
        [r, c]
        for r, row in enumerate(matrix)
        for c in range(len(row))
        if matrix[r][c]
    ]

def edit_matrix(matrix):
    """Try to relocate every non-empty cell of ``matrix`` once, in place.

    The list of cells is taken before any edit is made. A cell that an
    earlier move in this pass has already cleared is skipped, and a cell
    whose follow-up edits hit a dead end is left where it is. Returns the
    same ``matrix`` object.
    """
    for row, col in cells_to_move(matrix):
        if not matrix[row][col]:
            # Already cleared by an earlier move in this pass.
            continue
        if other_edits(matrix, [row, col], 0, col):
            matrix[row][col] = 0
    return matrix

def init_matrix(rows, cols, density): # not used
    """Build a ``rows`` x ``cols`` list of lists of random 0/1 values.

    Each cell is 1 when a fresh ``random.random()`` draw is below
    ``density`` and 0 otherwise. Draws are made in row-major order.
    """
    def random_row():
        return [int(random.random() < density) for _ in range(cols)]

    return [random_row() for _ in range(rows)]

def Shuffle_Matrix(matrix, N, M, n_iter):
    """Run ``n_iter`` editing passes over ``matrix`` and return the result.

    Prints the starting matrix, then a progress line before each pass.
    ``N`` and ``M`` are accepted but not used by the body. The edits are
    made by ``edit_matrix``, which works on the matrix it is given.
    """
    print("init_matrix:\n", matrix);
    for pass_number in range(1, n_iter + 1):
        print(f'iteration: {pass_number}') # Show progress.
        matrix = edit_matrix(matrix)
    return matrix

def compute_density(train_org, cat):
    """Print and return the fraction of one-hot label cells that are set.

    Args:
        train_org: DataFrame containing the columns named in ``cat``.
        cat: Full list of column labels. Entries from position 4 onward are
            the one-hot label columns; position 3 is the raw ``HumorLevel``
            column and is not part of the label matrix.

    Returns:
        Sum of the label columns divided by the number of label cells
        (rows x label columns), or 0.0 when there are no label cells.
    """
    label_columns = cat[4:]
    total = 0
    for category in label_columns:
        total += train_org[category].sum()
    # Divide by the label cells only, so the metadata columns are not counted.
    n_cells = train_org.shape[0] * len(label_columns)
    density = total / n_cells if n_cells else 0.0
    print(f'sum={total}, density={density}')
    return density

print(matrix.shape[0], matrix.shape[1]) # number of rows and columns of the label matrix
#density = compute_density(train_org, cat)
#density = sum([sum(row) for row in matrix])/(matrix.shape[0] * matrix.shape[1])
#print('density : %1.6f'%density)

# Args: matrix, number of rows, number of columns, number of iterations.
# NOTE(review): no random seed is set anywhere in the visible notebook, so every run
# produces a different shuffle -- seed `random` first if reproducibility is needed.
matrix2 = Shuffle_Matrix(matrix, matrix.shape[0], matrix.shape[1], 11)
# Values of `both` recorded over repeated runs -- presumably the count of rows whose
# labels ended up unchanged, as reported by the comparison cell further down.
# An even n_iter leaves the toy 3x3 example unchanged.
# n_iter =  1, both==380, 370
# n_iter =  5, both==349, 363, 371, 402, 390, 401, 371, 366, 381, 373, 389, 410, 384
# n_iter = 10, both==381, 392, 393, 364, 402, 399, 406, 386, 380
# n_iter = 11, both==350, 364, 362
# n_iter = 20, both==374, 356, 368
# n_iter = 21, both==396, 337, 379, 368
# n_iter = 30, both==384, 379
print("matrix2:\n", matrix2)

1691 5
init_matrix:
 [[0 0 0 1 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 ...
 [0 0 0 1 0]
 [0 0 0 1 0]
 [0 0 0 1 0]]
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
matrix2:
 [[0 0 0 1 0]
 [0 0 1 0 0]
 [0 0 1 0 0]
 ...
 [0 1 0 0 0]
 [0 1 0 0 0]
 [0 1 0 0 0]]


In [12]:
# Write the shuffled label matrix back into the one-hot columns of train.
# The whole block is assigned at once, row by position. Per-cell `.at[i, c]`
# writes address rows by index label, so they are only correct when the index
# is exactly 0..n-1, and they are slow.
label_columns = cat[4:]
assert len(matrix2) == len(train), "matrix2 must have one row per training sample"
train[label_columns] = matrix2

In [13]:
def dataframe_difference(df1, df2, which=None):
    """Return the rows of an outer merge of ``df1`` and ``df2``, filtered by origin.

    The merge adds a ``_merge`` indicator column. With ``which=None`` the
    rows found in only one of the frames are returned. Otherwise only rows
    whose indicator equals ``which`` ('left_only', 'right_only' or 'both')
    are returned.
    """
    merged = df1.merge(df2, how='outer', indicator=True)
    origin = merged['_merge']
    if which is None:
        keep = (origin != 'both')
    else:
        keep = (origin == which)
    return merged[keep]

# Rows that are identical in the original and the shuffled training frames
diff_both = dataframe_difference(train_org, train, which='both')
both=diff_both['_merge'].value_counts(); print(both)
# Rows found in only one of the two frames, i.e. rows that differ after shuffling
diff = dataframe_difference(train_org, train)
one = diff['_merge'].value_counts(); print(one)
# Total rows, identical rows, rows only in train, rows only in train_org
print(train.shape[0], both['both'], one['right_only'], one['left_only'])
# Identical + changed rows (should equal the total), then the fraction of identical rows
print(both['both']+one['right_only'], both['both']/train.shape[0])

both          545
left_only       0
right_only      0
Name: _merge, dtype: int64
left_only     1146
right_only    1146
both             0
Name: _merge, dtype: int64
1691 545 1146 1146
1691 0.322294500295683


### 1.1. Checking for missing values

In [14]:
# Count missing values per column, first in the original training frame ...
missing_values_check = train_org.isnull().sum()
print(missing_values_check, "\n")
# ... then in the training frame whose labels were overwritten
missing_values_check = train.isnull().sum()
print(missing_values_check)

ID            0
Title         0
Content       0
HumorLevel    0
1             0
2             0
3             0
4             0
5             0
dtype: int64 

ID            0
Title         0
Content       0
HumorLevel    0
1             0
2             0
3             0
4             0
5             0
dtype: int64


### 1.2. Calculating number of jokes under each label

In [15]:
# Jokes with no label are considered to be clean jokes.
# We use axis=1 to count row-wise and axis=0 to count column-wise.
def print_empty_label(df, s, label_start=4):
    """Print how many rows of ``df`` have no label set.

    Args:
        df: DataFrame whose label columns start at position ``label_start``.
        s: Name of the dataset, used only in the printed messages.
        label_start: Position of the first label column. The default of 4
            skips ID, Title, Content and the raw HumorLevel column. Including
            HumorLevel (always non-zero) would make every row look labelled.
    """
    rowSums = df.iloc[:, label_start:].sum(axis=1)
    clean_comments_count = (rowSums==0).sum(axis=0)

    print(f"Total number of {s} jokes = ",len(df))
    print(f"Number of clean jokes in {s} = ",clean_comments_count)
    print(f"Number of {s} jokes with labels =",(len(df)-clean_comments_count))
    print()

In [16]:
# Check whether the original and the shuffled training frames are identical,
# then report the unlabelled-row counts for each of them.
print(train_org.equals(train))
print_empty_label(train_org, 'train_org')
print_empty_label(train, 'train')

False
Total number of train_org jokes =  1691
Number of clean jokes in train_org =  0
Number of train_org jokes with labels = 1691

Total number of train jokes =  1691
Number of clean jokes in train =  0
Number of train jokes with labels = 1691



In [17]:
# Set the global variable categories: the label column names of df
categories = list(df.columns.values)
print(categories)
# Keep only the columns from position 4 onward (the one-hot label columns)
categories = categories[4:]
print(categories)

['ID', 'Title', 'Content', 'HumorLevel', 1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]


In [18]:
# Number of jokes in each category
def print_category_count(df, categories):
    """Print a table giving, for each category, the sum of its column in ``df``."""
    rows = [(name, df[name].sum()) for name in categories]
    table = pd.DataFrame(rows, columns=['category', 'number of jokes'])
    print(table)
    print()

In [19]:
# Per-category counts for the full corpus, the original training set,
# the shuffled training set and the test set, in that order.
print_category_count(df, categories)
print_category_count(train_org, categories)
print_category_count(train, categories)
print_category_count(test, categories)

   category  number of jokes
0         1              363
1         2              867
2         3             1313
3         4              729
4         5               93

   category  number of jokes
0         1               47
1         2              251
2         3              742
3         4              604
4         5               47

   category  number of jokes
0         1               47
1         2              251
2         3              742
3         4              604
4         5               47

   category  number of jokes
0         1              316
1         2              616
2         3              571
3         4              125
4         5               46



In [20]:
def plot_category_count(df, categories):
    """Draw a bar chart of the number of jokes carrying each label.

    Args:
        df: DataFrame containing one indicator column per entry of ``categories``.
        categories: Label column names; one bar is drawn per entry.
    """
    category_names = list(categories)
    # Sum exactly the label columns so there is one value per bar. A positional
    # slice from column 3 would also include the raw HumorLevel column.
    counts = df[category_names].sum().values

    sns.set(font_scale = 2)
    plt.figure(figsize=(15,8))

    # x and y are passed by keyword: recent seaborn releases reject positional data vectors.
    ax = sns.barplot(x=category_names, y=counts)

    plt.title("Jokes in each category", fontsize=24)
    plt.ylabel('Number of jokes', fontsize=18)
    plt.xlabel('Joke Skill', fontsize=18)

    # Write each bar's count just above it.
    for rect, label in zip(ax.patches, counts):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)

    plt.show()

In [21]:
#plot_category_count(df, categories)

### 1.3. Calculating number of jokes having multiple labels

In [22]:
def plot_multiple_label(mlc_labels, multiLabel_counts):
    """Draw a bar chart of how many jokes carry each number of labels.

    Args:
        mlc_labels: Bar labels, one per entry of ``multiLabel_counts``.
        multiLabel_counts: Series whose values are the bar heights.
    """
    counts = multiLabel_counts.values

    sns.set(font_scale = 2)
    plt.figure(figsize=(15,8))

    # x and y are passed by keyword: recent seaborn releases reject positional data vectors.
    ax = sns.barplot(x=mlc_labels, y=counts)

    plt.title("Jokes having multiple labels ")
    plt.ylabel('Number of jokes', fontsize=18)
    plt.xlabel('Number of labels', fontsize=18)

    # Write each bar's count just above it.
    for rect, label in zip(ax.patches, counts):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

    plt.show()

In [23]:
def print_multiple_label(df):
    """Print how many rows of ``df`` carry each number of labels.

    The labels are the columns from position 4 onward. Two things are
    printed: the value counts of the per-row label totals, and the list of
    those totals prefixed with 'L'.
    """
    labels_per_row = df.iloc[:, 4:].sum(axis=1)
    multiLabel_counts = labels_per_row.value_counts()
    print(multiLabel_counts)
    mlc_labels = ['L' + text for text in map(str, multiLabel_counts.index)]
    print(mlc_labels)

    # Plotting is disabled:
    #plot_multiple_label(mlc_labels, multiLabel_counts)

In [24]:
# Labels-per-joke counts for the full corpus, the original training set
# and the shuffled training set, in that order.
print_multiple_label(df)
print_multiple_label(train_org)
print_multiple_label(train)

1    3365
dtype: int64
['L1']
1    1691
dtype: int64
['L1']
1    1691
dtype: int64
['L1']


## Write the result to a file

In [25]:
# Save the training set with its shuffled labels as a tab-separated file.
# DataFrame.to_csv writes to the path itself, so the encoding is pinned to UTF-8
# (the corpus contains Chinese text) and line endings are not translated a second
# time by a text-mode file handle.
train.to_csv(train_OneHot_random_path, sep='\t', index=False, encoding='utf-8')