In [1]:
import pandas as pd

In [2]:
qajokes = pd.read_csv("../data/qajokes1.1.2.csv", usecols=['Question', 'Answer'])
nosubj = pd.read_csv('../data/t_nosubject.csv', usecols=['Question', 'Answer'])

In [3]:
print(len(qajokes))
print(len(nosubj))

75114
32120


In [4]:
wordball = pd.concat([qajokes, nosubj], ignore_index=True)
wordball.head()

Unnamed: 0,Question,Answer
0,What's the best anti diarrheal prescription?,Mycheexarphlexin
1,What do you call a person who is outside a doo...,Matt
2,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
3,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe
4,Why was the Ethiopian baby crying?,He was having a mid-life crisis


In [5]:
wordball.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107234 entries, 0 to 107233
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Question  107234 non-null  object
 1   Answer    103101 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


Preprocessing

In [6]:
wordball = wordball.applymap(str)

In [7]:
def distinct_chars(data, cols):
    if cols is None:
        cols = list(data.columns)
        
    questions = ' '.join(data[cols[0]])
    answers = ' '.join(data[cols[1]])
    
    dis_chars = set(questions + answers)
    print("Number of distinct characters used in the dataset: {}".format(len(dis_chars)))
    
    dis_chars = list(dis_chars)
    
    digits = [char for char in dis_chars if char.isdigit()]
    alphabets = [char for char in dis_chars if char.isalpha()]
    special = [char for char in dis_chars if char not in digits and alphabets]
    
    digits = sorted(digits)
    alphabets = sorted(alphabets)
    special = sorted(special)
    
    print("Digits: {}".format(digits))
    print("Alphabets: {}".format(alphabets))
    print("Special characters: {}".format(special))

In [8]:
distinct_chars(wordball, ['Question', 'Answer'])

Number of distinct characters used in the dataset: 120
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Alphabets: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'è', 'é', 'ñ', 'ó', 'ö', 'ü']
Special characters: [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '¡', '¤', '«', '°', '»', '¿', 'ß', 'è', 'é', 'ñ', 'ó', 'ö', 'ü', '\u200b', '–', '—', '‘', '’', '“', '”', '…', '™', '\ufeff',

In [15]:
import re 

def clean_text(text):
    text = str(text)
    
    text = re.sub('\"', '\'', text)
    text = re.sub("“", '\'', text)
    text = re.sub("”", '\'', text)
    text = re.sub('’', '\'', text)
    text = re.sub('\[', '(', text)
    text = re.sub('\]', ')', text)
    text = re.sub('\{', '(', text)
    text = re.sub('\}', ')', text)
    
    text = re.sub("([?.!,:;'?!+\-*/=%$@&()])", r" \1 ", text)
    pattern = re.compile('[^a-zA-Z0-9_\.\,\:\;\'\?\!\+\-\*\/\=\%\$\@\&\(\)]')
    text = re.sub(pattern, ' ', text)
    
    text = text.lower()
    text = re.sub(' +', ' ', text)
    text = text.strip()
    
    return text

In [16]:
def print_question_answer(df, index, cols):
    print(f"Question: ({index})")
    print(df.loc[index][cols[0]])
    print(f"Answer: ({index})")
    print(df.loc[index][cols[1]])

In [17]:
print_question_answer(wordball, 94351, ['Question', 'Answer'])

Question: (94351)
(thomas edison prank call ) is your refrigerator running
Answer: (94351)
'yes . . ' you 're welcome ! *click *


In [18]:
wordball = wordball.applymap(clean_text)

In [19]:
print_question_answer(wordball, 94351, ['Question', 'Answer'])

Question: (94351)
( thomas edison prank call ) is your refrigerator running
Answer: (94351)
' yes . . ' you ' re welcome ! * click *


In [20]:
def preprocess(data, cols):
    data_len_before = len(data)
    print(f"Number of examples before removing duplicates: {data_len_before}")
    data = data.drop_duplicates(keep='first')
    data_len_after = len(data)
    print(f"Number of examples after removing duplicates: {data_len_after}")
    print(f"Number of duplicates removed: {data_len_before-data_len_after}")
    
    if cols is None:
        cols = list(data.columns)
    
    data_len_before = len(data)
    print(f"Number of examples before removing rows with empty strings: {data_len_before}")
    data = data[(data[cols[0]] != "") & (data[cols[1]] != "")]
    data_len_after = len(data)
    print(f"Number of examples after removing the empty strings: {data_len_after}")
    print(f"Number of removed empty strings: {data_len_before-data_len_after}")
    
    def accepted_length(qa_pair):
        q_len = len(qa_pair[0].split(' '))
        a_len = len(qa_pair[1].split(' '))
        if (q_len <= 30) & ((a_len <= 30) & (len(qa_pair[1]) > 1)):
            return True
        return False 
    
    data_len_before = len(data)
    print(f"Number of examples before removing rows with more than 30 words: {data_len_before}")
    accepted_mask = data.apply(accepted_length, axis=1)
    data = data[accepted_mask]
    data_len_after = len(data)
    print(f"Number of examples after removing rows with more than 30 words: {data_len_after}")
    print(f"Number of removed rows with more than 30 words: {data_len_before-data_len_after}")
    
    return data

In [21]:
wordball = preprocess(wordball, ['Question', 'Answer'])

Number of examples before removing duplicates: 107234
Number of examples after removing duplicates: 107144
Number of duplicates removed: 90
Number of examples before removing rows with empty strings: 107144
Number of examples after removing the empty strings: 107054
Number of removed empty strings: 90
Number of examples before removing rows with more than 30 words: 107054
Number of examples after removing rows with more than 30 words: 101712
Number of removed rows with more than 30 words: 5342


In [22]:
len(wordball)

101712

In [23]:
distinct_chars(wordball, ['Question', 'Answer'])

Number of distinct characters used in the dataset: 56
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Alphabets: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Special characters: [' ', '!', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Second Dataset

In [47]:
reddit_jokes = pd.read_csv('../data/jokes_score_name_clean.csv', usecols=['q', 'a'])

In [48]:
reddit_jokes.rename(columns={'q': 'Question', 'a': 'Answer'}, inplace=True)

In [49]:
reddit_jokes.head()

Unnamed: 0,Question,Answer
0,I enjoy working in a slaughterhouse..,Everything is so cut and dry.
1,What do you call a soldier who survives Mustar...,A seasoned veteran.
2,I really like white dwarf stars...,...My favorite is Peter Dinklage.
3,Knock knock. Whose their?,The grammar police.
4,What breaks when you give it to a twelve year ...,Her hips.


In [50]:
len(reddit_jokes)

133328

In [51]:
reddit_jokes = reddit_jokes.applymap(str)

In [52]:
distinct_chars(reddit_jokes, reddit_jokes.columns)

Number of distinct characters used in the dataset: 567
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '²', '³', '¹', '₂', '₄']
Alphabets: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'µ', 'º', 'Ä', 'Ñ', 'Ö', 'ß', 'à', 'á', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'þ', 'ā', 'ē', 'ě', 'ı', 'ń', 'ō', 'œ', 'ƃ', 'Ɔ', 'ǎ', 'ǐ', 'ǒ', 'ǚ', 'ǝ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɟ', 'ɡ', 'ɥ', 'ɪ', 'ɯ', 'ɴ', 'ɹ', 'ɾ', 'ʇ', 'ʌ', 'ʍ', 'ʎ', 'ʏ', 'ʖ', 'ʘ', 'ʞ', 'ʟ', 'ʰ', 'ʲ', 'ʳ', 'ʷ', 'ʸ', 'ˈ', 'ˢ', 'Δ', 'Π', 'Σ', 'ί', 'α', 'κ', 'λ', 'μ', 'ν', 'π', 'ρ', 'ω', 'ϱ', 'А', 'Д', 'К', 'Т', 'а', 'е', 'л', 'м', 'о', 'т', 'ш', 'я', 'Ԁ', 'א', 'ב', 'ג', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'ן', 'נ', 'ע', 'פ',

In [53]:
def clean_reddit_tags(data, cols):
    if cols is None:
        cols = list(data.columns)
        
    data_copy = data.copy()
    data_copy[cols[0]] = data_copy[cols[0]].str.lower()
    data_copy[cols[1]] = data_copy[cols[1]].str.lower()
    
    mask_set = ['[removed]', '[deleted]', '[censored]']
    mask = data_copy.apply(
        lambda qa_pair: False if (qa_pair[0] in mask_set) | (qa_pair[1] in mask_set) else True,
        axis=1
    )
    before = len(data)
    data = data[mask]
    after = len(data)
    print(f"Number of rows dropped with [deleted], [removed] or [censored] tags: {before-after}")
    
    def sub_tag(pair):
        p = re.compile("\[(.*?)\]")
        pair[0] = re.sub(p, " ", pair[0])
        pair[1] = re.sub(p, ' ', pair[1])
        
        return pair 
    
    data = data.apply(sub_tag, axis=1)
    return data

In [54]:
reddit_jokes = clean_reddit_tags(reddit_jokes, reddit_jokes.columns)

Number of rows dropped with [deleted], [removed] or [censored] tags: 211


In [55]:
reddit_jokes.head()

Unnamed: 0,Question,Answer
0,I enjoy working in a slaughterhouse..,Everything is so cut and dry.
1,What do you call a soldier who survives Mustar...,A seasoned veteran.
2,I really like white dwarf stars...,...My favorite is Peter Dinklage.
3,Knock knock. Whose their?,The grammar police.
4,What breaks when you give it to a twelve year ...,Her hips.


In [56]:
reddit_jokes = reddit_jokes.applymap(clean_text)

In [57]:
reddit_jokes = preprocess(reddit_jokes, reddit_jokes.columns)

Number of examples before removing duplicates: 133117
Number of examples after removing duplicates: 128037
Number of duplicates removed: 5080
Number of examples before removing rows with empty strings: 128037
Number of examples after removing the empty strings: 127947
Number of removed empty strings: 90
Number of examples before removing rows with more than 30 words: 127947
Number of examples after removing rows with more than 30 words: 89002
Number of removed rows with more than 30 words: 38945


In [58]:
distinct_chars(reddit_jokes, reddit_jokes.columns)

Number of distinct characters used in the dataset: 56
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Alphabets: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Special characters: [' ', '!', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Third Dataset

In [59]:
jokes = pd.read_csv('../data/jokes.csv', usecols=['Question', 'Answer'])
jokes.head()

Unnamed: 0,Question,Answer
0,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,What do you call a person who is outside a doo...,Matt
3,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
4,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe


In [61]:
jokes = jokes.applymap(str)

In [62]:
distinct_chars(jokes, jokes.columns)

Number of distinct characters used in the dataset: 237
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', '౪', '₄']
Alphabets: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'È', 'É', 'Ñ', 'ß', 'á', 'ä', 'å', 'æ', 'è', 'é', 'ê', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'Đ', 'ı', 'ō', 'œ', 'ʃ', 'ʅ', 'ʖ', 'Α', 'Μ', 'Ω', 'ά', 'ε', 'ζ', 'η', 'θ', 'κ', 'μ', 'π', 'ρ', 'ς', 'С', 'б', 'е', 'и', 'н', 'р', 'т', 'ь', 'ॐ', 'ಠ', 'ứ', 'づ', 'ツ', '丁', '二', '喲', '媽', '崇', '常', '清', '胖', '董', '這', '麼', '빵']
Special characters: [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W'

In [63]:
def remove_prefixes(pair):
    if ('Q:' in pair[0]) and ('A:' in pair[1]) and ('Q:' not in pair[1]):
        pair[0] = pair[0].replace('Q:', '')
        pair[1] = pair[1].replace('A:', '')
    elif ('A:' in pair[1]) and ('Q:' in pair[1]):
        pair[0] = pair[0].replace('Q:', '')
        q_start = pair[1].find('Q:') + 2
        q_end = pair[1].find('A:')
        q_text = pair[1][q_start:q_end].strip()
        if q_text == pair[0].strip():
            pair[1] = pair[1][q_end+2:].strip()
    
    return pair 

In [64]:
jokes = jokes.apply(remove_prefixes, axis=1)

In [65]:
jokes.head()

Unnamed: 0,Question,Answer
0,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,What do you call a person who is outside a doo...,Matt
3,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
4,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe


In [66]:
jokes = jokes.applymap(clean_text)

In [67]:
jokes = preprocess(jokes, jokes.columns)

Number of examples before removing duplicates: 38269
Number of examples after removing duplicates: 38187
Number of duplicates removed: 82
Number of examples before removing rows with empty strings: 38187
Number of examples after removing the empty strings: 38166
Number of removed empty strings: 21
Number of examples before removing rows with more than 30 words: 38166
Number of examples after removing rows with more than 30 words: 37086
Number of removed rows with more than 30 words: 1080


In [68]:
distinct_chars(jokes, jokes.columns)

Number of distinct characters used in the dataset: 56
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Alphabets: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Special characters: [' ', '!', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '=', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Final compilation

In [69]:
dataset = pd.concat([wordball, reddit_jokes, jokes], ignore_index=True)
dataset.head()

Unnamed: 0,Question,Answer
0,what ' s the best anti diarrheal prescription ?,mycheexarphlexin
1,what do you call a person who is outside a doo...,matt
2,which star trek character is a member of the m...,jean - luc pickacard
3,what ' s the difference between a bullet and a...,a bullet doesn ' t miss harambe
4,why was the ethiopian baby crying ?,he was having a mid - life crisis


In [70]:
len(dataset)

227800

In [71]:
dataset = dataset.drop_duplicates(keep='first')

In [72]:
dataset.dropna(inplace=True)

In [73]:
len(dataset)

175672

In [74]:
dataset = dataset.applymap(str)

In [75]:
dataset.to_csv('../data/dataset.csv')

In [None]:
https://github.com/sarvasvarora/sarcasm-generator/blob/main/dataset.py