In [1]:
import pandas as pd
import sqlite3
from convokit import Corpus

In [2]:
# Sampling limits: how many conversations are kept, and how many top-level
# utterances are kept for each of them.
UTTERANCES_PER_CONVERSATION = 10
CONVERSATIONS_PER_SUBREDDIT = 100

# Folder holding the corpus files. The trailing slash matters: file names are
# appended to this value by plain string concatenation.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
PARENT_DIRECTORY = "/home/orlando/anime/"

In [3]:
# Load the conversation metadata; each top-level JSON key becomes one row (orient="index")
conversations_df = pd.read_json(
    f"{PARENT_DIRECTORY}conversations.json",
    orient="index",
)

In [4]:
# Share of missing values per column, as a percentage rounded to two decimals
conversations_df.isna().sum().div(len(conversations_df)).mul(100).round(2)

title                 0.00
num_comments          0.00
domain                0.00
timestamp             0.00
subreddit             0.00
gilded                0.00
gildings             96.03
stickied              0.00
author_flair_text     0.00
dtype: float64

In [5]:
# Absolute number of missing values per column
conversations_df.isna().sum()

title                     0
num_comments              0
domain                    0
timestamp                 0
subreddit                 0
gilded                    0
gildings             525741
stickied                  0
author_flair_text         0
dtype: int64

In [6]:
# "gildings" is NaN for almost every row, so the column carries no usable information
conversations_df = conversations_df.drop("gildings", axis="columns")

In [7]:
# Make sure the "timestamp" column holds pandas datetimes
# NOTE(review): the unit of the raw values is not visible here - confirm they parse as intended
conversations_df = conversations_df.assign(
    timestamp=pd.to_datetime(conversations_df["timestamp"])
)

In [8]:
# Remove conversations that contain a video or a clip.
# The pattern is a raw string (so "\[" is not an invalid Python escape) and uses a
# non-capturing group, because str.contains warns when the pattern has capture groups.
conversations_df = conversations_df[~conversations_df["title"].str.contains(r"\[(?:Video|Clip)\]")]

  conversations_df = conversations_df[~conversations_df["title"].str.contains("\[(Video|Clip)\]")]


In [9]:
# Title should contain "?" to be a question.
# Matched as a literal (regex=False), so "?" needs no escaping and no invalid
# "\?" escape sequence appears in the Python string.
conversations_df = conversations_df[conversations_df["title"].str.contains("?", regex=False)]

In [10]:
# Rank conversations by comment count (ties broken by later timestamp first)
# and keep the index labels (conversation ids) of the first CONVERSATIONS_PER_SUBREDDIT rows
conversation_ids_most_comments = (
    conversations_df
    .sort_values(by=["num_comments", "timestamp"], ascending=[False, False])
    .iloc[:CONVERSATIONS_PER_SUBREDDIT]
    .index
)

In [11]:
# Display the selected conversation ids
conversation_ids_most_comments

Index(['16tggn', '4m201p', '4fs5z9', '4cb1ct', '3e3awa', '6lk06s', '84dkft',
       '3yk0pd', '83v81r', '8xcnap', '7021ab', '4ve50v', '7x2emz', '6x0jex',
       '42m1md', '6hikxv', '2dzbow', '2aoaar', '3fh2cl', '78xyqr', '3vxji7',
       '1m1kwj', '6ms1xx', '3622dn', '3cjk8w', '7m6k8x', '4gcbrv', '6m6cf3',
       '1kfbra', '8o9qlb', '3t2a3j', '6jo1lf', '7b43uk', '4fz9ln', '6s652d',
       '9d0l62', '32murt', '82rd9m', '6ch31h', '56oqlc', '3r2hb3', '3xjnno',
       '4ctufz', '7cgrw0', '6u4rr3', '4cq3ev', '2zuh2t', '7bqrr0', '4opi7i',
       '6sm2lq', '38gkq4', '2hkia6', '3bi2gp', '8qt4px', '3c3kb7', '7vg4fx',
       '6fywh5', '5emtfa', '43ueqv', '4n1e1z', '43xyt8', '2sd9pg', '3fmgz7',
       '8y0mj5', '37n0z9', '2sf9hk', '5u9udi', '7p11vk', '34pnfh', '695gsj',
       '6rh9j8', '81u7dk', '4jvxom', '651gv9', '1qz18p', '1oxghx', '6ur3qa',
       '36rrm7', '13r5gn', '2wb8qn', '5o1d1t', '3jwt12', '372f3h', '32t8vn',
       '4jmds4', '9ifdl6', '9aigk9', '50j2m5', '7a9flf', '379e1q', '7kncn2',

In [12]:
# Create a file to be accessed from the bash script.
# The ids are written as a plain Python list literal: str() of a pandas Index is a
# display repr (it carries dtype information and may be abbreviated for long
# indexes), whereas the list form always contains every id between "[" and "]",
# which is the part the shell step extracts.
with open(PARENT_DIRECTORY + "conversation_ids_most_comments.txt", "w") as ids_file:
    ids_file.write(str(conversation_ids_most_comments.tolist()))

Shell script that filters the utterances, keeping only those that belong to the conversations with the most comments

In [13]:
%%sh

# Same value as PARENT_DIRECTORY in the Python configuration cell; keep the two in sync.
PARENT_DIRECTORY=/home/orlando/anime/

# Turn the saved id listing into a regex alternation of the form (id1|id2|...):
#   1. perl s/\n//g            - join all lines of the file into a single line
#   2. grep -Po (?<=\[)[^]]+   - keep only the text after "[" up to the next "]"
#   3. perl s/( |')//g         - delete spaces and single quotes
#   4. perl s/,/|/g            - turn the comma separators into "|"
#   5. perl s/^(.)/(\1/g       - put "(" in front of the first character
#   6. perl s/(.)$/\1)/g       - put ")" after the last character
conversation_ids_most_comments_formatted="$(cat ${PARENT_DIRECTORY}conversation_ids_most_comments.txt | perl -p -e 's/\n//g' | grep -Po '(?<=\[)[^]]+' | perl -p -e "s/( |')//g" | perl -p -e 's/,/|/g' | perl -p -e 's/^(.)/(\1/g' | perl -p -e 's/(.)$/\1)/g')"
# Copy every line of utterances.jsonl whose "root" field is one of those ids into a new file.
grep -P "\"root\": ?\"$conversation_ids_most_comments_formatted\"" ${PARENT_DIRECTORY}utterances.jsonl > ${PARENT_DIRECTORY}utterances_most_comments.jsonl

In [14]:
# Build a corpus from the filtered utterances file and expose its utterances as a DataFrame
corpus = Corpus(filename=f"{PARENT_DIRECTORY}utterances_most_comments.jsonl")
utterances_df = corpus.get_utterances_dataframe()







IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
# Share of missing values per column, as a percentage rounded to two decimals
utterances_df.isna().sum().div(len(utterances_df)).mul(100).round(2)

timestamp                  0.00
text                       0.00
speaker                    0.00
reply_to                   0.09
conversation_id            0.00
meta.score                 0.00
meta.top_level_comment     0.09
meta.retrieved_on          0.00
meta.gilded                0.00
meta.gildings             97.40
meta.subreddit             0.00
meta.stickied              0.00
meta.permalink             0.00
meta.author_flair_text     0.00
vectors                    0.00
dtype: float64

In [16]:
# Absolute number of missing values per column
utterances_df.isna().sum()

timestamp                      0
text                           0
speaker                        0
reply_to                     100
conversation_id                0
meta.score                     0
meta.top_level_comment       100
meta.retrieved_on              0
meta.gilded                    0
meta.gildings             114358
meta.subreddit                 0
meta.stickied                  0
meta.permalink                 0
meta.author_flair_text         0
vectors                        0
dtype: int64

In [17]:
# "meta.gildings" is NaN for almost every row, so the column is discarded
utterances_df = utterances_df.drop("meta.gildings", axis="columns")

In [18]:
# Utterances without a "reply_to" value are set aside as the starter utterances;
# the working frame keeps only the utterances that reply to something.
starter_utterances_df = utterances_df.loc[utterances_df["reply_to"].isna()]
utterances_df = utterances_df.loc[utterances_df["reply_to"].notna()]

In [19]:
# Keep a row only if none of its cells equals "[deleted]" or "[removed]"
utterances_df = utterances_df.loc[
    ~(
        utterances_df.eq("[deleted]").any(axis=1)
        | utterances_df.eq("[removed]").any(axis=1)
    )
]

In [20]:
# Discard utterances whose text contains an http:// or https:// link
utterances_df = utterances_df.loc[
    ~utterances_df["text"].str.contains("https?://", regex=True)
]

In [21]:
# Distinct conversation ids still present after the filters above
unique_conversation_ids = pd.unique(utterances_df["conversation_id"])

In [22]:
# Display the conversation ids that still have utterances
unique_conversation_ids

array(['13r5gn', '16tggn', '1kfbra', '1m1kwj', '1oxghx', '1qz18p',
       '2aoaar', '2dzbow', '2hkia6', '2ok8si', '2sd9pg', '2sf9hk',
       '2wb8qn', '2zuh2t', '32murt', '32t8vn', '34pnfh', '35v62w',
       '3622dn', '36rrm7', '372f3h', '379e1q', '37n0z9', '38gkq4',
       '3bi2gp', '3c3kb7', '3cjk8w', '3e3awa', '3fh2cl', '3fmgz7',
       '3fr4hs', '3jwt12', '3r2hb3', '3t2a3j', '3vxji7', '3xjnno',
       '3yk0pd', '42m1md', '43ueqv', '43xyt8', '4b3amy', '4cb1ct',
       '4cq3ev', '4ctufz', '4fs5z9', '4fz9ln', '4gcbrv', '4gcl1c',
       '4gq22k', '4jmds4', '4jvxom', '4m201p', '4n1e1z', '4opi7i',
       '4ve50v', '50j2m5', '56oqlc', '5emtfa', '5n70re', '5o1d1t',
       '5u9udi', '5uc6hq', '651gv9', '695gsj', '6ch31h', '6fywh5',
       '6hikxv', '6jo1lf', '6jtrw1', '6lk06s', '6m6cf3', '6ms1xx',
       '6rh9j8', '6s652d', '6sm2lq', '6u4rr3', '6ur3qa', '6x0jex',
       '7021ab', '78xyqr', '7a9flf', '7b43uk', '7bqrr0', '7cgrw0',
       '7kncn2', '7m6k8x', '7p11vk', '7vg4fx', '7x2emz', '81u7

In [23]:
# Remove nested utterances: keep only top-level utterances (those replying directly
# to the conversation id) and, per conversation, only the UTTERANCES_PER_CONVERSATION
# with the highest score (later timestamp first on equal score).
#
# The rows are selected by index label instead of assigning a smaller frame back
# through .loc: that assignment blanks the unwanted rows to NaN via index alignment,
# which upcasts column dtypes and rescans the whole frame once per conversation.
# Selecting keeps the original dtypes and row order in a single pass.
top_level_mask = utterances_df["reply_to"] == utterances_df["conversation_id"]
kept_utterance_ids = (
    utterances_df[top_level_mask]
    .sort_values(by=["meta.score", "timestamp"], ascending=False)
    .groupby("conversation_id", sort=False)
    .head(UTTERANCES_PER_CONVERSATION)
    .index
)
utterances_df = utterances_df[utterances_df.index.isin(kept_utterance_ids)]

In [24]:
# Stack the starter utterances on top of the remaining utterances
utterances_df = pd.concat(
    (starter_utterances_df, utterances_df),
    axis="index",
)

In [25]:
# Rows consisting solely of NaN are left over from the top-level selection; discard them
utterances_df = utterances_df.dropna(axis="index", how="all")

In [26]:
# The "vectors" column cannot be stored in the database, so it is left out
utterances_df = utterances_df.drop("vectors", axis="columns")

In [27]:
# Sanity check: every conversation must contribute its starter utterance (the "+ 1")
# plus UTTERANCES_PER_CONVERSATION top-level replies, and exactly one starter
# utterance (a row without "reply_to") must exist per conversation.
expected_row_count = CONVERSATIONS_PER_SUBREDDIT * (UTTERANCES_PER_CONVERSATION + 1)
starter_count = int(utterances_df["reply_to"].isnull().sum())
if len(utterances_df.index) == expected_row_count and starter_count == CONVERSATIONS_PER_SUBREDDIT:
    print("utterances_df is correct")
else:
    # Reported for a wrong total row count as well as for a wrong starter count,
    # so a failed check can never pass silently.
    print("utterances_df is incorrect")

utterances_df is correct


In [28]:
# Open the SQLite file that receives the cleaned tables
database = f"{PARENT_DIRECTORY}reddit_db.sqlite"
conn = sqlite3.connect(database)

In [29]:
# Persist both frames as tables. if_exists="replace" lets the notebook be re-run
# against an existing database file (the default, "fail", raises when the table is
# already there). The connection is closed even if one of the writes raises.
try:
    conversations_df.to_sql(name="Conversations", con=conn, if_exists="replace")
    utterances_df.to_sql(name="Utterances", con=conn, if_exists="replace")
finally:
    conn.close()