In [None]:
# Mount Google Drive inside the Colab runtime; the mount point below is the
# prefix used by every data path in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data folder directory: root under which every dataset's sub-path is joined
main_dir = '/content/drive/MyDrive/Colab Notebooks/Data/'

# Cornell Movie-Dialog Corpus

In [None]:
import ast
import os

import pandas as pd

In [None]:
# Corpus files, given relative to the data folder
line_sub_dir = 'Cornell_Movie_Dialog_Corpus/movie_lines.txt'
conv_sub_dir = 'Cornell_Movie_Dialog_Corpus/movie_conversations.txt'

# Resolve both relative paths against main_dir in one pass
full_line_file_path, full_conv_file_path = (
    os.path.join(main_dir, relative_path)
    for relative_path in (line_sub_dir, conv_sub_dir)
)

In [None]:
# Both corpus files delimit their fields with the literal token " +++$+++ ".
# pandas interprets a multi-character `sep` as a regular expression, so the
# '+' and '$' characters are escaped. The pattern is a raw string so the
# backslashes reach the regex engine untouched instead of being parsed as
# (invalid) Python string escapes, which newer interpreters warn about.
CORNELL_FIELD_SEP = r' \+\+\+\$\+\+\+ '

# Load movie lines, indexed by line ID so an utterance can be looked up by ID
lines_columns = ['lineID', 'characterID', 'movieID', 'characterName', 'text']
df_lines = pd.read_csv(
    full_line_file_path,
    sep=CORNELL_FIELD_SEP,
    engine='python',  # regex separators require the Python parsing engine
    names=lines_columns,
    encoding='ISO-8859-1',
).set_index('lineID')

# Load conversations; 'utteranceIDs' is kept as the raw string read from the file
conversations_columns = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']
df_conversations = pd.read_csv(
    full_conv_file_path,
    sep=CORNELL_FIELD_SEP,
    engine='python',
    names=conversations_columns,
    encoding='ISO-8859-1',
)


In [None]:
# Build (input, response) pairs from every two consecutive utterances of a
# conversation.

# Map line ID -> text once, so each lookup in the loop is a plain dict access
# instead of a per-row DataFrame .loc call.
text_by_line_id = df_lines['text'].to_dict()

conversations = []

for raw_utterance_ids in df_conversations['utteranceIDs']:
    # Convert the utterance IDs from string to list of IDs.
    # ast.literal_eval only parses Python literals, so the file's contents are
    # never executed as code (which eval() would do).
    utterance_ids = ast.literal_eval(raw_utterance_ids)
    # Pair each utterance with the one that follows it
    for input_id, response_id in zip(utterance_ids, utterance_ids[1:]):
        input_text = text_by_line_id.get(input_id)
        response_text = text_by_line_id.get(response_id)
        # Skip the pair when either ID is unknown or has no usable text.
        # A missing text field is loaded by pandas as NaN (a float), which is
        # truthy and has no .strip(), hence the explicit string check.
        if (
            isinstance(input_text, str)
            and isinstance(response_text, str)
            and input_text
            and response_text
        ):
            conversations.append([input_text.strip(), response_text.strip()])

# Convert to DataFrame
df_convo = pd.DataFrame(conversations, columns=['input', 'response'])

# Check the reconstructed conversations
df_convo.head()



Unnamed: 0,input,response
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.


In [None]:
# Display the (input, response) pair frame as the cell's output
df_convo

Unnamed: 0,input,response
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.
...,...,...
221277,"Your orders, Mr Vereker?",I'm to take the Sikali with the main column to...
221278,I'm to take the Sikali with the main column to...,Lord Chelmsford seems to want me to stay back ...
221279,Lord Chelmsford seems to want me to stay back ...,I think Chelmsford wants a good man on the bor...
221280,"Well I assure you, Sir, I have no desire to cr...","And I assure you, you do not In fact I'd be ob..."


# Stanford Question Answering Dataset

In [None]:
import os
import json
import pandas as pd

In [None]:
sub_dir = 'Stanford_QA/train-v1.1.json'
full_file_path = os.path.join(main_dir, sub_dir)

# Load the data. The encoding is stated explicitly so that reading the JSON
# text does not depend on the platform's default locale encoding.
with open(full_file_path, 'r', encoding='utf-8') as file:
    squad_data = json.load(file)

# Inspect the structure of the data
print(squad_data.keys())

dict_keys(['data', 'version'])


In [None]:
# Use the first paragraph of the first document as a worked example
first_document = squad_data['data'][0]
sample_paragraph = first_document['paragraphs'][0]
context, qas = sample_paragraph['context'], sample_paragraph['qas']

# Show the passage followed by its first question/answer record
print("Context:", context)
print("\nQ&A Sample:", qas[0])

Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Q&A Sample: {'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'id': '5733be284776f41900661182'}


In [None]:
# Four parallel lists, one entry per question
titles, contexts, questions, answers = [], [], [], []

for document in squad_data['data']:
    title = document['title']
    for paragraph in document['paragraphs']:
        context = paragraph['context']
        paragraph_qas = paragraph['qas']
        # Title and context repeat for every question asked about the paragraph
        titles += [title] * len(paragraph_qas)
        contexts += [context] * len(paragraph_qas)
        questions += [qa['question'] for qa in paragraph_qas]
        # For simplicity, all answer texts of a question are kept together as a list
        answers += [[a['text'] for a in qa['answers']] for qa in paragraph_qas]

# Assemble the lists into a DataFrame, one column per list
column_names = ['title', 'context', 'question', 'answer']
df = pd.DataFrame(dict(zip(column_names, [titles, contexts, questions, answers])))

# Preview the first rows
df.head()


Unnamed: 0,title,context,question,answer
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,[Saint Bernadette Soubirous]
1,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,[a copper statue of Christ]
2,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,[the Main Building]
3,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,[a Marian place of prayer and reflection]
4,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,[a golden statue of the Virgin Mary]


In [None]:
# Display the flattened SQuAD frame (one row per question) as the cell's output
df

Unnamed: 0,title,context,question,answer
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,[Saint Bernadette Soubirous]
1,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,[a copper statue of Christ]
2,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,[the Main Building]
3,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,[a Marian place of prayer and reflection]
4,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,[a golden statue of the Virgin Mary]
...,...,...,...,...
87594,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,[Oregon]
87595,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,[Rangoon]
87596,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,[Minsk]
87597,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,[1975]


# Ubuntu Dialogue Corpus

In [None]:
import os
import pandas as pd

In [None]:
# Dialogue CSV, located relative to the data folder
sub_dir = 'Ubuntu_Dialogue_Corpus/Ubuntu-dialogue-corpus/dialogueText_301.csv'
full_file_path = os.path.join(main_dir, sub_dir)

# Read the file with pandas' default CSV settings and display the result
df_ubuntu = pd.read_csv(filepath_or_buffer=full_file_path)
df_ubuntu

Unnamed: 0,folder,dialogueID,date,from,to,text
0,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,,any ideas why java plugin takes so long to load?
1,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.4?
2,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,crimsun,yes
3,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.5 loads _much_ faster
4,301,1.tsv,2004-11-23T11:50:00.000Z,stuNNed,crimsun,noneus: how can i get 1.5 is there a .deb some...
...,...,...,...,...,...,...
16587825,32,1783.tsv,2007-11-15T03:38:00.000Z,koyo001,,thanks
16587826,32,1783.tsv,2007-11-15T03:39:00.000Z,koyo001,,does anyone know something
16587827,32,1783.tsv,2007-11-15T03:39:00.000Z,neverblue,,"no, no one knows everything"
16587828,32,1783.tsv,2007-11-15T03:40:00.000Z,koyo001,ikonia,the camera doesnt work
