In [None]:
import sys
path_to_scripts = '../scripts'
sys.path.append(path_to_scripts)
import os
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import json
import argparse
from colorama import Fore, Style, init
import shutil
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Filesystem layout. root_dir is an absolute path; adjust it when running elsewhere.
root_dir = "/work3/s174159/LLM_Thesis"
data_dir = Path(root_dir) / "data"
processed_data_dir = data_dir / "combined_dataset"

# Model settings.
# NOTE(review): bert-base-cased ships with 512 position embeddings; confirm that a
# max_seq_length of 1024 is intended for whatever consumes these two values.
model_name = "bert-base-cased"
max_seq_length = 1024


In [None]:
# Load the combined dataset (a JSON array of records, not JSON-lines) into a DataFrame.
combined_df = pd.read_json(
    data_dir / "combined_dataset" / "combined_dataset.json",
    orient="records",
    lines=False,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows of combined_df come from each 'org_dataset' value.
# countplot returns the Axes it drew on, so the labels are set directly on it.
sns.countplot(x='org_dataset', data=combined_df).set(
    xlabel='Dataset',
    ylabel='Count',
    title='Distribution of Datasets in Combined Dataset',
)
plt.show()


In [None]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd

# Dataset selector: 'combined' (all rows) followed by every distinct 'org_dataset' value.
dataset_dropdown = widgets.Dropdown(
    description='Dataset:',
    value='combined',
    options=['combined', *combined_df['org_dataset'].unique().tolist()],
    disabled=False,
)

# Slider choosing how many of the most frequent relations are shown (1-50, default 10).
relation_slider = widgets.IntSlider(
    description='Relations:',
    min=1,
    max=50,
    step=1,
    value=10,
    disabled=False,
)

def update_plot(dataset, num_relations):
    """Plot the most frequent relation labels of the selected dataset as a bar chart.

    Args:
        dataset: 'combined' to use every row of ``combined_df``; otherwise a value of
            its 'org_dataset' column, used to filter the rows.
        num_relations: Number of most frequent relations to display.
    """
    if dataset == 'combined':
        selected_df = combined_df
    else:
        selected_df = combined_df[combined_df['org_dataset'] == dataset]

    # Only relation labels are plotted here, so walk the 'labels' column directly
    # instead of materialising every row (and the unused entity types) via iterrows().
    relations = [
        rel['r']
        for labels in tqdm(selected_df['labels'], total=selected_df.shape[0], desc='Processing data')
        for rel in labels
    ]

    # value_counts() sorts by descending frequency, so head() yields the top entries.
    relation_counts = pd.Series(relations).value_counts()
    top_relations = relation_counts.head(num_relations).reset_index()
    top_relations.columns = ['Relation', 'Count']

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Relation', y='Count', data=top_relations)
    ax.set(
        xlabel='Relation',
        ylabel='Count',
        title='Distribution of Relations in {}'.format(dataset),
    )
    # Relation names can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

# Bind the dropdown and slider to update_plot's 'dataset' and 'num_relations' arguments.
interactive_plot = widgets.interactive(
    update_plot,
    dataset=dataset_dropdown,
    num_relations=relation_slider,
)

# Render the controls together with the plot output.
display(interactive_plot)


In [None]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd

# Dataset selector: 'combined' (all rows) followed by every distinct 'org_dataset' value.
dataset_dropdown = widgets.Dropdown(
    description='Dataset:',
    value='combined',
    options=['combined', *combined_df['org_dataset'].unique().tolist()],
    disabled=False,
)

# Slider choosing how many of the most frequent entity types are shown (1-50, default 10).
# The variable keeps the name 'relation_slider' although it is labelled 'Entities:'.
relation_slider = widgets.IntSlider(
    description='Entities:',
    min=1,
    max=50,
    step=1,
    value=10,
    disabled=False,
)

def update_plot(dataset, num_entities):
    """Plot the most frequent entity types of the selected dataset as a bar chart.

    Args:
        dataset: 'combined' to use every row of ``combined_df``; otherwise a value of
            its 'org_dataset' column, used to filter the rows.
        num_entities: Number of most frequent entity types to display.
    """
    if dataset == 'combined':
        selected_df = combined_df
    else:
        selected_df = combined_df[combined_df['org_dataset'] == dataset]

    # Only entity types are plotted here, so walk the 'vertexSet' column directly
    # instead of materialising every row (and the unused relation labels) via iterrows().
    # Each 'vertexSet' entry is a list of groups whose items each carry a 'type' key.
    entity_types = [
        mention['type']
        for vertex_set in tqdm(selected_df['vertexSet'], total=selected_df.shape[0], desc='Processing data')
        for entity in vertex_set
        for mention in entity
    ]

    # value_counts() sorts by descending frequency, so head() yields the top entries.
    entities_counts = pd.Series(entity_types).value_counts()
    top_entities = entities_counts.head(num_entities).reset_index()
    top_entities.columns = ['Entity', 'Count']

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Entity', y='Count', data=top_entities)
    ax.set(
        xlabel='Entities',
        ylabel='Count',
        title='Distribution of Entities in {}'.format(dataset),
    )
    # Rotate the category labels so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

# Bind the dropdown and slider to update_plot's 'dataset' and 'num_entities' arguments.
interactive_plot = widgets.interactive(
    update_plot,
    dataset=dataset_dropdown,
    num_entities=relation_slider,
)

# Render the controls together with the plot output.
display(interactive_plot)


In [None]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd

# Dataset selector: 'combined' (all rows) followed by every distinct 'org_dataset' value.
dataset_dropdown = widgets.Dropdown(
    description='Dataset:',
    value='combined',
    options=['combined', *combined_df['org_dataset'].unique().tolist()],
    disabled=False,
)

def update_plot(dataset):
    """Plot the 10 most frequent entity types of the selected dataset as a bar chart.

    Args:
        dataset: 'combined' to use every row of ``combined_df``; otherwise a value of
            its 'org_dataset' column, used to filter the rows.
    """
    if dataset == 'combined':
        selected_df = combined_df
    else:
        selected_df = combined_df[combined_df['org_dataset'] == dataset]

    # Only entity types are plotted here, so walk the 'vertexSet' column directly
    # instead of materialising every row (and the unused relation labels) via iterrows().
    # Each 'vertexSet' entry is a list of groups whose items each carry a 'type' key.
    entity_types = [
        mention['type']
        for vertex_set in tqdm(selected_df['vertexSet'], total=selected_df.shape[0], desc='Processing data')
        for entity in vertex_set
        for mention in entity
    ]

    # Count each entity type; value_counts() sorts by descending frequency,
    # so head(10) yields the ten most common ones.
    entities_counts = pd.Series(entity_types).value_counts()
    top_entities = entities_counts.head(10).reset_index()
    top_entities.columns = ['Entity', 'Count']

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Entity', y='Count', data=top_entities)
    ax.set(
        xlabel='Entities',
        ylabel='Count',
        title='Top 10 Entities in {}'.format(dataset),
    )
    # Rotate the category labels so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

# Bind the dropdown to update_plot's 'dataset' argument.
interactive_plot = widgets.interactive(
    update_plot,
    dataset=dataset_dropdown,
)

# Render the control together with the plot output.
display(interactive_plot)


In [None]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd

# Dataset selector: 'combined' (all rows) followed by every distinct 'org_dataset' value.
dataset_dropdown = widgets.Dropdown(
    description='Dataset:',
    value='combined',
    options=['combined', *combined_df['org_dataset'].unique().tolist()],
    disabled=False,
)

def update_plot(dataset):
    """Plot the 10 most frequent relation labels of the selected dataset as a bar chart.

    Args:
        dataset: 'combined' to use every row of ``combined_df``; otherwise a value of
            its 'org_dataset' column, used to filter the rows.
    """
    if dataset == 'combined':
        selected_df = combined_df
    else:
        selected_df = combined_df[combined_df['org_dataset'] == dataset]

    # Only relation labels are plotted here, so walk the 'labels' column directly
    # instead of materialising every row (and the unused entity types) via iterrows().
    relations = [
        rel['r']
        for labels in tqdm(selected_df['labels'], total=selected_df.shape[0], desc='Processing data')
        for rel in labels
    ]

    # Count each relation; value_counts() sorts by descending frequency,
    # so head(10) yields the ten most common ones.
    relations_count = pd.Series(relations).value_counts()
    top_relations = relations_count.head(10).reset_index()
    top_relations.columns = ['Relation', 'Count']

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Relation', y='Count', data=top_relations)
    ax.set(
        xlabel='Relations',
        ylabel='Count',
        title='Top 10 Relations in {}'.format(dataset),
    )
    # Relation names can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

# Bind the dropdown to update_plot's 'dataset' argument.
interactive_plot = widgets.interactive(
    update_plot,
    dataset=dataset_dropdown,
)

# Render the control together with the plot output.
display(interactive_plot)


In [None]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd

# Dataset selector: 'combined' (all rows) followed by every distinct 'org_dataset' value.
dataset_dropdown = widgets.Dropdown(
    description='Dataset:',
    value='combined',
    options=['combined', *combined_df['org_dataset'].unique().tolist()],
    disabled=False,
)

# Slider choosing how many bars are shown (1-50, default 10).
relation_slider = widgets.IntSlider(
    description='Relations:',
    min=1,
    max=50,
    step=1,
    value=10,
    disabled=False,
)

def update_plot(dataset, num_relations):
    """Plot a per-entity-type tally of relation labels for the selected dataset.

    Args:
        dataset: 'combined' to use every row of ``combined_df``; otherwise a value of
            its 'org_dataset' column, used to filter the rows.
        num_relations: Number of entity types (largest tallies first) to display.

    NOTE(review): for every item in a row's 'vertexSet' groups, the tally grows by the
    row's *total* number of labels, regardless of whether those labels reference that
    item. The result is therefore weighted by mentions x labels per row rather than
    being the number of relations that involve each entity type; confirm this is the
    intended metric.
    """
    selected_df = (
        combined_df
        if dataset == 'combined'
        else combined_df[combined_df['org_dataset'] == dataset]
    )

    entity_relations = {}
    progress = tqdm(selected_df.iterrows(), total=selected_df.shape[0], desc='Processing data')
    for _, record in progress:
        for group in record['vertexSet']:
            for mention in group:
                kind = mention['type']
                # Add this row's label count to the running total for the type.
                entity_relations[kind] = entity_relations.get(kind, 0) + len(record['labels'])

    # Largest tallies first, truncated to the requested number of entity types.
    ranked = pd.Series(entity_relations).sort_values(ascending=False)
    top_entity_relations = ranked.head(num_relations).reset_index()
    top_entity_relations.columns = ['Entity Type', 'Count of Relations']

    plt.figure(figsize=(12, 6))
    axes = sns.barplot(x='Entity Type', y='Count of Relations', data=top_entity_relations)
    axes.set(
        xlabel='Entity Type',
        ylabel='Count of Relations',
        title='Number of Relations for Each Entity Type in {}'.format(dataset),
    )
    plt.xticks(rotation=90)
    plt.show()

# Bind the dropdown and slider to update_plot's 'dataset' and 'num_relations' arguments.
interactive_plot = widgets.interactive(
    update_plot,
    dataset=dataset_dropdown,
    num_relations=relation_slider,
)

# Render the controls together with the plot output.
display(interactive_plot)


In [None]:
import seaborn as sns

# One histogram per source dataset of how many sentences each row contains.
for dataset in combined_df['org_dataset'].unique():
    dataset_df = combined_df[combined_df['org_dataset'] == dataset]
    # Keep the counts in a standalone Series: assigning a new column to a filtered
    # slice of combined_df triggers pandas' SettingWithCopyWarning.
    # len() of a 'sents' entry is its number of elements, i.e. the number of sentences
    # in that row (not the length of any single sentence), so the plot is labelled
    # accordingly.
    sentence_counts = dataset_df['sents'].apply(len)
    plt.figure(figsize=(12, 6))
    sns.histplot(sentence_counts, bins=50, kde=True)
    plt.xlabel('Number of Sentences')
    plt.ylabel('Count')
    plt.title('Distribution of Number of Sentences in {}'.format(dataset))
    plt.show()



In [None]:
import seaborn as sns

# One histogram per source dataset of the length of every individual sentence.
# tqdm derives the total from len() of the array of distinct dataset names.
for dataset in tqdm(combined_df['org_dataset'].unique(), desc='Processing datasets'):
    dataset_df = combined_df[combined_df['org_dataset'] == dataset]
    # Flatten: one len() per element of each row's 'sents' list.
    # NOTE(review): the unit (tokens vs. characters) depends on how each sentence is stored.
    sent_length = [len(s) for sent in dataset_df['sents'] for s in sent]
    plt.figure(figsize=(12, 6))
    sns.histplot(sent_length, bins=50, kde=True).set(
        xlabel='Sentence Length',
        ylabel='Count',
        title='Distribution of Sentence Lengths in {}'.format(dataset),
    )
    plt.show()

