<a href="https://colab.research.google.com/github/SRI-CSL/signal-public/blob/signal-demonstration/colabs/signal_api_part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SIGNAL**ing Interest Data

**Description:** Installation, basic APIs, and dashboards. For details, see the [SIGNAL demo report](https://github.com/SRI-CSL/SIGNAL/blob/main/reports/milestone-7/signal-demo.org).

**Copyright 2022 SRI International.**

This project is licensed under the GNU General Public License v3 (GPLv3). See the [LICENSE](https://www.gnu.org/licenses/gpl-3.0.en.html) for the full license text.

## &#128640; Getting Started

Install the `SIGNAL API` client and its dependencies, then switch into the client directory.

In [None]:
!curl https://signal.cta.sri.com/client > client.tgz
!tar xzf client.tgz
!pip install -r signal_api_client/requirements.txt
!pip install -e signal_api_client
!pip install ipympl
%cd /content/signal_api_client   

Download the `funcs` utilities repository.

In [1]:
import os
import sys

In [2]:
# Clone the funcs utilities repository into the current directory.
# The command's output is redirected to /dev/null, so a failed clone is silent.
!git clone https://github.com/hsanchez/funcs.git &> /dev/null

In [3]:
os.chdir(f'./funcs')
!git fetch

In [4]:
# Switch the clone to the activity_roles_detection_colab branch.
# Output is redirected to /dev/null, so a failed checkout is silent.
!git checkout activity_roles_detection_colab &> /dev/null

## &#9776; Dependencies

In [5]:
import time
import warnings

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import List, Any, Dict, Tuple
from datetime import date, datetime

In [6]:
import funcs as utils

In [7]:
from signal_api import signal

## &#9997; Configuration

In [8]:
# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

In [9]:
# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [10]:
# Seed NumPy's global random generator so random draws are repeatable across runs.
np.random.seed(0)

In [11]:
from IPython import get_ipython
# Hand the process environment and the active IPython shell to the funcs setup helper.
# NOTE(review): what set_default_vars actually configures is not visible here — confirm in funcs.
utils.common.set_default_vars(os.environ, ipython_val=get_ipython())

In [12]:
# Print the value returned by the funcs is_run_in_colab() helper
# (presumably True when this session runs on Colab — confirm in funcs).
utils.console.stdout.print(utils.common.is_run_in_colab())

## &#128272; Login

In [13]:
# Log in to the SIGNAL API; the call prompts interactively for a username and password.
signal.login()

username?: ··········
password?: ··········


True

## &#128722; Data

### &#9759; Tables

In [20]:
def get_record_count(table_name: str) -> int:
    """Return the number of rows stored in ``table_name``.

    Args:
        table_name: Name of the table to count. The name is interpolated
            directly into the SQL text, so pass only trusted identifiers.

    Returns:
        The row count as a built-in ``int``.
    """
    query = f"SELECT COUNT(*) FROM {table_name};"
    df_result = signal.query_dataframe(query)
    # The aggregate is read from the 'count' column of the single result row.
    # Converting the NumPy scalar makes the value match the annotated ``int``.
    return int(df_result['count'].iloc[0])

In [41]:
def get_table_columns(table_name: str) -> List[str]:
    """Return the column names of ``table_name`` in table-definition order.

    Args:
        table_name: Name of the table to look up in
            ``INFORMATION_SCHEMA.COLUMNS``. No schema filter is applied, so
            same-named tables in different schemas all contribute columns.

    Returns:
        The column names as a list of strings; empty if no table matches.
    """
    # Double any embedded single quote so the name cannot end the SQL literal early.
    escaped_name = table_name.replace("'", "''")
    # Without an ORDER BY the row order of the result is unspecified.
    query = (
        "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS "
        f"WHERE TABLE_NAME = N'{escaped_name}' "
        "ORDER BY ordinal_position;"
    )
    df_result = signal.query_dataframe(query)
    return df_result['column_name'].tolist()

In [51]:
def print_table_info(table_name: str) -> None:
    """Print a two-line summary of ``table_name``: its row count and its columns.

    Args:
        table_name: Name of the table to summarise.
    """
    n_rows = get_record_count(table_name=table_name)
    columns = get_table_columns(table_name=table_name)

    # Adjacent f-strings inside the parentheses are joined into one message.
    summary = (
        f"- In total, there are {n_rows:,} records in the {table_name} table.\n"
        f"-- Table {table_name} contains {len(columns)} columns, namely: {columns}.\n"
    )
    print(summary)

In [14]:
# Query listing every base table recorded in information_schema.tables.
# NOTE(review): there is no table_schema filter, so system catalog tables
# (the pg_* names in the recorded output) are returned along with the project tables.
TABLES_QUERY = "SELECT * FROM information_schema.tables WHERE table_type='BASE TABLE';"

In [15]:
# Run the table-listing query through the SIGNAL client and keep the result frame.
df_tables = signal.query_dataframe(TABLES_QUERY)

In [16]:
# Distinct table names, in order of first appearance in the listing.
table_names = df_tables["table_name"].unique()

In [17]:
# Report how many distinct table names the listing query returned.
print(f"There are {len(table_names)} tables currently present in the SIGNAL database.")

There are 87 tables currently present in the SIGNAL database.


In [18]:
# Preview the first rows of the table listing.
df_tables.head()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,signal,public,scraped_projects,BASE TABLE,,,,,,YES,NO,
1,signal,public,scraped_patch_series,BASE TABLE,,,,,,YES,NO,
2,signal,public,diff,BASE TABLE,,,,,,YES,NO,
3,signal,public,thread,BASE TABLE,,,,,,YES,NO,
4,signal,public,git_files,BASE TABLE,,,,,,YES,NO,


In [19]:
# Display the array of distinct table names.
table_names

array(['scraped_projects', 'scraped_patch_series', 'diff', 'thread',
       'git_files', 'email', 'git_commit_edges', 'pg_statistic',
       'pg_type', 'git_file_changes', 'scraped_patches',
       'alembic_version', 'api_key', 'pg_foreign_table', 'pg_authid',
       'signal_history', 'pg_statistic_ext_data', 'git_repos',
       'user_identities', 'person', 'scraped_patch_submitters',
       'mailing_list', 'git_sigs', 'pg_user_mapping', 'pg_subscription',
       'pg_attribute', 'pg_proc', 'pg_class', 'pg_attrdef',
       'pg_constraint', 'pg_inherits', 'pg_index', 'pg_operator',
       'pg_opfamily', 'pg_opclass', 'pg_am', 'pg_amop', 'pg_amproc',
       'pg_language', 'pg_largeobject_metadata', 'pg_aggregate',
       'pg_statistic_ext', 'pg_rewrite', 'pg_trigger', 'pg_event_trigger',
       'pg_description', 'pg_cast', 'pg_enum', 'pg_namespace',
       'pg_conversion', 'pg_depend', 'pg_database', 'pg_db_role_setting',
       'pg_tablespace', 'pg_auth_members', 'pg_shdepend',
       'p

In [53]:
# Tables to summarise, in the order their reports are printed.
TABLES_TO_DESCRIBE = (
    'email',
    'scraped_projects',
    'scraped_patch_series',
    'diff',
    'thread',
    'git_files',
    'git_commit_edges',
    'git_file_changes',
    'scraped_patches',
    'git_repos',
    'user_identities',
    'person',
    'scraped_patch_submitters',
    'mailing_list',
    'git_sigs',
    'patchwork_person_id',
    'git_commits',
    'git_xrefs',
)

# One report per table; a loop replaces eighteen hand-copied calls.
for described_table in TABLES_TO_DESCRIBE:
    print_table_info(table_name=described_table)

- In total, there are 828,219 records in the email table.
-- Table email contains 15 columns, namely: ['id', 'mailing_list_id', 'author_id', 'timestamp_sent', 'timestamp_recv', 'reply_to_url', 'thread_id', 'persuasion', 'reply_to_message_id', 'subject', 'body', 'url', 'clean_body', 'email_id', 'message_id'].

- In total, there are 93 records in the scraped_projects table.
-- Table scraped_projects contains 2 columns, namely: ['id', 'name'].

- In total, there are 280,722 records in the scraped_patch_series table.
-- Table scraped_patch_series contains 3 columns, namely: ['id', 'patchwork_id', 'name'].

- In total, there are 667,972 records in the diff table.
-- Table diff contains 4 columns, namely: ['id', 'email_id', 'file', 'contents'].

- In total, there are 10,387 records in the thread table.
-- Table thread contains 20 columns, namely: ['id', 'deg_max', 'deg_max_2', 'deg_max_3', 'deg_max_4', 'deg_max_5', 'patch', 'emails', 'users', 'start', 'days', 'depth', 'star_nodes', 'h_index'

In [36]:
# Fetch the email table's column names explicitly so this cell runs on a fresh
# kernel instead of relying on a variable left over from an earlier session.
# NOTE(review): assumes the recorded output was the email table's column list — confirm.
email_columns = get_table_columns(table_name='email')
print(email_columns)

['id', 'mailing_list_id', 'author_id', 'timestamp_sent', 'timestamp_recv', 'reply_to_url', 'thread_id', 'persuasion', 'reply_to_message_id', 'subject', 'body', 'url', 'clean_body', 'email_id', 'message_id']


## &#129504; Plots