<a href="https://colab.research.google.com/github/SRI-CSL/signal-public/blob/signal-demonstration/colabs/signal_api_part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SIGNAL**ing Interest Data

**Description:** Covers installation, the basic APIs, and dashboards. For details, see the [SIGNAL demo write-up](https://github.com/SRI-CSL/SIGNAL/blob/main/reports/milestone-7/signal-demo.org).

**Copyright 2022 SRI International.**

This project is licensed under the GNU GPLv3. See the [LICENSE](https://www.gnu.org/licenses/gpl-3.0.en.html) for the full license text.

## &#128640; Getting Started

Install the `SIGNAL API` client and its Python dependencies, then switch into the client directory.

In [None]:
!curl https://signal.cta.sri.com/client > client.tgz
!tar xzf client.tgz
!pip install -r signal_api_client/requirements.txt
!pip install -e signal_api_client
!pip install ipympl
%cd /content/signal_api_client   

Download the `funcs` utilities repository.

In [1]:
import os
import sys

In [2]:
!git clone https://github.com/hsanchez/funcs.git &> /dev/null

In [3]:
os.chdir(f'./funcs')
!git fetch

In [4]:
!git checkout activity_roles_detection_colab &> /dev/null

## &#9776; Dependencies

In [5]:
import time
import warnings

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import List, Any, Dict, Tuple
from datetime import date, datetime

In [6]:
import funcs as utils

In [7]:
from signal_api import signal

## &#9997; Configuration

In [8]:
# Suppress every warning issued through the `warnings` module from this point on.
# NOTE(review): this also hides deprecation warnings from the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

In [9]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [10]:
# Fix NumPy's global random seed so NumPy-based randomness is repeatable across runs.
np.random.seed(0)

In [62]:
# Route pandas `.plot` calls through plotly instead of the default matplotlib backend.
pd.options.plotting.backend = "plotly"

In [11]:
from IPython import get_ipython
# Hand the process environment and the active IPython shell to the funcs helper.
# NOTE(review): what set_default_vars actually configures is defined in the funcs repo — confirm there.
utils.common.set_default_vars(os.environ, ipython_val=get_ipython())

In [12]:
utils.console.stdout.print(utils.common.is_run_in_colab())

## &#128272; Login

In [13]:
signal.login()

username?: ··········
password?: ··········


True

## &#128722; Data

### &#9759; Tables

In [59]:
def df_groupby(input_df: pd.DataFrame, groupby_column: str, sort_values: bool=True) -> pd.Series:
    """Count the rows of ``input_df`` for each distinct value of ``groupby_column``.

    Args:
        input_df: Frame whose rows are counted.
        groupby_column: Name of the column that defines the groups.
        sort_values: When true (the default) the counts are returned in
            ascending order; otherwise they keep the order produced by
            ``groupby``.

    Returns:
        A Series indexed by group value holding the size of each group.
    """
    group_sizes = input_df.groupby(input_df[groupby_column]).size()
    return group_sizes.sort_values() if sort_values else group_sizes

In [20]:
def get_record_count(table_name: str) -> int:
    """Return the number of rows stored in ``table_name``.

    Args:
        table_name: Name of the table to count. It is interpolated into the
            SQL text as-is, so pass only trusted identifiers.

    Returns:
        The row count as a built-in ``int``.
    """
    query = f"SELECT COUNT(*) FROM {table_name};"
    df_result = signal.query_dataframe(query)
    # Cast so the result honours the declared ``int`` return type instead of
    # leaking the NumPy scalar that ``.iloc`` yields.
    return int(df_result['count'].iloc[0])

In [41]:
def get_table_columns(table_name: str) -> List[str]:
    """List the column names that INFORMATION_SCHEMA.COLUMNS reports for ``table_name``.

    Args:
        table_name: Table whose columns are looked up; inserted into the
            query text unescaped.

    Returns:
        The ``column_name`` values of the query result, as a list.
    """
    columns_df = signal.query_dataframe(
        f"SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'{table_name}';"
    )
    return columns_df.column_name.tolist()

In [51]:
def print_table_info(table_name: str) -> None:
    """Print a two-line summary of ``table_name``: its row count and its columns.

    The row count is fetched first, then the column list; both lookups go
    through the sibling helpers ``get_record_count`` and ``get_table_columns``.

    Args:
        table_name: Name of the table to describe.
    """
    row_count = get_record_count(table_name=table_name)
    column_names = get_table_columns(table_name=table_name)

    count_line = f"- In total, there are {row_count:,} records in the {table_name} table.\n"
    columns_line = f"-- Table {table_name} contains {len(column_names)} columns, namely: {column_names}.\n"
    print(count_line + columns_line)

In [54]:
def get_db_records(query: str) -> pd.DataFrame:
    """Run ``query`` through ``signal.query_dataframe`` and return its result unchanged.

    Args:
        query: SQL text to execute.

    Returns:
        Whatever ``signal.query_dataframe`` returns for ``query``.
    """
    return signal.query_dataframe(query)

In [14]:
# Selects the information_schema.tables rows whose table_type is 'BASE TABLE'.
TABLES_QUERY = "SELECT * FROM information_schema.tables WHERE table_type='BASE TABLE';"

In [15]:
df_tables = signal.query_dataframe(TABLES_QUERY)

In [16]:
table_names = df_tables.table_name.unique()

In [17]:
print(f"There are {len(table_names)} tables currently present in the SIGNAL database.")

There are 87 tables currently present in the SIGNAL database.


In [18]:
df_tables.head()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,signal,public,scraped_projects,BASE TABLE,,,,,,YES,NO,
1,signal,public,scraped_patch_series,BASE TABLE,,,,,,YES,NO,
2,signal,public,diff,BASE TABLE,,,,,,YES,NO,
3,signal,public,thread,BASE TABLE,,,,,,YES,NO,
4,signal,public,git_files,BASE TABLE,,,,,,YES,NO,


In [19]:
table_names

array(['scraped_projects', 'scraped_patch_series', 'diff', 'thread',
       'git_files', 'email', 'git_commit_edges', 'pg_statistic',
       'pg_type', 'git_file_changes', 'scraped_patches',
       'alembic_version', 'api_key', 'pg_foreign_table', 'pg_authid',
       'signal_history', 'pg_statistic_ext_data', 'git_repos',
       'user_identities', 'person', 'scraped_patch_submitters',
       'mailing_list', 'git_sigs', 'pg_user_mapping', 'pg_subscription',
       'pg_attribute', 'pg_proc', 'pg_class', 'pg_attrdef',
       'pg_constraint', 'pg_inherits', 'pg_index', 'pg_operator',
       'pg_opfamily', 'pg_opclass', 'pg_am', 'pg_amop', 'pg_amproc',
       'pg_language', 'pg_largeobject_metadata', 'pg_aggregate',
       'pg_statistic_ext', 'pg_rewrite', 'pg_trigger', 'pg_event_trigger',
       'pg_description', 'pg_cast', 'pg_enum', 'pg_namespace',
       'pg_conversion', 'pg_depend', 'pg_database', 'pg_db_role_setting',
       'pg_tablespace', 'pg_auth_members', 'pg_shdepend',
       'p

In [53]:
# Print the row count and column list of each table of interest, one after another.
for table_of_interest in (
    'email',
    'scraped_projects',
    'scraped_patch_series',
    'diff',
    'thread',
    'git_files',
    'git_commit_edges',
    'git_file_changes',
    'scraped_patches',
    'git_repos',
    'user_identities',
    'person',
    'scraped_patch_submitters',
    'mailing_list',
    'git_sigs',
    'patchwork_person_id',
    'git_commits',
    'git_xrefs',
):
    print_table_info(table_name=table_of_interest)

- In total, there are 828,219 records in the email table.
-- Table email contains 15 columns, namely: ['id', 'mailing_list_id', 'author_id', 'timestamp_sent', 'timestamp_recv', 'reply_to_url', 'thread_id', 'persuasion', 'reply_to_message_id', 'subject', 'body', 'url', 'clean_body', 'email_id', 'message_id'].

- In total, there are 93 records in the scraped_projects table.
-- Table scraped_projects contains 2 columns, namely: ['id', 'name'].

- In total, there are 280,722 records in the scraped_patch_series table.
-- Table scraped_patch_series contains 3 columns, namely: ['id', 'patchwork_id', 'name'].

- In total, there are 667,972 records in the diff table.
-- Table diff contains 4 columns, namely: ['id', 'email_id', 'file', 'contents'].

- In total, there are 10,387 records in the thread table.
-- Table thread contains 20 columns, namely: ['id', 'deg_max', 'deg_max_2', 'deg_max_3', 'deg_max_4', 'deg_max_5', 'patch', 'emails', 'users', 'start', 'days', 'depth', 'star_nodes', 'h_index'

## &#129504; Plots

### &#9759; person Table

In [55]:
# Load every row of the person table into a DataFrame.
df_person = get_db_records("SELECT * FROM person;")

In [56]:
df_person.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21345 entries, 0 to 21344
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             21345 non-null  int64 
 1   name           21345 non-null  object
 2   email_address  21345 non-null  object
 3   is_bot         21345 non-null  int64 
 4   cluster_id     0 non-null      object
dtypes: int64(2), object(3)
memory usage: 833.9+ KB


In [57]:
df_person.head()

Unnamed: 0,id,name,email_address,is_bot,cluster_id
0,1,Joel Fernandes,joelaf@NO-ID-FOUND.mhonarc.org,0,
1,2,richard clark,richard.xnu.clark@mail.gmail.com,0,
2,3,Takashi Iwai,tiwai@suse.de,0,
3,4,Lu Baolu,baolu.lu@linux.intel.com,0,
4,5,Chris Wilson,chris@build.alporthouse.com,0,


In [61]:
# Count persons per is_bot value (ascending by count, the df_groupby default) and display the counts.
tmp_person_is_bot = df_groupby(input_df=df_person, groupby_column='is_bot')
tmp_person_is_bot

is_bot
1     3656
0    17689
dtype: int64

In [75]:
# Bar chart of the is_bot counts, drawn by the configured pandas plotting backend.
# NOTE(review): width=750 is forwarded to that backend; with plotly it presumably sets the figure width in pixels — confirm.
tmp_person_is_bot.plot.bar(width=750)

### &#9759; scraped_patches Table

In [76]:
# Load every row of the scraped_patches table into a DataFrame.
df_scraped_patches = get_db_records("SELECT * FROM scraped_patches;")

In [77]:
df_scraped_patches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938137 entries, 0 to 938136
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             938137 non-null  int64  
 1   project_id     938137 non-null  int64  
 2   patch_id       938137 non-null  int64  
 3   mail_id        938137 non-null  object 
 4   name           938137 non-null  object 
 5   series_id      935162 non-null  float64
 6   ack_count      938137 non-null  int64  
 7   review_count   938137 non-null  int64  
 8   tested_count   938137 non-null  int64  
 9   success_count  938137 non-null  int64  
 10  warning_count  938137 non-null  int64  
 11  fail_count     938137 non-null  int64  
 12  date           938137 non-null  object 
 13  submitter_id   938137 non-null  int64  
 14  delegate       125878 non-null  object 
 15  state          938137 non-null  object 
 16  commit_hash    48390 non-null   object 
dtypes: float64(1), int64(10), object(6)
memory usage: 121.7+ MB


In [78]:
df_scraped_patches.head()

Unnamed: 0,id,project_id,patch_id,mail_id,name,series_id,ack_count,review_count,tested_count,success_count,warning_count,fail_count,date,submitter_id,delegate,state,commit_hash
0,3488,59,13000777,20221006220840.275-4-jonathan.derrick@linux.dev,[2/2] md/bitmap: Add chunk-count-based bitmap ...,1446.0,0,0,0,0,0,0,2022-10-06T00:00:00,522,,Superseded,
1,3489,56,13034662,20221107155825.1644604-12-pierre.gondois@arm.com,"[v2,11/23] arm64: dts: Update cache properties...",1203.0,0,1,0,0,0,0,2022-11-07T00:00:00,411,geert,New,
2,3490,56,13034661,20221107155825.1644604-11-pierre.gondois@arm.com,"[v2,10/23] arm64: dts: Update cache properties...",1203.0,0,0,0,0,0,0,2022-11-07T00:00:00,411,geert,New,
3,3491,57,13004984,20221012114429.2341215-6-danishanwar@ti.com,"[v6,5/5] remoteproc: pru: Configure firmware b...",1447.0,0,0,0,0,0,0,2022-10-12T00:00:00,396,,Superseded,
4,3492,57,13004985,20221012114429.2341215-5-danishanwar@ti.com,"[v6,4/5] remoteproc: pru: Add pru_rproc_set_ct...",1447.0,0,0,0,0,0,0,2022-10-12T00:00:00,396,,Superseded,


In [79]:
# Count scraped patches per project_id (ascending by count) and display the counts.
tmp_sp_project_id = df_groupby(input_df=df_scraped_patches, groupby_column='project_id')
tmp_sp_project_id

project_id
63         7
16         8
77        19
29        41
17        56
       ...  
52     39650
87     49635
31     70477
76     95940
58    115437
Length: 93, dtype: int64

In [85]:
# Bar chart of patches per project, drawn by the configured pandas plotting backend.
# NOTE(review): log_y=True is forwarded to that backend; with plotly it presumably selects a logarithmic y-axis — confirm.
tmp_sp_project_id.plot.bar(log_y=True)

In [81]:
# Count scraped patches per state (ascending by count) and display the counts.
tmp_sp_state = df_groupby(input_df=df_scraped_patches, groupby_column='state')
tmp_sp_state

state
Needs ACK                48
In Next                 124
Under Review            744
Queued                  941
Rejected               2136
Deferred               4252
RFC                    6300
Awaiting Upstream      7308
Handled Elsewhere      7809
Mainlined             14739
Changes Requested     26867
Not Applicable        49217
Accepted              91480
Superseded           135830
New                  590342
dtype: int64

In [84]:
# Bar chart of patches per state, drawn by the configured pandas plotting backend.
# NOTE(review): log_y=True is forwarded to that backend; with plotly it presumably selects a logarithmic y-axis — confirm.
tmp_sp_state.plot.bar(log_y=True)