In [21]:
import pandas as pd
import numpy as np
from datetime import datetime

from comparison_utils import (compare_filters)
from mining_utils import (grab_facility_mentions,
                          generate_facility_names)
from apriori_utils import (apriori_from_df)

In [3]:
# Marker string handed to grab_facility_mentions as its third argument in a
# later cell. NOTE(review): the name `filter` shadows Python's builtin
# `filter()` for the rest of the notebook; it is kept because later cells read it.
filter = "Completed Task List Activities:"

# Facility-name lookup built from the facilities CSV. The rendered value is a
# dict whose 'facility_name_abbr' entry maps full facility names to short names.
facility_data = generate_facility_names("./source/all_facilities.csv")

In [4]:
# Bare expression: renders the facility lookup dict as this cell's output.
facility_data

{'facility_name_abbr': {'Advanced Biological Research System': 'ABRS',
  'Autonomous Biological System': 'ABS',
  'Acoustic Diagnostics Facility': 'Acoustic Diagnostics Facility',
  'Actiwatch': 'Actiwatch',
  'Actiwatch Spectrum System': 'Actiwatch Spectrum',
  'Avian Development Facility': 'ADF',
  'ADvanced Space Experiment Processor': 'ADSEP',
  'ADvanced Space Experiment Processor-2': 'ADSEP-2',
  'Animal Enclosure Module': 'AEM',
  "Anomalous Long Term Effects in Astronauts' Central Nervous System Facility": 'ALTEA Facility',
  'Additive Manufacturing Facility': 'Manufacturing Device',
  'Advanced Protein Crystallization Facility': 'APCF',
  'Aquatic Habitat': 'Aquatic Habitat',
  'ARCTIC Refrigerator and Freezer': 'ARCTIC',
  'Advanced Resistive Exercise Device': 'ARED',
  'Astro Garden': 'Astro Garden',
  'Astrobee': 'Astrobee',
  'Bartolomeo': 'Bartolomeo',
  'Bio-Analyzer': 'Bio-Analyzer',
  'BioChip SpaceLab': 'BioChip SpaceLab',
  'Bioculture System': 'Bioculture System Fac

In [6]:
# Facility mentions scanned from every report under ./reports, with no
# section filter applied.
facility_mentions = grab_facility_mentions("./reports", facility_data)

# Transpose so the outer keys of `facility_mentions` become the row index,
# parse that index as datetimes, and order the rows chronologically.
# (presumably the outer keys are report dates - confirm against mining_utils)
df = (
    pd.DataFrame.from_dict(facility_mentions)
    .T
    .pipe(lambda frame: frame.set_index(pd.to_datetime(frame.index)))
    .sort_index(ascending=True)
)

100%|██████████| 4026/4026 [00:10<00:00, 379.47it/s]


In [7]:
# Same scan of ./reports as the unfiltered run, but with the `filter` marker
# string passed as the third argument.
# (presumably restricts or excludes the report section starting with that marker - confirm)
facility_mentions_filter = grab_facility_mentions("./reports", facility_data, filter)

# Transpose so the outer keys become the row index, parse that index as
# datetimes, and order the rows chronologically.
df_filter = (
    pd.DataFrame.from_dict(facility_mentions_filter)
    .T
    .pipe(lambda frame: frame.set_index(pd.to_datetime(frame.index)))
    .sort_index(ascending=True)
)

100%|██████████| 4026/4026 [00:10<00:00, 397.12it/s]


In [8]:
# Element-wise difference between the unfiltered and filtered mention tables
# (aligned on index and columns); non-zero cells are where the two runs disagree.
df_diff = df.sub(df_filter)

In [26]:
# For every column of df_diff, collect the index dates (formatted YYYY-MM-DD)
# of the rows whose difference equals exactly 1. Columns with no such row map
# to an empty list.
list_mention_dates = {
    facility: list(
        df_diff.loc[df_diff[facility] == 1, facility].index.strftime('%Y-%m-%d')
    )
    for facility in df_diff.columns
}

In [27]:
# Bare expression: renders the per-column lists of dates as this cell's output.
list_mention_dates

{'ABRS': [],
 'ABS': [],
 'Acoustic Diagnostics Facility': [],
 'Actiwatch': [],
 'Actiwatch Spectrum': [],
 'ADF': [],
 'ADSEP': [],
 'ADSEP-2': [],
 'AEM': ['2019-06-10'],
 'ALTEA Facility': [],
 'Manufacturing Device': [],
 'APCF': [],
 'Aquatic Habitat': [],
 'ARCTIC': [],
 'ARED': ['2018-08-21',
  '2018-08-22',
  '2018-09-05',
  '2018-09-19',
  '2018-10-03',
  '2018-10-18',
  '2018-10-24',
  '2018-10-25',
  '2018-10-30',
  '2018-11-01',
  '2018-11-13',
  '2018-11-21',
  '2018-11-26',
  '2018-12-05',
  '2019-01-10',
  '2019-02-04',
  '2019-02-20',
  '2019-03-05',
  '2019-03-08',
  '2019-03-12',
  '2019-03-18',
  '2019-03-19',
  '2019-03-20',
  '2019-03-27',
  '2019-03-28',
  '2019-04-03',
  '2019-04-09',
  '2019-04-15',
  '2019-04-18',
  '2019-04-24',
  '2019-04-29',
  '2019-04-30',
  '2019-05-13',
  '2019-05-14',
  '2019-05-22',
  '2019-05-24',
  '2019-05-28',
  '2019-06-05',
  '2019-06-12',
  '2019-06-20',
  '2019-07-19',
  '2019-08-13',
  '2019-08-27',
  '2019-09-30',
  '2019-10

In [9]:
# Count, per column of df_diff, the number of rows (dates) whose difference
# equals exactly 1.
#
# The previous first statement, `df_diff[df_diff == 1].fillna(0, inplace=True)`,
# filled a temporary copy created by the boolean indexing and then discarded
# it, so it never changed df_diff. The intended 0/1 indicator is computed
# explicitly here instead, and df_diff itself is left untouched so the cell
# gives the same result every time it is re-run.
list_mentions = df_diff.eq(1).sum()

# Keep only the columns with at least one such date, most frequent first.
list_mentions[list_mentions > 0].sort_values(ascending=False)

MELFI                         188
MWA                            91
ARED                           66
Ultrasound                     61
Spectrum                       36
                             ... 
GLACIER                         1
ETC                             1
Bio-Monitor                     1
Bioculture System Facility      1
Iceberg                         1
Length: 65, dtype: int64

In [9]:
# Run the project's apriori helper on the unfiltered mention table. The
# returned frame is later read through its "itemsets" and "support" columns.
# NOTE(review): the tables shown under this cell appear to be printed by
# apriori_from_df itself - confirm in apriori_utils.
mentions_apriori = apriori_from_df(df)

# Same helper on the filtered mention table, so the two results can be
# compared itemset by itemset.
mentions2_apriori = apriori_from_df(df_filter)

      support                                     itemsets  length
81   0.307253                                 (TVIS, ARED)       2
56   0.262047                                (ARED, CEVIS)       2
138  0.234228                                (TVIS, CEVIS)       2
71   0.172876                                (MELFI, ARED)       2
130  0.126428                               (MELFI, CEVIS)       2
..        ...                                          ...     ...
225  0.030303                                   (FSL, MSG)       2
191  0.030303                                  (EMCS, MSG)       2
132  0.030055                                 (MWA, CEVIS)       2
178  0.030055                                 (MELFI, EDR)       2
134  0.030055  (Portable Pulmonary Function System, CEVIS)       2

[233 rows x 3 columns]
      support                                     itemsets  length
80   0.307253                                 (TVIS, ARED)       2
55   0.257824                         

In [36]:
# Report, for every itemset mined from the unfiltered mentions, how much its
# support differs from the same itemset mined from the filtered mentions.
#
# Itemsets are matched as frozensets rather than as lists: a list built from a
# set depends on the set's iteration order, so two equal itemsets could compare
# unequal and be reported as "not found". The filtered supports are also
# indexed once up front instead of rebuilding a list of lists and re-scanning
# the frame with a boolean mask on every iteration.
filter_support_by_itemset = {}
for itemset, support in zip(mentions2_apriori["itemsets"].values,
                            mentions2_apriori["support"].values):
    # setdefault keeps the first row seen for an itemset, which matches the
    # earlier `.values[0]` lookup.
    filter_support_by_itemset.setdefault(frozenset(itemset), support)

for itemset, original_support in zip(mentions_apriori["itemsets"].values,
                                     mentions_apriori["support"].values):
    val = list(itemset)  # printed as a list so the report format is unchanged
    filter_support = filter_support_by_itemset.get(frozenset(itemset))

    if filter_support is None:
        print(f"Itemset not found: {val}")
    else:
        print(f"Diff of {val}: {original_support - filter_support}")

Diff of ['TVIS', 'ARED']: 0.0
Diff of ['ARED', 'CEVIS']: 0.004222553402881302
Diff of ['TVIS', 'CEVIS']: 0.0
Diff of ['MELFI', 'ARED']: 0.009190263288623962
Diff of ['MELFI', 'CEVIS']: 0.00596125186289119
Diff of ['MELFI', 'TVIS']: 0.0
Diff of ['CIR', 'ARED']: 0.002732240437158473
Diff of ['ARED', 'Holter']: 0.0004967709885742577
Diff of ['ARED', 'Actiwatch']: 0.0
Diff of ['TVIS', 'Holter']: 0.0
Diff of ['CIR', 'TVIS']: 0.0
Diff of ['ARED', 'EPM']: 0.00024838549428712886
Diff of ['TVIS', 'Actiwatch']: 0.0
Diff of ['ARED', 'MSG']: 0.002980625931445602
Diff of ['ARED', 'Ultrasound']: 0.0032290114257327307
Diff of ['RC', 'MELFI']: 0.0
Diff of ['TVIS', 'EPM']: 0.0
Diff of ['ARED', 'CGBA']: 0.0
Diff of ['Holter', 'CEVIS']: 0.00024838549428712886
Diff of ['Actiwatch', 'CEVIS']: 0.00024838549428712886
Diff of ['TVIS', 'CGBA']: 0.0
Diff of ['CIR', 'CEVIS']: 0.001490312965722801
Diff of ['MELFI', 'Ultrasound']: 0.01068057625434675
Diff of ['TVIS', 'Ultrasound']: 0.0
Diff of ['TVIS', 'MSG']: 0.0