In [1]:
# !pip install --upgrade pandas
# !pip install --upgrade networkx
# !pip install pyinform

In [2]:

""" Include containing folder for testing """
import sys, os
import datetime

# Put the project's source folder at the front of the module search path,
# presumably so that the `import ing` further down picks it up.
# NOTE(review): this is a hard-coded absolute path on the SageMaker instance;
# it has to be edited to run the notebook anywhere else.
sys.path.insert(0, os.path.abspath('/home/ec2-user/SageMaker/ing/src'))
# Show the absolute path of the current working directory.
print(os.path.abspath('.'))
# ---------------------------------------

""" 
Test in local pycharm env:
    Uncomment the lines below to test in local machine.
    Otherwise keep them commented 
"""
# from ing.src import ing
# from ing.src import S3Access
# -----------------------------


""" 
Test in aws:
    Uncomment the lines below to test on aws sagemaker. 
    Otherwise keep them commented.
"""
import ing
# -----------------------------

# s3 specific libraries
import s3fs
import boto3
# NOTE(review): boto3 is not referenced in the cells visible here — confirm
# whether the import is still needed.
# Shared S3 filesystem handle; a later cell uses it to glob the input files.
# anon=False presumably means authenticated (non-anonymous) access — confirm
# against the s3fs documentation.
s3 = s3fs.S3FileSystem(anon=False)


"""
Usual module/package imports go below here
"""
import pandas as pd
# -----------------------------


/home/ec2-user/SageMaker/ing/test


In [3]:
# --- Run configuration ------------------------------------------------------
# S3 directories that are scanned for input message files.
DATA_DIRECTORY_LIST = [
    "s3://mips-main/initial_data_collection/raw_data/brandwatch/",
    "s3://mips-main/initial_data_collection/TE_ready_data/V2/",
]
# CSV mapping news domains to their class (read with pandas further down).
NEWS_DOMAIN_TO_CLASS_FILE = "s3://mips-main-2/UFTM_classification-v2/news_table-v3-UT60.csv"

# Analysis window, as timezone-aware UTC datetimes.
START_DATE = datetime.datetime(year=2018, month=3, day=1, tzinfo=datetime.timezone.utc)
END_DATE = datetime.datetime(year=2018, month=5, day=2, tzinfo=datetime.timezone.utc)

# Frequency string handed to the transfer-entropy network calculation.
FREQUENCY = "6H"

# Thresholds handed to the data-table generation step:
# platform size threshold (number of users) ...
MIN_PLAT_SIZE = 500
# ... and the expected minimum number of messages per month for an actor.
MIN_ACTIVITY_PER_MONTH = 30


In [4]:

# Read input files
any_source_reader = ing.AnyDataSourceReader()

# Collect every file matching "*.csv*" directly under each input directory.
# The glob pattern is assembled with an explicit "/" rather than os.path.join:
# S3 keys always use forward slashes, while os.path.join uses the host OS
# separator and would yield a backslash on Windows for a directory configured
# without a trailing slash.
paths = []
for data_dir in DATA_DIRECTORY_LIST:
    paths += s3.glob(f"{data_dir.rstrip('/')}/*.csv*")
# Prefix each glob result with the scheme so the reader gets full s3:// URLs.
paths = [f"s3://{p}" for p in paths]
print(paths)
all_osn_msgs_df = any_source_reader.read_files_list(paths)
print(all_osn_msgs_df)

# Load the news-domain classification table, keeping only the three columns
# needed and renaming them. The rename is chained instead of using
# inplace=True so that re-running the cell always starts from a fresh frame.
news_domain_classes_df = pd.read_csv(
    NEWS_DOMAIN_TO_CLASS_FILE,
    usecols=['Domain', 'tufm_class', 'Language'],
).rename(columns={'Domain': 'news_domain', 'tufm_class': 'class', 'Language': 'lang'})


['s3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_01_to_2018_03_06_withFb_2033735572_MainQuery.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_07_to_2018_03_07_withoutFb_2033735852_MIPs+test.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_07_to_2020_05_02_2033753991_MainQuery_FbIgOnly.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_07_to_2020_05_02_onlyFb_2033753991_MainQuery_FbIgOnly.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_08_to_2018_03_09_withoutFb_2033750044_MIPs+test.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_10_to_2018_03_12_withFb_2033755725_MainQuery.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_13_to_2018_03_13_withoutFb_2033770110_MainQuery_withoutFb.csv.zip', 's3://mips-main/initial_data_collection/raw_data/brandwatch/2018_03_14_to_2018_03_14_withoutFb_2033776708_Ma

In [5]:

# let data manager handle data
# NOTE(review): the second argument "." is presumably the output directory —
# the "Saving to" log lines of the table-generation step point at the
# notebook's working directory; confirm against ing.DataManager.
data_manager = ing.DataManager(all_osn_msgs_df, ".")

# preprocess
# Receives the news-domain class table and the analysis window bounds. Going by
# the log output, this presumably filters messages by date and tags news
# domains with their class — confirm against ing.DataManager.preprocess.
data_manager.preprocess(news_domain_classes_df, START_DATE, END_DATE)


Filter out dates...
#Remove nan...
Add article count column...
add msg_id...
identify news_domains...
identify class of each news_domain...
counts of each class marked at each class_X column...


In [6]:

# Build the data tables.
# The per-month activity threshold is scaled to the length of the analysis
# window in whole days, counting a month as 30 days; the multiplication is
# done first and the result is floor-divided.
min_msg_count = (MIN_ACTIVITY_PER_MONTH * (END_DATE - START_DATE).days) // 30
print(f"minimum message count : {min_msg_count}")

# Platform-size threshold first, message-count threshold second.
data_manager.generate_data_tables(MIN_PLAT_SIZE, min_msg_count)


minimum message count : 62
generating user_id values ...
updating all_osn_msgs ....
generating actor_id values for platforms ...
generating actor_id values for individuals ...
saving data files to disk ...
Dataframe: users_df 	 shape: (248011, 3)
Saving to : /home/ec2-user/SageMaker/ing/test/users_df.csv.zip
Dataframe: all_osn_msgs_df 	 shape: (1057705, 18)
Saving to : /home/ec2-user/SageMaker/ing/test/all_osn_msgs_df.csv.zip
Dataframe: actors_df 	 shape: (1993, 4)
Saving to : /home/ec2-user/SageMaker/ing/test/actors_df.csv.zip
Dataframe: indv_actors_df 	 shape: (1981, 5)
Saving to : /home/ec2-user/SageMaker/ing/test/indv_actors_df.csv.zip
Dataframe: plat_actors_df 	 shape: (12, 2)
Saving to : /home/ec2-user/SageMaker/ing/test/plat_actors_df.csv.zip
data table generation completed.


In [7]:
# Echo the analysis window and the thresholds in effect.
print(f"Start: {START_DATE} \nEnd: {END_DATE}")
print(f"Period length: {END_DATE - START_DATE}")
print(f"Expected actor min messages per month: {MIN_ACTIVITY_PER_MONTH}")
print(f"Min msg count: {min_msg_count}")

# Users whose message count is strictly above the threshold.
users = data_manager.all_users
is_active_user = users["msgs_count"] > min_msg_count
print(f"Users with more than {min_msg_count} messsages posted\n",
      users[is_active_user].reset_index())

# Platform-type actors whose user count is strictly above the size threshold.
actors = data_manager.actors_df
is_large_platform = (actors["actor_type"] == "plat") & (actors["num_users"] > MIN_PLAT_SIZE)
print(f"Platform Actors with more than {MIN_PLAT_SIZE} users\n",
      actors[is_large_platform].reset_index())

Start: 2018-03-01 00:00:00+00:00 
End: 2018-05-02 00:00:00+00:00
Period length: 62 days, 0:00:00
Expected actor min messages per month: 30
Min msg count: 62
Users with more than 62 messsages posted
       user_id         platform     source_user_id  msgs_count
0        u232        4chan.org          Anonymous       541.0
1        u409          8ch.net          Anonymous       105.0
2       u4701  dailymail.co.uk  Press Association        83.0
3       u4703  dailymail.co.uk            Reuters       166.0
4       u5368   digitalspy.com      NatashaSmythe        84.0
...       ...              ...                ...         ...
1925  u245698      youtube.com         Diana King       164.0
1926  u246736      youtube.com          PigMine 7        85.0
1927  u246791      youtube.com              RT UK        77.0
1928  u246887      youtube.com             Ruptly        85.0
1929  u247181      youtube.com         Today News        74.0

[1930 rows x 4 columns]
Platform Actors with more than 5

In [None]:
# Compute the transfer-entropy network over the whole configured window.
print("calculating te...")
te_calculator = ing.TransferEntropyCalculator(data_manager)
# The window used for this run is the full [START_DATE, END_DATE] range.
current_start_date, current_end_date = START_DATE, END_DATE
te_df = te_calculator.calculate_te_network(
    current_start_date,
    current_end_date,
    FREQUENCY,
)

print("all done!")



calculating te...
calculating actor timeseries dictionaries...
 E:a0 
 E:a1 
 E:a2 
 E:a3 
 E:a4 
 E:a5 
 E:a6 
 E:a7 
 E:a8 
 E:a9 
 E:a10 
 E:a11 
 E:a4437 
 E:a4614 
 E:a8906 
 E:a8908 
 E:a9573 
 E:a9616 
 E:a9632 
 E:a10963 
 E:a11477 
 E:a11695 
 E:a11742 
 E:a12307 
 E:a19436 
 E:a19940 
 E:a20957 
 E:a20984 
 E:a21733 
 E:a22059 
 E:a22060 
 E:a23580 
 E:a26777 
 E:a26787 
 E:a26800 
 E:a26825 
 E:a26953 
 E:a27040 
 E:a27167 
 E:a27258 
 E:a27274 
 E:a27333 
 E:a27510 
 E:a27518 
 E:a27778 
 E:a27883 
 E:a27885 
 E:a27892 
 E:a28044 
 E:a28059 
 E:a28102 
 E:a28129 
 E:a28133 
 E:a28144 
 E:a28177 
 E:a28193 
 E:a28255 
 E:a28263 
 E:a28311 
 E:a28431 
 E:a28460 
 E:a28487 
 E:a28622 
 E:a28638 
 E:a28974 
 E:a29039 
 E:a29106 
 E:a29145 
 E:a29249 
 E:a29316 
 E:a29332 
 E:a29380 
 E:a29601 
 E:a29701 
 E:a29887 
 E:a29897 
 E:a29985 
 E:a30419 
 E:a30534 
 E:a30597 
 E:a30736 
 E:a30738 
 E:a30746 
 E:a30748 
 E:a30936 
 E:a31199 
 E:a31258 
 E:a31270 
 E:a31293 
 E:a31305 


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Persist the TE edge table as a zip archive in the working directory; the
# archive holds a single CSV member named by "archive_name".
compression_options = {"method": "zip", "archive_name": "actor_te_edges_df.csv"}
te_df.to_csv(
    "actor_te_edges_df.csv.zip",
    index=False,  # do not write the row index as a column
    compression=compression_options,
)
print("saved")

In [None]:
# Completion marker: prints once every preceding cell has finished running.
print("done")

In [None]:
# Display the result of the TE calculation as the cell's output
# (presumably a pandas DataFrame, since it is saved with .to_csv above).
te_df