In [1]:
import json
import csv
import pandas as pd
import glob
import os
import datetime
import random
import copy
import pickle
from sklearn.model_selection import train_test_split
from pyod.models.iforest import IForest
from joblib import dump, load


In [2]:
def generate_dates(start_date, end_date):
    """Return every day from start_date to end_date (inclusive) as 'YYYY-MM-DD' strings."""
    return [
        day.date().strftime("%Y-%m-%d")
        for day in pd.date_range(start_date, end_date, freq='D')
    ]

def findNaN(df, column):
    """Return the positional (0-based) row numbers at which df[column] is missing."""
    return [position for position, value in enumerate(df[column]) if pd.isna(value)]
            

In [3]:
# Load the per-day ground-truth CSVs for one country and keep only the
# columns that are used further down in the notebook.
dates = generate_dates('2021-06-20','2021-07-20')
country = "CN"
ls=[]
for date in dates:
    filename = "/data/censorship/OONI/"+date+"/"+country+"/groundtruth_combined.csv"
    # Days for which no file exists are skipped.
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        ls.append(df)

# pd.concat([]) fails with a generic "No objects to concatenate"; say what
# was actually missing instead.
if not ls:
    raise FileNotFoundError(
        "No groundtruth_combined.csv found under /data/censorship/OONI/ for country "
        + country + " between " + dates[0] + " and " + dates[-1]
    )
combined = pd.concat(ls)
columns_selected_test = ['measurement_start_time',
       'probe_asn','probe_network_name',
       'resolver_asn','resolver_network_name','test_runtime',
       'test_start_time', 'dns_experiment_failure', 'dns_consistency',
       'control_failure', 'http_experiment_failure', 'body_length_match',
       'body_proportion', 'status_code_match', 'headers_match', 'title_match',
       'accessible', 'blocking', 'x_status', 'test_keys_asn',
       'test_keys_as_org_name', 'test_keys_ipv4',"GFWatchblocking_truth"]

# Later cells assign columns on this frame; take an explicit copy so those
# assignments act on an independent object rather than a derived selection.
combined = combined[columns_selected_test].copy()
print(combined.columns)



Index(['measurement_start_time', 'probe_asn', 'probe_network_name',
       'resolver_asn', 'resolver_network_name', 'test_runtime',
       'test_start_time', 'dns_experiment_failure', 'dns_consistency',
       'control_failure', 'http_experiment_failure', 'body_length_match',
       'body_proportion', 'status_code_match', 'headers_match', 'title_match',
       'accessible', 'blocking', 'x_status', 'test_keys_asn',
       'test_keys_as_org_name', 'test_keys_ipv4', 'GFWatchblocking_truth'],
      dtype='object')


# Modifying the columns to make them usable for ML

In [4]:
def convert_measurement_starttime(time):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp to seconds elapsed since 2021-06-20 00:00:00."""
    benchmark = datetime.datetime(2021, 6, 20)
    parsed = pd.to_datetime(time, format='%Y-%m-%d %H:%M:%S')
    return (parsed - benchmark).total_seconds()
def relabel_category(df, column):
    """Replace the values of df[column] with integer codes.

    Missing values are first turned into "" via replace_nan. Codes are
    assigned in order of first appearance in the column, starting at 0.

    Returns the (mutated) frame and a two-column frame mapping
    "Old label" to "New label".
    """
    df = replace_nan(df, column)
    old_labels = list(df[column].unique())
    code_of = {label: code for code, label in enumerate(old_labels)}
    df[column] = [code_of[value] for value in df[column]]

    labels_df = pd.DataFrame()
    labels_df["Old label"] = old_labels
    labels_df["New label"] = list(range(len(old_labels)))
    return df, labels_df


def replace_nan(df, column):
    """Overwrite df[column] in place so that every missing value becomes "" and return df."""
    df[column] = ["" if pd.isna(value) else value for value in df[column]]
    return df
    


In [5]:
#### removing rows that do not pass control test

# After replace_nan, an empty string in "control_failure" means no failure
# was recorded for that row; only those rows are kept.
combined = replace_nan(combined, "control_failure")
passed_control = combined["control_failure"] == ""
combined = combined[passed_control]

In [6]:
###### Modify time ####
# Both timestamp columns become seconds elapsed since the benchmark date
# hard-coded in convert_measurement_starttime.
for time_column in ("measurement_start_time", "test_start_time"):
    combined[time_column] = [convert_measurement_starttime(time) for time in combined[time_column]]

###### Modify resolver ASN / probe ASN
# Missing ASNs are replaced by the placeholder string "AS".
for asn_column in ("resolver_asn", "probe_asn"):
    combined[asn_column] = ["AS" if pd.isna(asn) else asn for asn in combined[asn_column]]

###### Normalise DNS failure strings
# For "unknown_failure" entries only the text after the last ":" is kept;
# missing values become "".
dns_failure = []
for error in combined["dns_experiment_failure"]:
    if not pd.isna(error):
        if "unknown_failure" in error:
            dns_failure.append(error.split(":")[-1].strip())
        else:
            dns_failure.append(error)
    else:
        dns_failure.append("")
# FIX: the normalised list used to be built and then discarded, so the column
# was label-encoded from the raw strings. Write it back before encoding.
combined["dns_experiment_failure"] = dns_failure

###### Ground truth: missing -> "" (kept as strings, not label-encoded)
combined = replace_nan(combined, "GFWatchblocking_truth")

###### Label-encode the categorical columns
# "control_failure" is intentionally not encoded (it was commented out in the
# original cell).
categorical_columns = [
    #### ASN
    "resolver_asn",
    "probe_asn",
    #### network name
    "probe_network_name",
    "resolver_network_name",
    ### other columns
    "status_code_match",
    "headers_match",
    "title_match",
    "accessible",
    "body_length_match",
    "x_status",
    "test_keys_asn",
    "test_keys_as_org_name",
    "test_keys_ipv4",
    "dns_consistency",
    "http_experiment_failure",
    "dns_experiment_failure",
]

# Keep every old->new mapping; previously each call overwrote labels_combined
# and only the last mapping survived. labels_combined still ends up holding
# the "dns_experiment_failure" mapping, as before.
label_maps = {}
for column in categorical_columns:
    combined, labels_combined = relabel_category(combined, column)
    label_maps[column] = labels_combined


# Get Pure data for training


In [7]:
# Training rows: empty GFWatchblocking_truth AND blocking equal to the
# string "False"; both of those columns are then dropped.
no_gfwatch_entry = combined["GFWatchblocking_truth"] == ""
ooni_not_blocked = combined["blocking"] == "False"
pure_data = combined[no_gfwatch_entry & ooni_not_blocked].drop(
    ["GFWatchblocking_truth", "blocking"], axis=1
)


# DNS blocking

In [8]:
# Rows whose "blocking" value is the string "dns".
is_dns_blocked = combined["blocking"] == "dns"
ooni_blocking = combined[is_dns_blocked]


In [9]:
# Rows whose "GFWatchblocking_truth" value is the string "Confirmed".
is_gfwatch_confirmed = combined["GFWatchblocking_truth"] == "Confirmed"
GFWatch_blocking = combined[is_gfwatch_confirmed]


# RUNNING MACHINE LEARNING

In [10]:
# train_test_split is already imported in the first cell; the duplicate
# import that used to sit here was removed.

# Hold out 20% of the pure rows; random_state pins the shuffle.
training_data, testing_data = train_test_split(pure_data, test_size=0.2, random_state=25)

# Each test set = held-out pure rows + one kind of blocked rows.
# NOTE(review): ooni_blocking and GFWatch_blocking still carry the
# "GFWatchblocking_truth" and "blocking" columns that were dropped from
# pure_data, so these concatenated frames contain those two columns with NaN
# for the held-out rows. Confirm that is intended before passing them to a
# model trained on training_data.
test_ooni = pd.concat([testing_data,ooni_blocking])
test_GF = pd.concat([testing_data, GFWatch_blocking])



In [12]:

# Model parameters
contamination = 0.03164651 # approximately 3.164651% of all samples are anomalies in this dataset TODO this needs to be changed for every dataset
max_features = 9 # TODO make this higher?
n_estimators = 200
bootstrap = True
n_jobs = 20
random_state = 123
verbose = True

# NOTE(review): this cell used to call print(model_folder_name), but that name
# is not defined anywhere in the visible notebook and raised NameError on a
# fresh kernel. Re-add the print once the variable is defined in an earlier cell.

# Set up the model
# FIX: n_estimators was defined above but never passed, so the forest was
# silently built with the library default instead of 200 trees.
iforest = IForest(n_estimators=n_estimators,
                  contamination=contamination,
                  max_features=max_features,
                  bootstrap=bootstrap,
                  n_jobs=n_jobs,
                  random_state=random_state,
                  verbose=verbose)

# For CLASS PROJECT


In [9]:
def analyse_by_AS(combined, AS):
    """Split one AS's measurements by OONI verdict and ground-truth presence.

    Parameters
    ----------
    combined : pandas.DataFrame
        Must contain the columns "probe_asn", "blocking" and
        "dns_blocking_truth".
    AS : object
        Value matched against the "probe_asn" column.

    Returns
    -------
    tuple of four DataFrames, in this order:
        confirmed_block_df         -- blocking == "dns"   and dns_blocking_truth present
        confirmed_not_block_df     -- blocking == "False" and dns_blocking_truth missing
        not_confirmed_block_df     -- blocking == "dns"   and dns_blocking_truth missing
        not_confirmed_not_block_df -- blocking == "False" and dns_blocking_truth present

    Rows keep the order they have in `combined`. Boolean masks are converted
    to NumPy arrays so selection is positional and unaffected by repeated
    index labels.
    """
    AS_dat = combined[(combined["probe_asn"] == AS).to_numpy()]
    #### remove all the rows for which OONI has no "blocking" result
    AS_dat = AS_dat[AS_dat["blocking"].notna().to_numpy()]

    #### the domains that OONI says are blocked at dns level
    OONI_block = AS_dat[(AS_dat["blocking"] == "dns").to_numpy()]
    block_has_truth = OONI_block["dns_blocking_truth"].notna().to_numpy()
    confirmed_block_df = OONI_block[block_has_truth]
    not_confirmed_block_df = OONI_block[~block_has_truth]

    #### the domains that are reported not to be blocked in OONI
    OONI_notblock = AS_dat[(AS_dat["blocking"] == "False").to_numpy()]
    notblock_has_truth = OONI_notblock["dns_blocking_truth"].notna().to_numpy()
    # A missing truth entry agrees with OONI's "not blocked" verdict.
    confirmed_not_block_df = OONI_notblock[~notblock_has_truth]
    not_confirmed_not_block_df = OONI_notblock[notblock_has_truth]

    return confirmed_block_df, confirmed_not_block_df, not_confirmed_block_df, not_confirmed_not_block_df
    
    
    
    


        


In [11]:
ncfb = []
ncfnb = []

# Look at the first 15 entries of sorted_dic_asn. Slicing instead of indexing
# 0..14 avoids an IndexError when fewer than 15 entries are available.
# NOTE(review): sorted_dic_asn is defined outside the visible cells; this
# assumes it is a list-like whose items have the AS identifier at position 0.
for entry in sorted_dic_asn[:15]:
    AS = entry[0]
    print(AS)
    block_cf, notblock_cf, ncf_block, ncf_nblock = analyse_by_AS(combined, AS)
    print(" Blocked by OONI but not GFWatch" )
    print(ncf_block.shape)
    ncfb.append(ncf_block)

    print(" Not blocked by OONI but blocked by GFWatch")
    print(ncf_nblock.shape)
    ncfnb.append(ncf_nblock)
    



    

AS17962
 Blocked by OONI but not GFWatch
(118, 34)
 Not blocked by OONI but blocked by GFWatch
(206867, 34)
AS4134
 Blocked by OONI but not GFWatch
(209, 34)
 Not blocked by OONI but blocked by GFWatch
(137566, 34)
AS134773
 Blocked by OONI but not GFWatch
(70, 34)
 Not blocked by OONI but blocked by GFWatch
(28693, 34)
AS4837
 Blocked by OONI but not GFWatch
(34, 34)
 Not blocked by OONI but blocked by GFWatch
(27017, 34)
AS56048
 Blocked by OONI but not GFWatch
(30, 34)
 Not blocked by OONI but blocked by GFWatch
(22362, 34)
AS17621
 Blocked by OONI but not GFWatch
(0, 34)
 Not blocked by OONI but blocked by GFWatch
(9923, 34)
AS9808
 Blocked by OONI but not GFWatch
(3, 34)
 Not blocked by OONI but blocked by GFWatch
(8215, 34)
AS140308
 Blocked by OONI but not GFWatch
(255, 34)
 Not blocked by OONI but blocked by GFWatch
(3840, 34)
AS137692
 Blocked by OONI but not GFWatch
(0, 34)
 Not blocked by OONI but blocked by GFWatch
(5675, 34)
AS56046
 Blocked by OONI but not GFWatch
(96, 34

In [14]:
# Stack the per-AS frames collected in the loop, then export the
# "blocked by OONI but not GFWatch" rows.
ncb, ncnb = (pd.concat(frames) for frames in (ncfb, ncfnb))

ncb.to_csv(path_or_buf="/data/censorship/OONI/blocked_OONI_notGF.csv")

In [15]:
# Export the rows collected as "not blocked by OONI but blocked by GFWatch".
ncnb.to_csv(path_or_buf="/data/censorship/OONI/blocked_GF_notOONI.csv")

In [None]:
def check_