In [9]:
import numpy as np
import os
import random
import pickle
from torchvision import transforms
import torch
from sklearn.model_selection import train_test_split

In [3]:
# Directory holding the saved .npy inputs.
data_dir = '/sda/rina_1921cs13/Nischal/NovFake/data/'


def _load_npy(path):
    """Read a .npy file and drop its singleton axes."""
    return np.squeeze(np.load(path))


# text_arr is left unsqueezed: later cells read each text as text_arr[i][0].
text_arr = np.load(os.path.join(data_dir, 'text_array.npy'))
labels_arr = _load_npy(os.path.join(data_dir, 'labels.npy'))  # 0 fake, 1 real
ids_arr = _load_npy(os.path.join(data_dir, 'ids.npy'))
# The event labels come from a path relative to the notebook, not from data_dir.
event_arr = _load_npy('../Data/nis_event_labels.npy')

In [4]:
# Loaded in its own cell; the next cell unpacks the squeezed array as
# (num_images, sources, width, height, num_channels).
image_path = os.path.join(data_dir, 'image_array.npy')
image_arr = np.squeeze(np.load(image_path))

In [5]:
# image_arr is channels-last: (num_images, sources, width, height, num_channels).
num_images, sources, width, height, num_channels = image_arr.shape
# Moving the channel axis in front of the spatial axes requires a transpose.
# np.reshape would only re-chunk the flat buffer into the new shape, which
# yields the right shape but scrambles the pixel values of every image.
# np.transpose returns a view, so no extra copy of the image data is made here.
img_data_reshape = np.transpose(image_arr, axes=(0, 1, 4, 2, 3))
assert img_data_reshape.shape == (num_images, sources, num_channels, width, height)
# Keep only the first source of every sample.
img_data_target = img_data_reshape[:, 0, :, :, :] # Don't convert to GPU
print('New Target Shape', img_data_target.shape)

New Target Shape (11766, 3, 224, 224)


In [6]:
# Peek at the image file names; NumPy abbreviates the repr of long arrays.
ids_arr

array(['Snp1_3_1.jpg', 'Snp3_1_19.jpg', 'Snp3_2_4.jpg', ...,
       'Tcn16641_3_1.jpg', 'Tcn16642_1_8.jpg', 'Tcn16642_2_7.jpg'],
      dtype='<U19')

In [7]:
# Sanity check: shape of the first target image.
img_data_target[0].shape

(3, 224, 224)

# Process Data

In [7]:
def get_short_string(string, max_len=255):
    """Return at most the first ``max_len`` characters of ``string``.

    Args:
        string: Any sliceable sequence, typically a ``str``.
        max_len: Maximum number of leading items to keep. Defaults to 255,
            the limit previously hard-coded here.

    Returns:
        ``string`` unchanged when it is no longer than ``max_len``, otherwise
        its first ``max_len`` items.
    """
    return string[:max_len]

In [8]:
# Per-channel mean / std handed to Normalize (the widely used ImageNet values).
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]

# Applied in order: array -> PIL image -> resize -> centre crop -> tensor -> normalize.
preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(normalize_mean, normalize_std),
]
data_transforms = transforms.Compose(preprocessing_steps)

In [9]:
# Build one three-line text record per sample, bucketed by label, together
# with a dict of preprocessed image tensors.
fake_data = []
real_data = []
image_list = {}  # lower-cased sample id -> preprocessed image tensor
line_1_pad = '|N'*14  # loop-invariant filler appended after the id on line 1
for i, label in enumerate(labels_arr):
    sample_id = ids_arr[i].split('.')[0]  # file name up to its first '.'
    line_1 = sample_id + line_1_pad
    line_2 = ids_arr[i] + '|null'
    short_string = get_short_string(text_arr[i][0])
    assert len(short_string) <= 255
    # Each record must occupy exactly three lines in the output file, so any
    # line break inside the text is replaced by a space (length is unchanged).
    line_3 = str(short_string).replace('\r', ' ').replace('\n', ' ')
    pack_data = [line_1, line_2, line_3]
    # ToPILImage reads a NumPy array as H x W x C. Feed it the channels-last
    # image of the first source straight from image_arr; the channel-first
    # (3, 224, 224) slices in img_data_target would be misread as 3-pixel-high
    # images with 224 channels.
    image_list[sample_id.lower()] = data_transforms(image_arr[i, 0].astype(np.uint8))
    if label==0:
        fake_data.append(pack_data)
    elif label==1:
        real_data.append(pack_data)
    # Samples with any other label are left out of both lists.
print('Loaded Data')
print('Real Data Length', len(real_data))
print('Fake Data Length', len(fake_data))

Loaded Data
Real Data Length 6816
Fake Data Length 4950


In [17]:
# Sanity check: shape of one preprocessed image tensor (keys are lower-cased ids).
image_list['snp1_3_1'].shape

torch.Size([3, 224, 224])

In [11]:
# Collect event labels and sample ids (file name up to the first '.') per class.
fake_event, fake_ids = [], []
real_event, real_ids = [], []
for idx in range(len(labels_arr)):
    label = labels_arr[idx]
    if label == 0:
        bucket_events, bucket_ids = fake_event, fake_ids
    elif label == 1:
        bucket_events, bucket_ids = real_event, real_ids
    else:
        # Any other label value is skipped.
        continue
    bucket_events.append(event_arr[idx])
    bucket_ids.append(ids_arr[idx].split('.')[0])

# One fixed seed for every split keeps the partitions reproducible.
seed = {'random_state': 43}

# First carve out 20% of each class as the test set ...
train_fake_event, test_fake_event, train_fake_ids, test_fake_ids = train_test_split(
    fake_event, fake_ids, test_size=0.2, **seed)
train_real_event, test_real_event, train_real_ids, test_real_ids = train_test_split(
    real_event, real_ids, test_size=0.2, **seed)
# ... then hold out 10% of the remaining training part for validation.
train_f_event, val_f_event, train_f_ids, val_f_ids = train_test_split(
    train_fake_event, train_fake_ids, test_size=0.1, **seed)
train_r_event, val_r_event, train_r_ids, val_r_ids = train_test_split(
    train_real_event, train_real_ids, test_size=0.1, **seed)

# Fake samples first, real samples second, in every merged list.
# All events
train_events = [*train_f_event, *train_r_event]
val_events = [*val_f_event, *val_r_event]
test_events = [*test_fake_event, *test_real_event]
# All ids
train_ids = [*train_f_ids, *train_r_ids]
val_ids = [*val_f_ids, *val_r_ids]
test_ids = [*test_fake_ids, *test_real_ids]

# Write EANN Format

In [11]:
# Same test_size and random_state as the first event/id split, presumably so
# the text records line up with those ids (confirm before changing either one).
# There is no validation split here: the train files keep the validation samples.
split_options = {'test_size': 0.2, 'random_state': 43}
train_fake, test_fake = train_test_split(fake_data, **split_options)
train_real, test_real = train_test_split(real_data, **split_options)

In [12]:
def write_data(data_lst, filename,
               output_dir='/sda/rina_1921cs13/Nischal/NovFake/sota_comp/EANN-KDD18/Data/weibo/tweets/'):
    """Write every record of ``data_lst`` to a text file, one field per line.

    Args:
        data_lst: Iterable of records; each record is an iterable of fields.
        filename: Name of the file to create inside ``output_dir``.
        output_dir: Target directory. Defaults to the location that was
            previously hard-coded in this function.

    An existing file with the same name is overwritten.
    """
    # Explicit UTF-8: the records carry free text, and the platform default
    # encoding is locale-dependent and may fail on non-ASCII characters.
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as outfile:
        for data in data_lst:
            for data_lines in data:
                print(data_lines, file=outfile)

In [13]:
# Fake Data
write_data(train_fake, 'nis_train_fake.txt')
write_data(test_fake, 'nis_test_fake.txt')
# Real Data
write_data(train_real, 'nis_train_real.txt')
write_data(test_real, 'nis_test_real.txt')

In [12]:
def save_id_event_pickle(ids, events, filename,
                         output_path='/sda/rina_1921cs13/Nischal/NovFake/sota_comp/EANN-KDD18/Data/weibo/'):
    """Pickle a dict that maps each id to the event at the same position.

    Args:
        ids: Sequence of hashable sample ids.
        events: Sequence of event labels, aligned with ``ids``.
        filename: Name of the pickle file to create inside ``output_path``.
        output_path: Target directory. Defaults to the location that was
            previously hard-coded in this function.

    Raises:
        AssertionError: If ``ids`` and ``events`` differ in length.

    If an id occurs more than once, the event of its last occurrence is kept.
    """
    assert len(ids) == len(events), 'ids and events must have the same length'
    # dict(zip(...)) builds the mapping without a loop variable named `id`,
    # which shadowed the builtin of the same name.
    store_dict = dict(zip(ids, events))
    with open(os.path.join(output_path, filename), 'wb') as handle:
        pickle.dump(store_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
def save_image_pickle(image_list, filename,
                      output_path='/sda/rina_1921cs13/Nischal/NovFake/sota_comp/EANN-KDD18/Data/weibo/'):
    """Pickle ``image_list`` to ``filename`` inside ``output_path``.

    Args:
        image_list: Picklable object to store; the notebook passes a dict of
            image tensors keyed by sample id.
        filename: Name of the pickle file to create.
        output_path: Target directory. Defaults to the location that was
            previously hard-coded in this function.

    An existing file with the same name is overwritten.
    """
    with open(os.path.join(output_path, filename), 'wb') as handle:
        pickle.dump(image_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Storing event dicts
save_id_event_pickle(train_ids, train_events, 'nis_train_id.pickle')
save_id_event_pickle(val_ids, val_events, 'nis_validate_id.pickle')
save_id_event_pickle(test_ids, test_events, 'nis_test_id.pickle')
save_image_pickle(image_list, 'nis_image.pickle')