In [44]:
import os
import pandas as pd
import shutil
from shutil import copy
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

In [37]:
# Root of the project checkout; every other location is derived from it.
REL = '/home4/shubham/Disaster-Response-and-Damage-Assessment/'

# CSV files describing the train / validation / test splits.
TRAIN = os.path.join(REL, 'dataset_csv/task_humanitarian_train_data.csv')
VAL = os.path.join(REL, 'dataset_csv/task_humanitarian_val_data.csv')
TEST = os.path.join(REL, 'dataset_csv/task_humanitarian_test_data.csv')

# Folder that receives the preprocessed tweet CSVs.
TWEETS_PATH = os.path.join(REL, 'dataset/tweets/')

# Per-split image folders (one sub-folder per class is created inside each).
IMAGES_TRAIN = os.path.join(REL, 'dataset/images/Train/')
IMAGES_VAL = os.path.join(REL, 'dataset/images/Val/')
IMAGES_TEST = os.path.join(REL, 'dataset/images/Test/')

# Location the source images are copied from.
CRISISMMD = os.path.join(REL, 'CrisisMMD_v2.0/')

# Class names used as sub-folder names under each image split folder.
CATEGORIES = [
    'not_humanitarian',
    'other_relevant_information',
    'rescue_volunteering_or_donation_effort',
    'infrastructure_and_utility_damage',
    'affected_individuals',
]

In [41]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_tweets(tweet, stop_words=None):
    """
    Clean a single tweet: strip mentions/URLs and punctuation, lower-case it
    and drop stopwords.

    :param tweet: tweet text; non-string values are converted with ``str``.
                  ``None`` and float NaN are treated as missing text.
    :param stop_words: optional iterable of lower-case words to drop.
                       Defaults to NLTK's English stopword list.
    :return: the cleaned tweet as a single-space separated, lower-case string
             (empty string when the input is missing).
    """
    # Missing text would otherwise be stringified into the bogus token "nan"/"none".
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ''

    tweet = str(tweet)

    # Drop @mentions and anything that starts like a URL, then collapse whitespace.
    tweet = ' '.join(re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet).split())

    # Replace every remaining character that is not an ASCII letter, digit,
    # space or tab with a space. This is what actually removes punctuation.
    tweet = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

    # Lower-case BEFORE filtering: the stopword list is lower-case, so filtering
    # first would let capitalised stopwords such as "The" slip through.
    tweet = tweet.lower()

    stop = _english_stopwords() if stop_words is None else frozenset(stop_words)
    return ' '.join(word for word in tweet.split() if word not in stop)

def read_csv():
    """
    Load the train, test and validation CSV files into dataframes.

    :return: tuple ``(train_df, val_df, test_df)``.
    """
    # Files are read in train -> test -> val order; the result is returned
    # in train -> val -> test order.
    sources = (('train', TRAIN), ('test', TEST), ('val', VAL))
    frames = {split: pd.read_csv(csv_path) for split, csv_path in sources}

    return frames['train'], frames['val'], frames['test']

def create_tweets_dataset(train, val, test):
    """
    Keeps only the required columns, preprocesses the tweet text and writes
    one CSV per split into ``TWEETS_PATH``.

    :param train/val/test: train/val/test dataframe objects. Each must contain
                           the columns ``event_name``, ``tweet_id``,
                           ``tweet_text`` and ``label``.
    :return: None. Writes ``train_tweets.csv``, ``val_tweets.csv`` and
             ``test_tweets.csv`` (without the index column).
    """
    cols = ['event_name', 'tweet_id', 'tweet_text', 'label']
    splits = (
        (train, 'train_tweets.csv'),
        (val, 'val_tweets.csv'),
        (test, 'test_tweets.csv'),
    )

    # Preprocess every split first so nothing is written if one of them fails.
    # Series.apply works element-wise on the text column and, unlike a
    # row-wise DataFrame.apply, also behaves sensibly on an empty frame.
    processed = []
    for frame, filename in splits:
        tweets = frame[cols].copy()
        tweets['tweet_text'] = tweets['tweet_text'].apply(preprocess_tweets)
        processed.append((tweets, filename))

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(TWEETS_PATH, exist_ok=True)
    for tweets, filename in processed:
        tweets.to_csv(TWEETS_PATH + filename, index=False)
    
def make_dirs():
    """
    Ensure a folder exists for every class under each image split folder
    (train, validation and test).
    """
    split_roots = (IMAGES_TRAIN, IMAGES_VAL, IMAGES_TEST)
    for label in CATEGORIES:
        for root in split_roots:
            # exist_ok=True makes this a no-op when the folder is already there.
            os.makedirs(root + label, exist_ok=True)
            
def create_image_dataset(df, main_dir, source_root=None):
    """
    Copies images from the CrisisMMD folder into class-named folders.

    :param df: train/test/val df; must provide an ``image`` column (path of the
               image relative to the source root, '/'-separated) and a
               ``label`` column (class name used as the folder name).
    :param main_dir: path to the train/test/val image folder.
    :param source_root: folder the ``image`` paths are relative to.
                        Defaults to ``CRISISMMD``.
    :return: None. Images already present at the destination are skipped.
    """
    if source_root is None:
        source_root = CRISISMMD

    for image_path, image_label in zip(df['image'], df['label']):
        image_name = image_path.split('/')[-1]
        destination = os.path.join(main_dir, image_label)
        target = os.path.join(destination, image_name)

        if os.path.isfile(target):
            print("Image: ",image_name, " already present!")
            continue

        # Without the class folder, shutil.copy would create a *file* named
        # after the label and every following image would overwrite it.
        os.makedirs(destination, exist_ok=True)
        copy(source_root + image_path, target)

In [43]:
# Load the three dataset splits.
train_df, val_df, test_df = read_csv()

# Clean the tweet text and write the per-split tweet CSVs.
create_tweets_dataset(train_df, val_df, test_df)

# Make sure every class folder exists under each image split folder.
make_dirs()

# Copy the images of each split from CrisisMMD into its class folders.
image_splits = (
    (train_df, IMAGES_TRAIN),
    (val_df, IMAGES_VAL),
    (test_df, IMAGES_TEST),
)
for split_df, split_dir in image_splits:
    create_image_dataset(split_df, split_dir)