In [13]:
import pandas as pd
import glob
import os

In [14]:
# Directory that is searched for the tab-separated annotation files.
PATH_TO_ANNOTATIONS = r'CrisisMMD/annotations/'
# Output directory for the extracted tweet CSVs. The trailing slash is kept
# because other code in this notebook builds file names with TWEETS + "<name>".
TWEETS = r'CrisisMMD/tweets/'
# glob returns matches in a filesystem-dependent order; sorting makes the order
# in which the files are processed (and thus the row order of the saved CSVs)
# reproducible across machines and re-runs.
ANNOTATION_FILES = sorted(glob.glob(os.path.join(PATH_TO_ANNOTATIONS, "*.tsv")))

In [15]:
def create_dics(path):
    """
    Create the directory ``path``, including any missing parent directories.

    The outcome is reported on stdout; an ``OSError`` raised while creating the
    directory is caught and reported rather than propagated.
    :param path: directory to create
    :return: None
    """
    if os.path.isdir(path):
        # Re-running the notebook must not report a failure just because the
        # directory is already in place from a previous run.
        print ("Directory %s already exists" % path)
        return
    try:
        # makedirs (unlike mkdir) also creates missing parent directories;
        # exist_ok covers the directory appearing between the check and here.
        os.makedirs(path, exist_ok=True)
    except OSError:
        print ("Creation of the directory %s failed" % path)
    else:
        print ("Successfully created the directory %s" % path)
    return


def save_tweets(dfs_inf, dfs_noninf):
    """
    Saves informative and non-informative tweets separately.

    Each group of dataframes is concatenated row-wise and written without the
    index to ``MMD_informative.csv`` / ``MMD_non_informative.csv`` inside the
    ``TWEETS`` directory.
    :param dfs_inf: iterable of dataframes to write to the informative file
    :param dfs_noninf: iterable of dataframes to write to the non-informative file
    :raises ValueError: if either iterable contains no dataframes
    :return: None
    """
    frames_inf = list(dfs_inf)
    frames_noninf = list(dfs_noninf)
    # Validate before touching the filesystem: pd.concat raises a terse
    # ValueError on an empty list, and only after the directory was created.
    if not frames_inf or not frames_noninf:
        raise ValueError("save_tweets received no dataframes to concatenate")

    create_dics(TWEETS)

    # os.path.join yields the same path as TWEETS + name when TWEETS ends with
    # a separator, and still produces a valid path when it does not.
    df_info = pd.concat(frames_inf, axis=0)
    df_info.to_csv(os.path.join(TWEETS, "MMD_informative.csv"), index=False)

    df_noninfo = pd.concat(frames_noninf, axis=0)
    df_noninfo.to_csv(os.path.join(TWEETS, "MMD_non_informative.csv"), index=False)
    return
    
    
def _read_annotation_file(path):
    """Read one tab-separated annotation file, skipping (with a warning) malformed lines."""
    try:
        # pandas >= 1.3 spelling; 'warn' matches the old error_bad_lines=False
        # behaviour with its default warn_bad_lines=True (skip the line, emit a warning).
        return pd.read_csv(path, sep='\t', on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know on_bad_lines; fall back to the legacy
        # keyword (which newer pandas versions no longer accept).
        return pd.read_csv(path, sep='\t', error_bad_lines=False)


def extract_tweets():
    """
    Extract tweets from the dataset. Save the dataframes with inf and non-inf data.

    For every path in ``ANNOTATION_FILES`` the ``tweet_text`` column is kept for
    rows where ``text_info`` and ``image_info`` are both "informative", and
    separately for rows where both are "not_informative". Rows with mixed
    labels are dropped. The per-file results are passed to ``save_tweets``.
    :return: None
    """
    dfs_inf = []
    dfs_noninf = []
    keep_col = ['tweet_text']
    for data in ANNOTATION_FILES:
        df = _read_annotation_file(data)
        # informative tweets with informative images
        both_informative = (df['text_info'] == "informative") & (df['image_info'] == "informative")
        # non-informative tweets with non-informative images
        both_not_informative = (df['text_info'] == "not_informative") & (df['image_info'] == "not_informative")
        dfs_inf.append(df.loc[both_informative, keep_col])
        dfs_noninf.append(df.loc[both_not_informative, keep_col])
    save_tweets(dfs_inf, dfs_noninf)
    return


In [16]:
# Run the extraction. Failures are reported instead of aborting the cell.
if __name__ == '__main__':
    import traceback

    try:
        extract_tweets()
    except Exception as e:
        print("Exception occured::\n",e)
        # The message alone does not say where the failure happened; print the
        # traceback too so the error can be located without re-running.
        traceback.print_exc()

Successfully created the directory CrisisMMD/tweets/
