<a href="https://colab.research.google.com/github/dzanahmed/welcome-ideathon-lshtm/blob/main/code/create_manual_validation_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creates a manual validation dataset, splits it for n screeners, and saves it locally

## load packages

In [42]:
import pandas as pd
import random
import numpy as np
import math
from google.colab import files

## load raw data

In [6]:
url = "https://raw.githubusercontent.com/dzanahmed/welcome-ideathon-lshtm/main/data/interim/vax_tweets_v0_filtered.csv?token=GHSAT0AAAAAACB5CGER57AR6RA6DLKPPER4ZFKSRVA"
raw_df = pd.read_csv(url)
print(raw_df.dtypes)

Unnamed: 0                int64
tweet_id                  int64
user_location            object
user_description         object
user_followers            int64
user_friends              int64
user_favourites           int64
user_verified              bool
date                     object
text                     object
hashtags                 object
is_retweet                 bool
roberta_loc_score       float64
roberta_loc_guess        object
distilBERT_sentiment     object
distilBERT_score        float64
dtype: object


## move relevant data into working dataframe

In [8]:
working_df = raw_df.loc[:, ['tweet_id', 'text']]
print(working_df.dtypes)

tweet_id     int64
text        object
dtype: object


## function to produce dataframe for manual validation

In [40]:
def create_manual_validation_dataframe(df, filter_names):

  all_tweet_ids = df['tweet_id'].tolist()

  sampled_tweet_ids = {}

  working_set = set()

  for i in range(len(filter_names)):

    sampled_tweet_ids[filter_names[i]] = random.sample(all_tweet_ids, k = 10000)

    working_set = working_set | set(sampled_tweet_ids[filter_names[i]])

  all_sampled_tweet_ids = list(working_set)

  validation_df = df[df['tweet_id'].isin(all_sampled_tweet_ids)]

  for filter in filter_names:

    validation_df[filter] = False
    validation_df[filter] = validation_df['tweet_id'].isin(sampled_tweet_ids[filter])

    validation_df[f"{filter}_decision"] = None

  return validation_df

## run function on working dataframe

In [39]:
# produce dataframe
output_df = create_manual_validation_dataframe(working_df, ['vaccine', 'hesitancy'])

# examine output
print(len(output_df))
output_df

18869


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_df.loc[:, filter] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_df.loc[:, filter] = validation_df.loc[:, 'tweet_id'].isin(sampled_tweet_ids[filter])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_df.loc[:, f"{filter}_decision"] = None
A value is trying to b

Unnamed: 0,tweet_id,text,vaccine,hesitancy,vaccine_decision,hesitancy_decision
11,12,This is not a political post - but as someone ...,False,True,,
14,15,"Off to get my J&amp;J, in case anyone cares......",False,True,,
28,30,The NHS will not ask you to apply for the COVI...,False,True,,
30,32,Fact vs myths. #COVID19India #CovidVaccine htt...,True,False,,
42,45,WELL WELL LOOK WHAT HAPPENED TO THE WOULD BE R...,False,True,,
...,...,...,...,...,...,...
90058,99982,📣 2021 was a monumental year\n\nFrom #CovidVac...,False,True,,
90061,99985,@churchoflazlo @hartzell965 why is @slimfast9...,True,True,,
90063,99987,DAY 1.\nNo side effects.\n\n#CovidVaccine\n#Co...,True,False,,
90067,99991,Hungary and the United Arab Emirates registere...,False,True,,


## function to split output dataframe into n files

In [44]:
def split_for_manual_validation(df, n_screeners):

  # Calculate the number of rows in each part
  num_rows = len(df)
  rows_per_part = math.ceil(num_rows / n_screeners)

  # Split the DataFrame into equal parts
  df_parts = np.array_split(df, n_screeners)

  for i, part in enumerate(df_parts):
    # Save as CSV
    part.to_csv(f'manual_validation_{i}.csv', index=False)
    files.download(f'manual_validation_{i}.csv')

    # Save as Excel
    part.to_excel(f'manual_validation_{i}.xlsx', index=False)
    files.download(f'manual_validation_{i}.xlsx')


## save files locally

In [45]:
split_for_manual_validation(output_df, n_screeners = 3)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>