<a href="https://colab.research.google.com/github/SocialMediaLab/Tweets_Sampling_Toolkit/blob/main/Any_CSV_Sampling_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Documentation:** https://docs.google.com/document/d/165K_EQBI1VquJHaJtxN97qFhNg_0EJ7Fjp4DIDHNHcI/edit

# Mount your Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the usage
# cells further down read and write paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

## Random Sampling Function

In [None]:
import csv
import os
import math
import subprocess
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import zipfile

# Check if file is of .csv type
def check_csv(input_file_path):
  """Return True if the path has a ``.csv`` extension (case-insensitive).

  Args:
    input_file_path (str): path or file name to inspect.

  Returns:
    bool: True when the final extension of the last path component is
    ``.csv`` in any letter case, otherwise False.
  """
  # splitext only inspects the last path component, so a bare name such as
  # "csv" or a dotted directory name is not mistaken for an extension.
  extension = os.path.splitext(input_file_path)[1]
  return extension.lower() == '.csv'

# Check if file is of .zip type
def check_zip(input_file_path):
  """Return True if the path has a ``.zip`` extension (case-insensitive).

  Args:
    input_file_path (str): path or file name to inspect.

  Returns:
    bool: True when the final extension of the last path component is
    ``.zip`` in any letter case, otherwise False.
  """
  # splitext only inspects the last path component, so a bare name such as
  # "zip" or a dotted directory name is not mistaken for an extension.
  extension = os.path.splitext(input_file_path)[1]
  return extension.lower() == '.zip'

# Unzip zip files
def unzip(input_file_path, output_dir):
  """Extract every member of a zip archive into ``output_dir``.

  Args:
    input_file_path (str): path to the zip archive.
    output_dir (str): directory to extract into; it is created if missing.
      An empty string means the current working directory.

  Returns:
    list[str]: ``output_dir`` joined with each member name reported by the
    archive, or an empty list if the file is not a valid zip archive.
  """
  print(f'Unzipping {input_file_path} to {output_dir}')

  # An empty output_dir (e.g. derived from a bare output file name) stands for
  # the current directory; os.makedirs('') would raise, so only create
  # non-empty paths. exist_ok avoids the check-then-create race.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  try:
    with zipfile.ZipFile(input_file_path, 'r') as zip_ref:
      # extractall(None) extracts into the current working directory.
      zip_ref.extractall(output_dir or None)
      extracted_files = zip_ref.namelist()
  except zipfile.BadZipFile:
    print(f"Error: The file {input_file_path} is not a valid zip file.")
    return []

  out_files = [os.path.join(output_dir, member) for member in extracted_files]

  print(f'Done unzipping {input_file_path} to {output_dir}')

  return out_files

def get_chunk_counts(input_file_path, chunksize):
  """Return how many chunks of ``chunksize`` data rows a CSV file contains.

  The first record is treated as the header and is not counted as data.

  Args:
    input_file_path (str): path to the CSV file.
    chunksize (int): number of data rows per chunk (must be positive).

  Returns:
    int: ``ceil(data_rows / chunksize)``; 0 for an empty or header-only file.
  """
  # newline='' is what the csv module expects so that line breaks inside
  # quoted fields are parsed as part of the field. Undecodable bytes are
  # replaced rather than raising, because only the record count matters here.
  with open(input_file_path, 'r', newline='', encoding='utf-8', errors='replace') as fp:
    record_count = sum(1 for _ in csv.reader(fp))

  # Drop the header record; clamp so an empty file does not yield -1 rows.
  row_count = max(record_count - 1, 0)

  return math.ceil(row_count / chunksize)

# Chunk a file
def chunk_file(input_file_path, sample_percent=0.01, chunksize=10000):
  """Sample a CSV file chunk by chunk using the ``csv`` module.

  NOTE: a later ``def chunk_file`` in this file rebinds the name, so this
  implementation is only used if that later definition is removed.

  Small files (at most ``chunksize`` and at most 50000 data rows) are read in
  one go with pandas. Larger files are streamed: rows whose field count
  differs from the header's are skipped, and every ``chunksize`` accepted rows
  are turned into a DataFrame and sampled. In the streamed path all values
  stay strings because the rows come straight from ``csv.reader``.

  Args:
    input_file_path (str): path to the CSV file.
    sample_percent (float): fraction of each chunk to keep, passed to
      ``DataFrame.sample(frac=...)`` (0.01 keeps 1%).
    chunksize (int): number of accepted rows per chunk.

  Returns:
    tuple[int, pandas.DataFrame]: the number of data records counted in the
    file (header excluded, skipped rows included) and the sampled rows.
  """
  # Step 1: Get the number of rows (records minus the header)
  with open(input_file_path, 'r', newline='') as fp:
    row_count = max(sum(1 for _ in csv.reader(fp)) - 1, 0)

  # Step 2: Check if file is small enough to read in one go.
  # Return the same (row_count, DataFrame) shape as the chunked path so that
  # callers can always unpack two values.
  if chunksize >= row_count and row_count <= 50000:
    whole_df = pd.read_csv(input_file_path)
    return row_count, whole_df.sample(frac=sample_percent)

  # Step 3: Determine the number of chunks (only used for the progress bar)
  num_chunks = math.ceil(row_count / chunksize)

  samples = []
  skipped = 0

  # Step 4: Iterate through each line of the file
  with tqdm(total=num_chunks, desc="Processing CSV", unit="chunk") as pbar:
    with open(input_file_path, 'r', newline='') as fp:
      reader = csv.reader(fp)

      # Get the header
      header = next(reader)
      num_col = len(header)

      chunk_data = []
      for row in reader:
        # Rows with too many or too few fields are malformed: skip them
        if len(row) != num_col:
          skipped += 1
          continue

        chunk_data.append(row)

        # A full chunk has been collected: sample it and start a new one
        if len(chunk_data) == chunksize:
          full_chunk = pd.DataFrame(chunk_data, columns=header)
          samples.append(full_chunk.sample(frac=sample_percent))
          chunk_data = []
          pbar.update()

      # Sample whatever is left over. Without this the trailing rows would be
      # lost whenever the last chunk is short (always the case after skips).
      if chunk_data:
        last_chunk = pd.DataFrame(chunk_data, columns=header)
        samples.append(last_chunk.sample(frac=sample_percent))
        pbar.update()

  # Step 5: Create a dataframe containing all samples and return.
  # pd.concat raises on an empty list, so fall back to an empty frame.
  if samples:
    sample_df = pd.concat(samples)
  else:
    sample_df = pd.DataFrame(columns=header)
  print(f'Expected Samples: {row_count * sample_percent}, Actual Samples: {sample_df.shape[0]}, Skipped: {skipped}')

  return row_count, sample_df

# Better and faster: pandas-based chunk_file that overrides the csv-module version defined above
def chunk_file(input_file_path, sample_percent=0.01, chunksize=10000):
  """Sample a CSV file chunk by chunk with ``pandas.read_csv``.

  Args:
    input_file_path (str): path to the CSV file.
    sample_percent (float): fraction of each chunk to keep, passed to
      ``DataFrame.sample(frac=...)`` (0.01 keeps 1%).
    chunksize (int): number of rows pandas reads per chunk.

  Returns:
    tuple[int, pandas.DataFrame]: the total number of rows read and the
    concatenation of the per-chunk samples.
  """
  # Counted up front (a separate pass over the file) only to size the
  # progress bar; done before opening the reader so a failure here cannot
  # leave the reader's file handle open.
  num_chunks = get_chunk_counts(input_file_path, chunksize)

  samples = []
  total_rows = 0

  # The chunked reader holds an open file handle; the context manager closes
  # it even if sampling raises part-way through.
  with pd.read_csv(input_file_path, chunksize=chunksize) as chunks:
    with tqdm(total=num_chunks, desc="Processing CSV", unit="chunk", leave=True) as pbar:
      for chunk in chunks:
        samples.append(chunk.sample(frac=sample_percent))
        total_rows += chunk.shape[0]
        pbar.update(1)

  sample_df = pd.concat(samples)

  return total_rows, sample_df

# Sampling file code
def random_sampler(input_file_path, output_file_path, sample_percent=0.01, chunksize=10000):
  """
  # Purpose:
    * Sample a dataset and save the sampled dataset to another file
  # Args:
    * input_file_path (str): path to the .csv file to be sampled, or to a .zip
      archive whose .csv members are each sampled
    * output_file_path (str): path to the sampled file; for a zip input the
      archive is also extracted into this file's directory
    * sample_percent (float): fraction of the dataset to be sampled
      (0.01 keeps 1%)
    * chunksize (int): how many rows are to be read at a time
  # Returns:
    * pandas.DataFrame with the sampled rows, or None when the input is
      neither a .csv nor a .zip or no CSV data was found
  """
  # Count rows, samples
  total_rows = 0
  samples = []

  # Directory that receives the output (and any extracted archive members).
  # dirname handles bare file names and root-level paths; "." stands for the
  # current directory when the output path has no directory part.
  output_dir = os.path.dirname(output_file_path) or "."

  # If zip file, unzip all files to the output directory and sample from each file
  if check_zip(input_file_path):
    for file_name in unzip(input_file_path, output_dir):
      # Only CSV members are sampled; everything else is ignored
      if not file_name or not check_csv(file_name):
        continue

      print(f'Processing file: {file_name}')
      rows, sp = chunk_file(file_name, sample_percent, chunksize)
      total_rows += rows
      samples.append(sp)

  # If csv, then sample directly from the csv
  elif check_csv(input_file_path):
    print(f'Processing file: {input_file_path}')
    rows, sp = chunk_file(input_file_path, sample_percent, chunksize)
    total_rows += rows
    samples.append(sp)

  # Nothing to save if no file produced a sample
  if not samples:
    print("No samples were extracted.")
    return None

  sampled_df = pd.concat(samples)

  # Summary print
  print(f'Total Dataset Size: {total_rows}')
  print(f'Expected Dataset Size: {total_rows*sample_percent}')
  print(f'Sampled Dataset Size: {sampled_df.shape[0]}')

  # Save sampled dataset to output file; make sure its directory exists first
  # (the csv branch never created it).
  os.makedirs(output_dir, exist_ok=True)
  sampled_df.to_csv(output_file_path, index=False)
  return sampled_df

## Usage

### Sampling a csv

In [None]:
# File that you want sampled; random_sampler picks the CSV branch from the
# ".csv" extension (presumably ".." is a placeholder for your own Drive folder)
input_file_path ="/content/drive/../input.csv"

# Path that you want the sampled dataset to be saved to
output_file_path = "/content/drive/../output.csv"

# Fraction of the dataset you want to be sampled (0.01 = 1%); it is used as
# the `frac` argument of DataFrame.sample for every chunk
sample_percent = 0.01

# Number of rows to be read at a time
chunksize = 40000

# Sample the file, write the result to output_file_path and keep the sampled
# DataFrame (None if nothing was sampled) in `samples`
samples = random_sampler(input_file_path=input_file_path, output_file_path=output_file_path, sample_percent=sample_percent, chunksize=chunksize)

### Sampling a zip

In [None]:
# File that you want sampled; random_sampler picks the zip branch from the
# ".zip" extension and samples every .csv member of the archive
# (presumably ".." is a placeholder for your own Drive folder)
input_file_path = "/content/drive/../input.zip"
# Path that you want the sampled dataset to be saved to. The archive is also
# extracted into this file's directory.
# NOTE(review): the path is handed to DataFrame.to_csv, which is expected to
# infer zip compression from the ".zip" extension — confirm this is intended.
output_file_path = "/content/drive/../output.zip"

# Fraction of the dataset you want to be sampled (0.01 = 1%); it is used as
# the `frac` argument of DataFrame.sample for every chunk
sample_percent = 0.01

# Number of rows to be read at a time
chunksize = 10000


# Sample the archive, write the result to output_file_path and keep the
# sampled DataFrame (None if nothing was sampled) in `samples`
samples = random_sampler(input_file_path=input_file_path, output_file_path=output_file_path, sample_percent=sample_percent, chunksize=chunksize)