# Notebook: Create Splits for Annotation

This notebook splits the dataset collected from Tripadvisor into several parts that can be handed out for annotation.

## Packages

In [21]:
import pandas as pd
import os

## Parameters

In [22]:
# Input CSV that is loaded with pandas and then cut into splits
# (relative path, resolved against the current working directory).
DATASET_PATH = "../datasets/balanced_reviews_sentences.csv"
# Maximum number of rows per split file; the last split may contain fewer.
SPLIT_SIZE = 300
# Directory the split_<n>.csv files are written to; created if it is missing.
SPLITS_PATH = 'splits_not_annotated/'

## Code

In [23]:
# Make sure the output directory exists before any split is written.
# exist_ok=True makes this safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(SPLITS_PATH, exist_ok=True)

In [24]:
# Load the complete dataset into memory.
df = pd.read_csv(DATASET_PATH)
total_rows = len(df)

# Ceiling division: each full group of SPLIT_SIZE rows is one split, and any
# leftover rows form one additional, smaller split.
full_chunks, leftover = divmod(total_rows, SPLIT_SIZE)
num_splits = (full_chunks + 1) if leftover > 0 else full_chunks
num_splits

10

In [25]:
# Columns kept in every split file; all other dataset columns are dropped.
annotation_columns = ['restaurant_id', 'review_id', 'text']

for split_no in range(1, num_splits + 1):
    # Rows [offset, offset + SPLIT_SIZE) belong to this split. The slice is
    # clipped at the end of the frame, so the last split may be shorter.
    offset = (split_no - 1) * SPLIT_SIZE
    chunk = df[offset:offset + SPLIT_SIZE][annotation_columns].reset_index(drop=True)

    # Files are numbered from 1: split_1.csv, split_2.csv, ...
    target_path = os.path.join(SPLITS_PATH, f'split_{split_no}.csv')
    chunk.to_csv(target_path, index=False)
    print(f"Split: {split_no}, Gespeichert: {target_path}, Größe: {len(chunk)}")

Split: 1, Gespeichert: splits_not_annotated/split_1.csv, Größe: 300
Split: 2, Gespeichert: splits_not_annotated/split_2.csv, Größe: 300
Split: 3, Gespeichert: splits_not_annotated/split_3.csv, Größe: 300
Split: 4, Gespeichert: splits_not_annotated/split_4.csv, Größe: 300
Split: 5, Gespeichert: splits_not_annotated/split_5.csv, Größe: 300
Split: 6, Gespeichert: splits_not_annotated/split_6.csv, Größe: 300
Split: 7, Gespeichert: splits_not_annotated/split_7.csv, Größe: 300
Split: 8, Gespeichert: splits_not_annotated/split_8.csv, Größe: 300
Split: 9, Gespeichert: splits_not_annotated/split_9.csv, Größe: 300
Split: 10, Gespeichert: splits_not_annotated/split_10.csv, Größe: 300
