In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the full table of preprocessed posts; every split below is carved out of this frame.
df = pd.read_csv(filepath_or_buffer="preprocessed_posts.csv")

Create train/val/test dataframes

In [3]:
def _file_ids_in(directory):
    """Return the entry names found in ``directory`` with a trailing ``.txt`` removed.

    Only the extension at the end of the name is stripped. A plain
    ``name.replace(".txt", "")`` would also delete ".txt" from the middle of a
    name (e.g. "a.txt_v2.txt" -> "a_v2") and produce an id that does not match
    the file. Entries that do not end in ".txt" are returned unchanged.
    """
    ids = []
    for entry in os.listdir(directory):
        stem, ext = os.path.splitext(entry)
        ids.append(stem if ext == ".txt" else entry)
    return ids


# Ids of the posts that were sampled into the train and test directories.
train_files = _file_ids_in("./sampled_train")
test_files = _file_ids_in("./sampled_test")

In [4]:
# Row masks: True where the post's file_id appears in the corresponding sampled directory.
in_sampled_train = df["file_id"].isin(train_files)
in_sampled_test = df["file_id"].isin(test_files)

# Keep only the sampled posts; rows of df that match neither list are left out of both frames.
train_val_df = df.loc[in_sampled_train]
test_df = df.loc[in_sampled_test]

In [5]:
# Fixed seed: without it the shuffled split (and therefore the CSVs written at the
# end of the notebook) would change on every Restart & Run All.
SPLIT_SEED = 42

# Split the index labels of train_val_df 80/20 into train and validation ids.
# `stratify` keeps the num_label proportions the same in both parts; it does not
# drop any rows.
train_ids, val_ids = train_test_split(
    train_val_df.index.tolist(),
    shuffle=True,
    stratify=train_val_df["num_label"].tolist(),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [6]:
# Turn the two lists of index labels back into frames by label-based lookup.
train_df, val_df = (train_val_df.loc[id_list] for id_list in (train_ids, val_ids))

In [7]:
# Share of rows per num_label (non-null file_id count divided by the frame's row count).
train_label_share = train_df.groupby("num_label")["file_id"].count() / len(train_df.index)
test_label_share = test_df.groupby("num_label")["file_id"].count() / len(test_df.index)

# One print call; the empty string yields the blank line between the two reports.
print(
    "Train df percentages",
    train_label_share,
    "",
    "Test df percentages",
    test_label_share,
    sep="\n",
)

Train df percentages
num_label
0    0.499673
1    0.500327
Name: file_id, dtype: float64

Test df percentages
num_label
0    0.5
1    0.5
Name: file_id, dtype: float64


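## Notice that the classes are balanced, but most of the posts were removed to get there.  
- The balancing and the loss of rows both come from keeping only the posts whose `file_id` is in the `sampled_train` / `sampled_test` directories, not from the `stratify` argument. `stratify` only makes `train_test_split` keep the same label proportions in the train and validation parts; it never drops rows.
- The classes are now balanced but we don't have that much data to work with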

In [8]:
# Rows of train_val_df for each label value.
label_one_rows = train_val_df[train_val_df["num_label"] == 1]
label_zero_rows = train_val_df[train_val_df["num_label"] == 0]

# Printed in order: all posts, sampled train/val posts, then distinct file_ids
# with label 1 and with label 0.
print(
    len(df.index),
    len(train_val_df.index),
    len(label_one_rows["file_id"].unique()),
    len(label_zero_rows["file_id"].unique()),
    sep="\n",
)

10944
1914
957
957


In [9]:
# Persist the three splits as CSV files (without the index column), in the same
# order as before: train, validation, test.
for csv_name, split_frame in (
    ("train_df.csv", train_df),
    ("val_df.csv", val_df),
    ("test_df.csv", test_df),
):
    split_frame.to_csv(csv_name, index=False)