In [5]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import boto3

In [6]:
# S3 location of the raw inputs: the key prefix inside the bucket, then the
# bucket itself. Both are reused by the download and upload cells below.
prefix = "data/raw/"
bucket = "fake-news-pranav"

In [7]:
# Create the S3 client once; it is reused later for the uploads.
s3 = boto3.client('s3')

# Fetch each raw CSV from `<prefix><name>` in the bucket to a local file of
# the same name (fake articles first, then true articles).
for raw_name in ('Fake.csv', 'True.csv'):
    s3.download_file(bucket, f'{prefix}{raw_name}', raw_name)

In [8]:
# Load the two downloaded CSVs into separate frames, one per class.
fake_path, true_path = 'Fake.csv', 'True.csv'
fake = pd.read_csv(fake_path)
true = pd.read_csv(true_path)

In [9]:
# Attach the target column in place: 0 for rows from the fake-news frame,
# 1 for rows from the true-news frame.
for frame, label_value in ((fake, 0), (true, 1)):
    frame['label'] = label_value

In [10]:
# Stack both classes, shuffle the rows with a fixed seed so the order is
# reproducible, and replace the (now scrambled, duplicated) index with 0..n-1.
df = (
    pd.concat([fake, true])
    .pipe(shuffle, random_state=42)
    .reset_index(drop=True)
)

In [11]:
# Remove the 'date' column when it exists; errors='ignore' makes this a
# no-op when the column is absent.
df = df.drop(columns=['date'], errors='ignore')

In [12]:
# Two-stage stratified split with a fixed seed:
#   1. hold out 30% of the rows (temp_df), keeping 70% for training;
#   2. halve the held-out rows into validation and test sets.
# Stratifying on 'label' at each stage keeps the class ratio consistent.
split_options = {'random_state': 42}

train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    **split_options,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    **split_options,
)

In [13]:
# Write each split to '<name>.csv' in the working directory, omitting the
# DataFrame index so the files contain only the data columns.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(f'{split_name}.csv', index=False)

print("Files created: train.csv, val.csv, test.csv")

Files created: train.csv, val.csv, test.csv


In [14]:
# Key prefix under which the processed splits are stored in the bucket.
processed_prefix = 'data/processed/'

# Upload each local split file to '<processed_prefix><file>' and log progress.
for file in ('train.csv', 'val.csv', 'test.csv'):
    s3.upload_file(
        Filename=file,
        Bucket=bucket,
        Key=f'{processed_prefix}{file}',
    )
    print(f'Uploaded {file} to s3://{bucket}/{processed_prefix}')

Uploaded train.csv to s3://fake-news-pranav/data/processed/
Uploaded val.csv to s3://fake-news-pranav/data/processed/
Uploaded test.csv to s3://fake-news-pranav/data/processed/
