In [1]:
# pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# pip install pyspark
# pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import logging
import os
import sys

# Tell PySpark which Python to launch: workers use the interpreter running this
# kernel, so driver and workers stay on the same Python version.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


In [3]:
# Prepare the Java runtime through the cluster's environment-module system
# before the Spark session is created in the next cell.
# NOTE(review): `lmod` is presumably the jupyter-lmod Python API; its calls are
# coroutines, hence the top-level `await` (valid only inside an IPython kernel).
import lmod
# Presumably unloads every currently loaded module so the load below starts
# from a clean environment; `force=True` semantics should be confirmed.
await lmod.purge(force=True)
# Load the JDK 17.0.5 module.
await lmod.load('jdk/17.0.5')

In [4]:

# Create the notebook-wide Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Process Large JSON")
    .config("spark.executor.memory", "96g")
    .config("spark.driver.memory", "64g")
    .getOrCreate()
)

# Location of the tweet_1.json shard.
file_path = "/project/swabhas_1457/Section_20243_30249_26/Data/Tweets/tweet_1.json"

# Bare last expression: the session summary becomes the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/23 23:07:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:


# # Set the Python version for Spark driver and worker
# python_path = sys.executable  # Get the path of the currently used Python (which should be 3.12.7)

# # Set environment variables to ensure same Python version in both driver and workers
# os.environ["PYSPARK_PYTHON"] = python_path
# os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"  # Ensure the driver uses Python 3.x

# # Verify the environment variables are set correctly
# print(f"PYSPARK_PYTHON is set to: {os.environ['PYSPARK_PYTHON']}")
# print(f"PYSPARK_DRIVER_PYTHON is set to: {os.environ['PYSPARK_DRIVER_PYTHON']}")

# print(sys.version)


In [6]:
# Logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger()

# Inputs: tweet shards are addressed as <tweet_files_base_path><i>.json;
# labels and split assignments come from one CSV each.
tweet_files_base_path = "/project/swabhas_1457/Section_20243_30249_26/Data/Tweets/tweet_"
label_file_path = "/project/swabhas_1457/Section_20243_30249_26/Data/label.csv"
split_file_path = "/project/swabhas_1457/Section_20243_30249_26/Data/split.csv"

# Outputs: one Parquet target per split, all under the same directory.
output_dir = "/project/swabhas_1457/Section_20243_30249_26/Data/Tweet_SingleFile_Split/"
train_file = os.path.join(output_dir, "tweets_train_0.parquet")
val_file = os.path.join(output_dir, "tweets_val_0.parquet")
test_file = os.path.join(output_dir, "tweets_test_0.parquet")

# Step 1: load the per-user label table and the per-user split assignment.
# The `id` column is renamed on read so the two tables expose distinct key
# names (`user_id` and `split_id`) for the joins performed later.
logger.info("Reading labels and split data...")
labels_df = (
    spark.read
    .option("header", True)
    .csv(label_file_path)
    .selectExpr("id as user_id", "label")
)
split_df = (
    spark.read
    .option("header", True)
    .csv(split_file_path)
    .selectExpr("id as split_id", "split")
)
logger.info(f"Loaded labels data with {labels_df.count()} rows and split data with {split_df.count()} rows.")

# Writer helper: append to an existing Parquet target, otherwise create it.
def save_to_parquet(df, file_path):
    """Write ``df`` as Parquet to ``file_path``.

    If ``file_path`` already exists on the local filesystem the rows are
    appended to it; otherwise the target is written in ``overwrite`` mode.
    One INFO message describing the chosen branch is logged before writing.

    Args:
        df: Spark DataFrame to persist.
        file_path: Destination path of the Parquet output.
    """
    target_exists = os.path.exists(file_path)
    if target_exists:
        logger.info(f"{file_path} exists, appending data...")
    else:
        logger.info(f"{file_path} does not exist, creating a new file...")

    write_mode = "append" if target_exists else "overwrite"
    df.write.mode(write_mode).parquet(file_path)

# Step 2: Process tweet shards named tweet_<i>.json.
# This run covers only shards 7 and 8 (range(7, 9)).
# NOTE: save_to_parquet appends whenever the target already exists, so running
# this loop again for a shard that was already written adds its rows a second
# time to the train/val/test outputs.
for i in range(7, 9):
    tweet_file_path = f"{tweet_files_base_path}{i}.json"

    # Missing shards are reported and skipped rather than treated as errors.
    if not os.path.exists(tweet_file_path):
        logger.warning(f"Tweet file {tweet_file_path} does not exist.")
        continue

    logger.info(f"Processing tweet file: {tweet_file_path}")

    # Step 2.1: Read tweets, keep complete English rows, and build the join key
    # by prefixing the author id with 'u'.
    tweets_df = (
        spark.read.option("multiline", "true")
        .json(tweet_file_path)
        .select("author_id", "text", "lang")
        .dropna()
    )
    tweets_df = tweets_df.filter(col("lang") == "en").selectExpr("concat('u', author_id) as user_id", "text")
    logger.info(f"Loaded tweets data with {tweets_df.count()} rows from {tweet_file_path}.")

    # Step 2.2: Join with labels and split data.
    # The joined frame feeds five more count() actions and three writes below.
    # Spark evaluates lazily, so without caching every one of those actions
    # would re-parse the JSON shard and redo both joins.
    tweets_labels_df = (
        tweets_df.join(labels_df, on="user_id", how="inner")
        .join(split_df, tweets_df.user_id == split_df.split_id, how="inner")
        .drop("split_id")
        .cache()
    )
    try:
        logger.info(f"Joined dataset contains {tweets_labels_df.count()} rows.")

        # Step 2.3: Split data into train, val, and test (exact duplicate rows removed).
        logger.info("Splitting data into train, val, and test...")
        train_df = tweets_labels_df.filter(col("split") == "train").dropDuplicates()
        val_df = tweets_labels_df.filter(col("split") == "val").dropDuplicates()
        test_df = tweets_labels_df.filter(col("split") == "test").dropDuplicates()
        logger.info(f"Train set: {train_df.count()} rows, Validation set: {val_df.count()} rows, Test set: {test_df.count()} rows.")

        # Step 2.4: Select 5 random tweets per user in training data.
        # Users with fewer than 5 tweets in this shard keep all of them.
        logger.info("Selecting up to 5 random tweets per user in the training data...")
        train_sampled_df = train_df.groupBy("user_id").applyInPandas(
            lambda pdf: pdf.sample(n=min(len(pdf), 5), random_state=42),
            schema=train_df.schema
        )
        logger.info(f"Sampled train set contains {train_sampled_df.count()} rows.")

        # Step 2.5: Save data to Parquet
        logger.info("Saving splits to Parquet files...")
        save_to_parquet(train_sampled_df, train_file)
        save_to_parquet(val_df, val_file)
        save_to_parquet(test_df, test_file)

        logger.info(f"Appended train data to: {train_file}")
        logger.info(f"Appended val data to: {val_file}")
        logger.info(f"Appended test data to: {test_file}")
    finally:
        # Release the cached shard before moving on, even if a step failed.
        tweets_labels_df.unpersist()

# Step 3: Validate saved files by reading each output back, logging its total
# row count, and showing the first 5 rows.
# The loop uses its own names (saved_path / saved_df) so it does not rebind the
# notebook-level `file_path` or the `split_df` table used by the joins above.
logger.info("Validating saved files...")
for split_name, saved_path in (
    ("Train", train_file),
    ("Validation", val_file),
    ("Test", test_file),
):
    saved_df = spark.read.parquet(saved_path)
    logger.info(f"{split_name} data: {saved_df.count()} rows.")
    saved_df.show(5)


2024-11-23 23:07:18,492 - INFO - Reading labels and split data...
2024-11-23 23:07:21,921 - INFO - Loaded labels data with 1000000 rows and split data with 1000000 rows.
2024-11-23 23:07:21,922 - INFO - Processing tweet file: /project/swabhas_1457/Section_20243_30249_26/Data/Tweets/tweet_7.json
2024-11-23 23:11:16,639 - INFO - Loaded tweets data with 6726905 rows from /project/swabhas_1457/Section_20243_30249_26/Data/Tweets/tweet_7.json.
2024-11-23 23:12:53,789 - INFO - Joined dataset contains 6726905 rows.          
2024-11-23 23:12:53,789 - INFO - Splitting data into train, val, and test...
2024-11-23 23:18:02,809 - INFO - Train set: 5273729 rows, Validation set: 940312 rows, Test set: 478005 rows.
2024-11-23 23:18:02,810 - INFO - Selecting up to 5 random tweets per user in the training data...
2024-11-23 23:20:55,767 - INFO - Sampled train set contains 997121 rows.        
2024-11-23 23:20:55,768 - INFO - Saving splits to Parquet files...
2024-11-23 23:20:55,768 - INFO - /project/sw

+--------------------+--------------------+-----+-----+
|             user_id|                text|label|split|
+--------------------+--------------------+-----+-----+
|         u1000012406|@Lesbian_Moses @_...|human|train|
|u1000087449933107200|@PWilliams101 @se...|human|train|
|u1000130044897918977|RT @cricketcrocke...|human|train|
|u1000130044897918977|RT @GlennsTheorem...|human|train|
|u1000130044897918977|@ike_onwuka Thank...|human|train|
+--------------------+--------------------+-----+-----+
only showing top 5 rows



2024-11-23 23:44:13,774 - INFO - Validation data: 9981966 rows.                 
                                                                                

+--------------------+--------------------+-----+-----+
|             user_id|                text|label|split|
+--------------------+--------------------+-----+-----+
|u1000709400435216389|RT @AmazingInnova...|human|  val|
|u1000709400435216389|RT @scienceClub01...|human|  val|
|u1000709400435216389|RT @AmazingNature...|human|  val|
|u1000709400435216389|RT @amazing_physi...|human|  val|
|u1000709400435216389|RT @MachinePix: T...|human|  val|
+--------------------+--------------------+-----+-----+
only showing top 5 rows



2024-11-23 23:44:15,621 - INFO - Test data: 5954349 rows.                       


+----------+--------------------+-----+-----+
|   user_id|                text|label|split|
+----------+--------------------+-----+-----+
|u100301671|@meganakpeters @2...|human| test|
|u100301671|@meganakpeters Th...|human| test|
|u100301671|@Fitzy_Red @chipn...|human| test|
|u100301671|RT @KordingLab: A...|human| test|
|u100301671|  Working on teyf :)|human| test|
+----------+--------------------+-----+-----+
only showing top 5 rows



In [7]:
# Report the version of the Python interpreter running this kernel.
import sys

print(sys.version)

3.12.7 | packaged by conda-forge | (main, Oct  4 2024, 16:05:46) [GCC 13.3.0]
