# Load Twitter Dataset from GitHub
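This notebook downloads the compressed tweet chunks from the `part_38` folder of the `usc-x-24-us-election` GitHub repository, extracts them, and combines them into a single CSV file saved to Google Drive.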

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the output folder used
# later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import gzip
import os
import shutil
import subprocess
from datetime import datetime

import pandas as pd

# Source repository and output location.
# data_url: GitHub "tree" page of the dataset repository. It is kept for
#   reference only; the code visible in this notebook does not read it
#   (process_data builds its own raw-download URLs).
# base_folder: Google Drive directory under which one "Part_<n>" sub-folder
#   per part is created.
data_url = "https://github.com/sinking8/usc-x-24-us-election/tree/main"
base_folder = "/content/drive/MyDrive/MRP_Offensive_Content_Detection/Twitter_Data/"

def process_data(part_numbers=range(38, 39), chunk_start=21, chunk_end=24):
    """Download, extract and combine the October tweet chunks for each part.

    For every part number, the chunks ``chunk_start``..``chunk_end`` (inclusive)
    are downloaded from the usc-x-24-us-election GitHub repository into
    ``<base_folder>/Part_<n>/``, decompressed next to the archive, read with
    pandas, and concatenated into ``Part_<n>_pre_processed.csv`` in the same
    folder. When a chunk has an ``epoch`` column, a ``date`` column
    (``YYYY-MM-DD``, UTC) is derived from it.

    Processing is best-effort: a chunk that fails to download, extract or parse
    is reported and skipped, and the remaining chunks are still combined.

    Args:
        part_numbers: Iterable of part numbers to process. Defaults to part 38.
        chunk_start: First chunk index to fetch (inclusive).
        chunk_end: Last chunk index to fetch (inclusive).

    Returns:
        A list of ``(part_number, chunk_number)`` tuples for every chunk that
        could not be processed; empty when everything succeeded.
    """
    failed_chunks = []

    for part_number in part_numbers:
        part_folder = os.path.join(base_folder, f"Part_{part_number}")
        os.makedirs(part_folder, exist_ok=True)

        all_chunks = []

        for i in range(chunk_start, chunk_end + 1):
            file_url = f"https://github.com/sinking8/usc-x-24-us-election/raw/main/part_{part_number}/october_chunk_{i}.csv.gz"
            compressed_file = os.path.join(part_folder, f"october_chunk_{i}.csv.gz")
            extracted_file = os.path.join(part_folder, f"october_chunk_{i}.csv")

            try:
                # Download the file. An argument list (no shell) avoids quoting
                # problems with paths, and check=True turns a non-zero wget exit
                # status into an exception instead of silently continuing with
                # an empty or partial archive.
                print(f"Downloading chunk {i} for Part {part_number}...")
                subprocess.run(
                    ["wget", "-q", file_url, "-O", compressed_file],
                    check=True,
                )

                # Extract the .gz file next to the archive.
                print(f"Extracting chunk {i} for Part {part_number}...")
                with gzip.open(compressed_file, 'rb') as f_in, open(extracted_file, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

                # Read the extracted CSV file
                print(f"Reading chunk {i} for Part {part_number}...")
                df = pd.read_csv(extracted_file)

                # Convert epoch seconds to a UTC calendar date. The vectorized
                # conversion replaces the deprecated datetime.utcfromtimestamp
                # and leaves missing or invalid epochs as missing dates rather
                # than discarding the whole chunk.
                if "epoch" in df.columns:
                    epoch_seconds = pd.to_numeric(df["epoch"], errors="coerce")
                    df["date"] = pd.to_datetime(
                        epoch_seconds, unit="s", utc=True, errors="coerce"
                    ).dt.strftime('%Y-%m-%d')

                all_chunks.append(df)
            except Exception as e:
                # Best-effort: report the failure, remember it, and move on.
                print(f"Error processing chunk {i} for Part {part_number}: {e}")
                failed_chunks.append((part_number, i))

        # Combine all chunks into one DataFrame
        if all_chunks:
            combined_df = pd.concat(all_chunks, ignore_index=True)
            processed_file = os.path.join(part_folder, f"Part_{part_number}_pre_processed.csv")

            combined_df.to_csv(processed_file, index=False)
            print(f"Combined file saved: {processed_file}")
        else:
            print(f"No chunks could be processed for Part {part_number}; nothing saved.")

    if failed_chunks:
        print(f"Warning: {len(failed_chunks)} chunk(s) failed (part, chunk): {failed_chunks}")

    return failed_chunks

# Execute the processing
if __name__ == "__main__":
    # Only claim success when no failures were reported. A falsy return value
    # (None or an empty collection) is treated as "nothing failed".
    failed = process_data()
    if failed:
        print(f"Finished with {len(failed)} failed chunk(s): {failed}")
    else:
        print("All parts processed successfully!")


Downloading chunk 21 for Part 38...
Extracting chunk 21 for Part 38...
Reading chunk 21 for Part 38...
Downloading chunk 22 for Part 38...
Extracting chunk 22 for Part 38...
Reading chunk 22 for Part 38...
Downloading chunk 23 for Part 38...
Extracting chunk 23 for Part 38...
Reading chunk 23 for Part 38...
Downloading chunk 24 for Part 38...
Extracting chunk 24 for Part 38...
Reading chunk 24 for Part 38...
Combined file saved: /content/drive/MyDrive/MRP_Offensive_Content_Detection/Twitter_Data/Part_38/Part_38_pre_processed.csv
All parts processed successfully!
