In [None]:
import os
import re

import pandas as pd
from google.colab import drive

# Mount Google Drive so files under /content/drive are reachable from this runtime
drive.mount('/content/drive')

# Folder whose *.csv files are listed and processed by the loop further down
folder_path = '/content/drive/MyDrive/stock_market_data/nyse/csv/'

# Fail fast: only announce success once the folder is confirmed to exist,
# instead of surfacing the problem later when the folder is first listed
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f"Data folder not found: {folder_path}")
print("✅ Drive mounted and paths set")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Drive mounted and paths set


In [None]:
def clean_and_process_csv(filepath, filename, date_column='Date', date_format='%d-%m-%Y'):
    """Load one CSV, parse its date column, and drop duplicate/incomplete rows.

    Every column is read as a string (``dtype=str``); only ``date_column`` is
    converted, to datetime. Rows that are exact duplicates or contain any
    missing value (including dates that failed to parse) are removed, and a
    'Company Symbol' column holding the filename without its '.csv' suffix is
    appended.

    Args:
        filepath: Path of the CSV file to read.
        filename: Name used in progress messages and to derive the symbol.
        date_column: Name of the column holding dates.
        date_format: ``strptime`` format the dates are expected to follow.
            Values that do not match become NaT and their rows are dropped.

    Returns:
        The cleaned DataFrame (possibly empty), or None when ``date_column``
        is absent or any exception occurs while reading/processing the file.
    """
    try:
        # Read CSV with flexible parsing: malformed lines only warn, and
        # undecodable bytes are replaced instead of raising
        df = pd.read_csv(
            filepath,
            on_bad_lines='warn',
            dtype=str,
            encoding_errors='replace'
        )

        # Track original stats
        initial_rows = len(df)
        print(f"\n📂 Processing: {filename}")
        print(f"   Initial rows: {initial_rows:,}")

        if date_column not in df.columns:
            print(f"   ⚠️ Date column '{date_column}' not found in the data.")
            return None

        # Convert the date column to datetime, coercing unparseable values to NaT
        df[date_column] = pd.to_datetime(df[date_column], format=date_format, errors='coerce')
        invalid_date_count = int(df[date_column].isna().sum())
        if invalid_date_count:
            print(f"   ⚠️ Found {invalid_date_count} invalid dates in column '{date_column}'. "
                  f"They will be set to NaT and their rows dropped.")

        # drop_duplicates/dropna each return a new frame, so `df` is not aliased
        clean_df = df.drop_duplicates().dropna()

        # Calculate cleaning impact (duplicates AND rows with missing values);
        # guard the percentage so a header-only file is not reported as a failure
        final_rows = len(clean_df)
        rows_removed = initial_rows - final_rows
        pct_removed = 100 * rows_removed / initial_rows if initial_rows else 0.0

        print(f"   Removed {rows_removed:,} rows ({pct_removed:.1f}%)")
        print(f"   Final rows: {final_rows:,}")

        # Strip only a trailing '.csv' (str.replace would remove every occurrence)
        suffix = '.csv'
        symbol = filename[:-len(suffix)] if filename.endswith(suffix) else filename

        # assign() builds a new frame, which avoids SettingWithCopyWarning
        return clean_df.assign(**{'Company Symbol': symbol})

    except Exception as e:
        print(f"\n❌ Failed to process {filename}")
        print(f"   Error: {type(e).__name__} - {str(e)}")

        # Additional error details for parsing issues
        if "Error tokenizing data" in str(e):
            _report_tokenizing_error(filepath, str(e))
        return None


def _report_tokenizing_error(filepath, message):
    """Print the line number named in a tokenizing error and a preview of that line."""
    # The message is searched rather than split so an unexpected wording
    # cannot raise from inside the caller's exception handler
    match = re.search(r'line (\d+)', message)
    if match is None:
        return

    line_number = int(match.group(1))
    print(f"   Issue detected around line: {line_number}")

    # Best-effort preview: stream up to the offending line instead of loading
    # the whole file; only I/O failures are ignored
    try:
        with open(filepath, 'r', errors='replace') as f:
            for current, text in enumerate(f, start=1):
                if current == line_number:
                    print(f"   Sample of line {line_number}: {text[:100]}...")
                    break
    except OSError:
        pass


In [None]:
# Accumulators: cleaned frames for files that processed, names of those that did not
all_clean_dfs, failed_files = [], []

print("🏁 Starting file processing...\n")
print("="*50)

# Walk the folder in sorted (deterministic) order, handling only '.csv' entries
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    full_path = os.path.join(folder_path, filename)
    result = clean_and_process_csv(full_path, filename)

    # A None result means the file was skipped or raised while being processed
    if result is None:
        failed_files.append(filename)
    else:
        all_clean_dfs.append(result)

print("\n" + "="*50)
print(f"✅ Processed {len(all_clean_dfs)} files successfully")
print(f"❌ {len(failed_files)} files had errors: {failed_files}")


🏁 Starting file processing...


📂 Processing: AAC.csv
   Initial rows: 434
   Removed 0 rows (0.0%)
   Final rows: 434

📂 Processing: AAP.csv
   Initial rows: 5,296
   Removed 0 rows (0.0%)
   Final rows: 5,296

📂 Processing: AAT.csv
   Initial rows: 2,999
   Removed 0 rows (0.0%)
   Final rows: 2,999

📂 Processing: AB.csv
   Initial rows: 8,736
   Removed 0 rows (0.0%)
   Final rows: 8,736

📂 Processing: ABC.csv
   Initial rows: 6,973
   Removed 0 rows (0.0%)
   Final rows: 6,973

📂 Processing: ABG.csv
   Initial rows: 5,220
   Removed 0 rows (0.0%)
   Final rows: 5,220

📂 Processing: ABM.csv
   Initial rows: 10,778
   Removed 0 rows (0.0%)
   Final rows: 10,778

📂 Processing: ABR.csv
   Initial rows: 4,704
   Removed 0 rows (0.0%)
   Final rows: 4,704

📂 Processing: ABT.csv
   Initial rows: 10,778
   Removed 0 rows (0.0%)
   Final rows: 10,778

📂 Processing: ACG.csv
   Initial rows: 728
   Removed 727 rows (99.9%)
   Final rows: 1

📂 Processing: ACGL.csv
   Initial rows: 6,860
   Rem

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.loc[:, 'Company Symbol'] = filename.replace('.csv', '')



📂 Processing: BKN.csv
   Initial rows: 7,509
   Removed 0 rows (0.0%)
   Final rows: 7,509

📂 Processing: BKT.csv
   Initial rows: 8,667
   Removed 0 rows (0.0%)
   Final rows: 8,667

📂 Processing: BLH.csv
   Initial rows: 3,340
   Removed 2,965 rows (88.8%)
   Final rows: 375

📂 Processing: BLK.csv
   Initial rows: 5,838
   Removed 0 rows (0.0%)
   Final rows: 5,838

📂 Processing: BLW.csv
   Initial rows: 4,879
   Removed 0 rows (0.0%)
   Final rows: 4,879

📂 Processing: BLX.csv
   Initial rows: 7,611
   Removed 0 rows (0.0%)
   Final rows: 7,611

📂 Processing: BME.csv
   Initial rows: 4,460
   Removed 0 rows (0.0%)
   Final rows: 4,460

📂 Processing: BMO.csv
   Initial rows: 7,082
   Removed 0 rows (0.0%)
   Final rows: 7,082

📂 Processing: BMS.csv
   Initial rows: 12,742
   Removed 1,019 rows (8.0%)
   Final rows: 11,723

📂 Processing: BNCM.csv
   Initial rows: 6,732
   Removed 16 rows (0.2%)
   Final rows: 6,716

📂 Processing: BNJ.csv
   Initial rows: 3,340
   Removed 3,108 rows (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.loc[:, 'Company Symbol'] = filename.replace('.csv', '')


   Removed 0 rows (0.0%)
   Final rows: 8,328

📂 Processing: DST.csv
   Initial rows: 6,826
   Removed 1,184 rows (17.3%)
   Final rows: 5,642

📂 Processing: DSU.csv
   Initial rows: 6,219
   Removed 0 rows (0.0%)
   Final rows: 6,219

📂 Processing: DSX.csv
   Initial rows: 4,463
   Removed 0 rows (0.0%)
   Final rows: 4,463

📂 Processing: DTE.csv
   Initial rows: 13,356
   Removed 0 rows (0.0%)
   Final rows: 13,356

📂 Processing: DTK.csv
   Initial rows: 3,723
   Removed 1,214 rows (32.6%)
   Final rows: 2,509

📂 Processing: DV.csv
   Initial rows: 416
   Removed 0 rows (0.0%)
   Final rows: 416

📂 Processing: DVN.csv
   Initial rows: 9,426
   Removed 0 rows (0.0%)
   Final rows: 9,426

📂 Processing: DX.csv
   Initial rows: 8,780
   Removed 0 rows (0.0%)
   Final rows: 8,780

📂 Processing: DY.csv
   Initial rows: 9,712
   Removed 0 rows (0.0%)
   Final rows: 9,712

📂 Processing: EARN.csv
   Initial rows: 2,423
   Removed 0 rows (0.0%)
   Final rows: 2,423

📂 Processing: EAT.csv
   In

In [None]:
if not all_clean_dfs:
    # Nothing to combine: every file was skipped or failed
    print("\n⚠️ No files were processed successfully")
else:
    # Stack all per-file frames into one table with a fresh 0..n-1 index
    final_df = pd.concat(all_clean_dfs, ignore_index=True)
    print(f"\n🎉 Combined DataFrame shape: {final_df.shape}")
    print("Sample data (first 5 rows):")
    display(final_df.head())

    # Write the combined table to the runtime's local disk, without the index
    save_path = '/content/cleaned_combined_data_nyse.csv'
    final_df.to_csv(save_path, index=False)

    # Hand the saved file to the browser as a download
    from google.colab import files
    files.download(save_path)
    print(f"\n📥 File downloaded: 'cleaned_combined_data_nyse.csv'")



🎉 Combined DataFrame shape: (6899426, 8)
Sample data (first 5 rows):


Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close,Company Symbol
0,2021-03-25,9.520000457763672,9.949999809265137,117100,9.949999809265137,9.869999885559082,9.869999885559082,AAC
1,2021-03-26,9.75,9.850000381469728,158300,9.960000038146973,9.960000038146973,9.960000038146973,AAC
2,2021-03-29,9.5,9.699999809265137,53600,9.902000427246094,9.779999732971191,9.779999732971191,AAC
3,2021-03-30,9.800000190734863,9.800000190734863,52300,9.850000381469728,9.829999923706056,9.829999923706056,AAC
4,2021-03-31,9.75,9.75,78600,9.899999618530272,9.899999618530272,9.899999618530272,AAC


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


📥 File downloaded: 'cleaned_combined_data_nyse.csv'


In [None]:
import pandas as pd
import os

from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Folders holding the previously cleaned, combined CSV for each exchange
cleaned_data_folder_1 = '/content/drive/MyDrive/cleaned_data_stock_market/nasdaq/'
cleaned_data_folder_2 = '/content/drive/MyDrive/cleaned_data_stock_market/nyse/'

# Read both cleaned files, in the same order as before (nasdaq first, then nyse)
df1, df2 = (
    pd.read_csv(os.path.join(folder, csv_name))
    for folder, csv_name in (
        (cleaned_data_folder_1, 'cleaned_combined_data.csv'),
        (cleaned_data_folder_2, 'cleaned_combined_data_nyse.csv'),
    )
)

# Stack the two tables row-wise and renumber the index from zero
combined_df = pd.concat([df1, df2], ignore_index=True)
print(f"✅ Combined DataFrame shape: {combined_df.shape}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Combined DataFrame shape: (15521471, 8)


In [None]:
# Ensure the 'Date' column is in datetime format; unparseable values become NaT
combined_df['Date'] = pd.to_datetime(combined_df['Date'], errors='coerce')

# Single source of truth for the cutoff so the filter and its labels cannot drift
# (the messages previously said "after 2012" while the filter used 2015-01-01)
cutoff_date = '2015-01-01'

# Keep rows dated on or after the cutoff; NaT never satisfies >= and is excluded
filtered_df = combined_df[combined_df['Date'] >= cutoff_date]

print(f"🎉 Filtered DataFrame shape (on or after {cutoff_date}): {filtered_df.shape}")
print(f"Sample data (first 5 rows on or after {cutoff_date}):")
display(filtered_df.head())


  combined_df['Date'] = pd.to_datetime(combined_df['Date'], errors='coerce')


🎉 Filtered DataFrame shape (after 2012): (2908328, 8)
Sample data (first 5 rows after 2012):


Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close,Company Symbol
2332,2015-01-02,53.07,54.279999,10748600.0,54.599998,53.91,51.079906,AAL
2333,2015-01-05,53.34,54.369999,11565000.0,54.540001,53.880001,51.051483,AAL
2334,2015-01-06,52.130001,54.27,13772200.0,54.43,53.040001,50.255581,AAL
2335,2015-01-07,52.119999,53.380001,10069800.0,53.650002,53.009998,50.227158,AAL
2336,2015-01-08,53.25,53.48,9672100.0,54.279999,53.66,50.843033,AAL


In [None]:
# Save the filtered DataFrame to a new CSV file on the runtime's local disk (no index column)
filtered_save_path = '/content/filtered_combined_data_after_2015.csv'
filtered_df.to_csv(filtered_save_path, index=False)

# Download the filtered CSV through the browser
from google.colab import files
files.download(filtered_save_path)

# Report the name derived from the path actually written, so the message
# cannot fall out of sync with the file when the path is edited
print(f"\n📥 Filtered file downloaded: '{os.path.basename(filtered_save_path)}'")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


📥 Filtered file downloaded: 'filtered_combined_data_after_2012.csv'
