### This file contains functions related to loading, cleaning, and preproessing data.


The link to where the data was dowloanded from : https://www.cryptodatadownload.com/data/poloniex/#google_vignette

In [None]:
# This code loads the data from a path

import pandas as pd

def load_data(csv_path):
    # If the first row of your CSV is just a note (e.g. “Data provided by...”),
    # then skip it with skiprows=1. If not, set skiprows=0 or remove that parameter.
    
    df = pd.read_csv(
        csv_path,
        skiprows=1,            # Adjust if your first row is already column headers
        parse_dates=['date'],  # Tells Pandas to convert the 'date' column to datetime64 objects.
        infer_datetime_format=True  #Lets Pandas guess the date format more efficiently, which can speed up parsing if your file is big.

    )
    
    #Sets the "date" column as the index of the DataFrame.
    df.set_index('date', inplace=True)
    
    #sorts the DataFrame by the date/time index, just in case the rows were out of order.
    df.sort_index(inplace=True)
    
    return df



In [None]:
#Use of the load_data function 
path_to_csv = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Poloniex_BTCUSDT_1h.csv"
df_prices = load_data(path_to_csv)
    
print("DataFrame shape:", df_prices.shape)
print(df_prices.head())
print(df_prices.tail())

series models:

Date/Time : Usually kept as an index.

Open, High, Low, Close (OHLC) : Core price features for any trading or forecasting model.

Volume : Overall volume is often a good indicator of market interest and liquidity.

Trade Count (Optional) : Helps distinguish whether volume came from many small trades or fewer large trades.


In [16]:
# This code Keep only the columns that are most useful for ur dataframe

def keep_important_columns(df):

    # Define the columns you want to keep
    columns_to_keep = [
        "open",
        "high",
        "low",
        "close",
        "Volume BTC",   
        "tradeCount"   
    ]
    
    # Intersect with what actually exists in your DataFrame to avoid KeyErrors
    existing_cols = [col for col in columns_to_keep if col in df.columns]
    
    # Create a reduced DataFrame with only these columns
    df_reduced = df[existing_cols].copy()
    return df_reduced


In [None]:
# use of keep_important_columns
df_prices = keep_important_columns(df_prices)
print(df_prices)

In [None]:
# Save the DataFrame to CSV
#df_prices.to_csv('modified_btc_data.csv')


Below is a data-quality checker , It identifies common issues in a crypto price DataFrame, such as:

Missing Values: Looks for NaN/None in columns.

Duplicate Rows: Checks whether any exact duplicates exist.

Negative or Zero Price/Volume: Flags rows where prices or volumes are invalid.

Out-of-Order Date Index: Ensures that your date index is strictly increasing (important for time-series).

Unexpected Data Types: Verifies that “open”, “high”, “low”, “close”, and “volume” columns are numeric.

In [20]:
# Data Quality Checker

import numpy as np

def check_data_problems(df):

    problems_summary = {}

    # 1) Check for missing values
    missing = df.isnull().sum()
    has_missing = missing.any()
    if has_missing:
        print(">> MISSING VALUES found per column:")
        print(missing[missing > 0])
        problems_summary['missing_values'] = missing[missing > 0].to_dict()
    else:
        print("No missing values detected.")
        problems_summary['missing_values'] = {}
    
    # 2) Check for duplicate rows
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f">> DUPLICATE ROWS found: {duplicate_count}")
        problems_summary['duplicate_rows'] = duplicate_count
    else:
        print("No duplicate rows found.")
        problems_summary['duplicate_rows'] = 0
    
    # 3) Negative or zero price/volume checks
    # Adjust these columns to match your DataFrame (e.g., 'Volume BTC', 'Volume USDT', etc.)
    price_cols = [col for col in ['open','high','low','close'] if col in df.columns]
    volume_cols = [col for col in ['Volume BTC','Volume USDT','Volume USD','volume'] if col in df.columns]

    invalid_prices = {}
    for col in price_cols:
        # Count how many rows have a non-positive price
        non_pos = (df[col] <= 0).sum()
        if non_pos > 0:
            invalid_prices[col] = non_pos
    
    invalid_volumes = {}
    for col in volume_cols:
        non_pos = (df[col] <= 0).sum()
        if non_pos > 0:
            invalid_volumes[col] = non_pos

    if invalid_prices:
        print(">> INVALID (≤0) PRICE VALUES found:")
        for c, count in invalid_prices.items():
            print(f"   Column '{c}': {count} rows")
        problems_summary['invalid_prices'] = invalid_prices
    else:
        print("No invalid (zero/negative) price values found.")
        problems_summary['invalid_prices'] = {}
    
    if invalid_volumes:
        print(">> INVALID (≤0) VOLUME VALUES found:")
        for c, count in invalid_volumes.items():
            print(f"   Column '{c}': {count} rows")
        problems_summary['invalid_volumes'] = invalid_volumes
    else:
        print("No invalid (zero/negative) volume values found.")
        problems_summary['invalid_volumes'] = {}

    # 4) Out-of-order date index check (only if index is datetime-like)
    if isinstance(df.index, pd.DatetimeIndex):
        # Check if the index is strictly increasing
        is_increasing = df.index.is_monotonic_increasing
        if not is_increasing:
            print(">> The date index is NOT strictly increasing. Some timestamps may be out of order.")
            problems_summary['date_order'] = "Not strictly increasing"
        else:
            print("Date index is in ascending order (strictly increasing).")
            problems_summary['date_order'] = "Ascending"
    else:
        print("Index is not a DatetimeIndex (skipping date-order check).")
        problems_summary['date_order'] = None

    # 5) Data-type checks for numeric columns
    numeric_checks = {}
    for col in price_cols + volume_cols:
        if col in df.columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                numeric_checks[col] = "Non-numeric type"
    if numeric_checks:
        print(">> NON-NUMERIC COLUMNS found (expected numeric):")
        for c, msg in numeric_checks.items():
            print(f"   Column '{c}' => {msg}")
    else:
        print("All price/volume columns have numeric types.")
    problems_summary['non_numeric_columns'] = numeric_checks

    print("\n=== DATA QUALITY CHECK COMPLETE ===\n")
    return problems_summary


In [None]:
# 3. Run the data-quality checker 
problems_report = check_data_problems(df_prices)

# 4. If you want to do something programmatic with the results:
print("Problems Summary (as dict):")
print(problems_report)

'''problems found :
No missing values detected.
>> DUPLICATE ROWS found: 3357
No invalid (zero/negative) price values found.
>> INVALID (≤0) VOLUME VALUES found:
   Column 'Volume BTC': 4458 rows
Date index is in ascending order (strictly increasing).
All price/volume columns have numeric types.

=== DATA QUALITY CHECK COMPLETE ===

Problems Summary (as dict):
{'missing_values': {}, 'duplicate_rows': 3357, 'invalid_prices': {}, 'invalid_volumes': {'Volume BTC': 4458}, 'date_order': 'Ascending', 'non_numeric_columns': {}}
'''

### Deal with Duplicate Rows

In [22]:
# This will return every row that has a duplicate somewhere else
duplicates_all = df[df.duplicated(keep=False)]

print(f"Total rows considered duplicates (including the first occurrence): {len(duplicates_all)}")
duplicates_all.head(10)  # display a few rows


Total rows considered duplicates (including the first occurrence): 0


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
