In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle  # For saving serialized data

# Absolute, machine-specific location of the raw CSV export (adjust the path
# as needed before running elsewhere).
file_path = r"C:\Users\ADMIN\Desktop\Needed\ALGO\ALGO.csv"

# Load the data. header=None tells pandas not to treat the first row as column
# names, so every cell in the file -- including any header text it contains --
# is handed to preprocess_data as data, and columns get integer labels.
data = pd.read_csv(file_path, header=None)

# Function to preprocess data
def preprocess_data(data, end_marker="(0,0,0'END')"):
    """Split the raw cell grid into sequences of parsed entries.

    Cells are visited row by row, left to right. Each usable cell is expected
    to look like ``(label,range,time,news)``; a cell equal to ``end_marker``
    closes the sequence collected so far.

    Args:
        data: DataFrame whose cells hold the raw entry strings (NaN cells are
            ignored).
        end_marker: Exact cell text (after trimming surrounding whitespace)
            that terminates the current sequence.

    Returns:
        A list of sequences. Each sequence is a non-empty list of
        ``[label, range_val, time_val, news]`` rows where ``label`` is the
        first field kept as a string (quotes removed), ``range_val`` and
        ``time_val`` are floats and ``news`` is an int.

    Cells that cannot be parsed are reported with ``print`` and skipped; they
    never abort the run.
    """
    sequences = []
    sequence = []

    for row in data.values:
        for entry in row:
            if pd.isna(entry):
                continue

            # Numeric or otherwise non-text cells have no .strip()/.split();
            # report and skip them instead of crashing with AttributeError.
            if not isinstance(entry, str):
                print(f"Skipping malformed entry: {entry}")
                continue

            # Tolerate stray whitespace around a cell so the marker still matches.
            entry = entry.strip()
            if entry == end_marker:
                if sequence:
                    sequences.append(sequence)
                    sequence = []
                continue

            entry = entry.strip("()")
            parts = entry.split(",")
            if len(parts) < 4:
                print(f"Skipping malformed entry: {entry}")
                continue

            # Only the numeric conversions can raise, so keep the try minimal.
            try:
                range_val = float(parts[1])
                time_val = float(parts[2])
                news = int(parts[3])
            except ValueError as e:
                print(f"Error processing entry: {entry} -> {e}")
                continue

            # First field is kept verbatim as text (it later fills the 'B/S'
            # column); only surrounding whitespace and quotes are removed.
            label = parts[0].strip().strip("'")

            # Only add valid entries
            if not np.isnan(range_val) and not np.isnan(time_val):
                sequence.append([label, range_val, time_val, news])

    if sequence:
        sequences.append(sequence)  # Add any remaining sequence

    return sequences

# Process the data
sequences = preprocess_data(data)

# Flatten sequences into one table. Row order follows the order of
# `sequences`; the windowing loop below relies on that to locate each
# sequence inside the flattened table.
all_data = [entry for seq in sequences for entry in seq]
if not all_data:
    # Without rows there is nothing to scale or window; fail with a clear
    # message instead of an obscure shape error further down.
    raise ValueError("No valid entries were parsed; nothing to normalize or window")
# Each row mixes a text label with numbers, so numpy stores every cell as text.
all_data = np.array(all_data)

# Create a DataFrame for easier handling
df_normalized = pd.DataFrame(all_data, columns=['B/S', 'range', 'time', 'news'])

# Normalize the continuous features. They arrive as text (see above), so cast
# them to float explicitly rather than relying on the scaler to coerce them.
scaler = MinMaxScaler()
continuous = df_normalized[['range', 'time']].astype(float)
df_normalized[['range', 'time']] = scaler.fit_transform(continuous)

# Sliding window with 'END' handling.
# preprocess_data consumes the END markers while splitting the input, so no
# 'END' row ever reaches df_normalized and scanning the table for it cannot
# detect a boundary. Instead, windows are generated per sequence: a window of
# WINDOW_SIZE rows and the row it predicts always belong to the same sequence.
WINDOW_SIZE = 5
feature_values = df_normalized[['B/S', 'range', 'time', 'news']].values
labels = df_normalized['B/S'].values

X = []
y = []

# Offset of the current sequence's first row inside the flattened table
sequence_start = 0
for seq in sequences:
    sequence_stop = sequence_start + len(seq)

    # The last usable window start leaves one row of the same sequence after
    # the window to serve as the label; shorter sequences yield no windows.
    for start_index in range(sequence_start, sequence_stop - WINDOW_SIZE):
        # All columns including 'B/S' are used as features
        X.append(feature_values[start_index:start_index + WINDOW_SIZE])
        y.append(labels[start_index + WINDOW_SIZE])

    sequence_start = sequence_stop

# Convert X and y into numpy arrays
X = np.array(X)
y = np.array(y)

print(f"Features (X) shape: {X.shape}")
print(f"Labels (y) shape: {y.shape}")
print(X[:5])
print(y[:5])
# Save preprocessed data
def save_data(X, y, output_dir=r"C:\Users\ADMIN\Desktop\Needed\ALGO"):
    """Write the feature and label arrays to ``output_dir`` in three formats.

    Files written: ``X.npy`` / ``y.npy`` (NumPy), ``X.csv`` / ``y.csv`` (one
    sample per line, each sample's window flattened into a single row, for
    inspection) and ``preprocessed_data.pkl`` (a pickled ``(X, y)`` tuple).

    Args:
        X: Feature array whose first axis indexes samples.
        y: Label array with one entry per sample.
        output_dir: Destination directory; created if it does not exist.
    """
    # Local import: os is not imported at the top of the visible file.
    import os

    X = np.asarray(X)
    y = np.asarray(y)

    os.makedirs(output_dir, exist_ok=True)

    # os.path.join uses the platform's separator; a hard-coded "\\" only
    # yields a path inside output_dir on Windows.
    np.save(os.path.join(output_dir, "X.npy"), X)
    np.save(os.path.join(output_dir, "y.npy"), y)

    # Save as CSV for inspection. reshape(n, -1) cannot infer the second
    # dimension when there are no elements, so handle the empty case directly.
    flat_X = X.reshape(X.shape[0], -1) if X.size else np.empty((0, 0))
    pd.DataFrame(flat_X).to_csv(os.path.join(output_dir, "X.csv"), index=False, header=False)
    pd.DataFrame(y).to_csv(os.path.join(output_dir, "y.csv"), index=False, header=False)

    # Save as Pickle
    with open(os.path.join(output_dir, "preprocessed_data.pkl"), "wb") as f:
        pickle.dump((X, y), f)

    print(f"Data saved in {output_dir}")

# Persist the windowed features and labels; no output_dir is passed, so
# save_data writes to its default directory.
save_data(X, y)

Skipping malformed entry: DATE
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not convert string to float: 'range(pips)'
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not convert string to float: 'range(pips)'
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not convert string to float: 'range(pips)'
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not convert string to float: 'range(pips)'
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not convert string to float: 'range(pips)'
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not convert string to float: 'range(pips)'
Error processing entry: Data(buy(1)/sell(-1),range(pips),time(minutes),news(yes||no as 1||0 -> could not 