In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import set_config; set_config(display='diagram')

In [None]:
key = os.environ.get('API_KEY')

# Get Data

In [2]:
# Step 1: Define a function to read CSV files and convert them to dataframes
# Get the current working directory (where your script is located)
current_directory = os.getcwd()

# Specify the path to the "raw_data" folder
raw_data_folder = os.path.join(current_directory, "raw_data")

# List all CSV files in the "raw_data" folder
csv_files = [os.path.join(raw_data_folder, file) for file in os.listdir(raw_data_folder) if file.endswith(".csv")]

# passing csv files in to a dataframe
dataframes = [pd.read_csv(file) for file in csv_files]

In [3]:
# Step 2: Define a function to read CSV files and convert them to dataframes
def read_csv_to_dataframe(file_path):
    df = pd.read_csv(file_path)
    return df

In [4]:
# Step 4: Read CSV files, preprocess data, and stack them into a 3D tensor
data = []
stock_names =[]

# print(type(csv_files))

for file in csv_files:
    # Read the CSV file into a DataFrame
    data_df = pd.read_csv(file)
    stock_name = file.split('/')[-1].split('.')[0]
    stock_names.append(stock_name)
    data_df["stock"] = stock_name
    data.append(data_df)

# Pipelines

In [5]:
impute_columns = ["sma25", "sma100", "sma200", "rsi", "macd", "signal", "histogram"]

imputer = Pipeline(
    [
        ('imputer', ColumnTransformer(
            transformers=[
                ('impute', KNNImputer(n_neighbors=10), impute_columns),  # Apply imputation to specific columns
            ],
            remainder='passthrough'  # Keep the remaining columns
        ))
    ]
)


imputer

In [6]:
def df_and_column_transform(arr):
    df = pd.DataFrame(arr, columns=["sma25", "sma100", "sma200", "rsi", "macd", "signal", "histogram",\
    "Unnamed: 0", "open", "high","low", "close", "volume", "vwap", "timestamp", "transactions", "otc", "stock"])
    
    first_col = df.pop("sma25")
    df.insert(16, "sma25", first_col)

    sec_col = df.pop("sma100")
    df.insert(16, "sma100", sec_col)

    third_col = df.pop("sma200")
    df.insert(16, "sma200", third_col)

    fourth_col = df.pop("rsi")
    df.insert(16, "rsi", fourth_col)

    fifth_col = df.pop("macd")
    df.insert(16, "macd", fifth_col)
    
    sixth_col = df.pop("signal")
    df.insert(16, "signal", sixth_col)

    seventh_col = df.pop("histogram")
    df.insert(16, "histogram", seventh_col)
    
    return df

transform_pipe = make_pipeline(
    FunctionTransformer(df_and_column_transform)
)

preprocessor = preprocessor = Pipeline(
    [
        ("transformation", transform_pipe),
    ],
)
preprocessor


def timestamp_transform(df):
    df['date'] = pd.to_datetime(df['timestamp'], unit='ms').dt.date
    first_col = df.pop("date")
    df.insert(0, 'date', first_col)
    # df.set_index(keys='date', inplace=True)
    
    return df

time_pipe = make_pipeline(
    FunctionTransformer(timestamp_transform)
)

preprocessor = Pipeline(
    [
        ("transformation", transform_pipe),
        ("timestamp_convertor", time_pipe),
    ],
)
preprocessor

drop_columns = ["Unnamed: 0", 'timestamp', "transactions", "otc"]

def drop(df):
    unwanted_columns = drop_columns
    df = df.drop(columns=unwanted_columns)
    
    return df

drop_pipe = make_pipeline(
    FunctionTransformer(drop)
)

preprocessor = Pipeline(
    [
        ("transformation", transform_pipe),
        ("timestamp_convertor", time_pipe),
        ("unwanted_columns", drop_pipe),
    ],
)
preprocessor

In [7]:
sec_pipe = Pipeline(
    [
        ("imputer", imputer),
        ("preprocessor", preprocessor),
    ]
)

sec_pipe

In [8]:
scaler_columns = ["open", "high", "low", "close", "volume", "vwap", "sma25", "sma100",
       "sma200", "rsi", "macd", "signal", "histogram"]

exclude = ["stock"]

# Create transformers for numeric and object columns
numeric_transformer = Pipeline(
    steps=[
        ('scaler', MinMaxScaler())
    ]
)

object_transformer = Pipeline(steps=[])

# Combine transformers using ColumnTransformer
sec_scaler = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, scaler_columns),
        # ('obj', object_transformer, exclude),
    ],
    remainder='passthrough'  # Keep the remaining columns
)

# Create a final pipeline
scaler = Pipeline(
    steps=[
        ('scaler', sec_scaler)
    ]
)


scaler

In [9]:
penul_pipe = Pipeline(
    [
        ("imputer", imputer),
        ("preprocessor", preprocessor),
        ("scaler", scaler)
    ]
)

penul_pipe

In [10]:
def final_transformation(arr):
    cols = ["open", "high", "low", "close", "volume", "vwap", "sma25", "sma100", "sma200", "rsi", "macd",\
        "signal", "histogram", "date", "stock"]
    df = pd.DataFrame(arr, columns=cols)
    
    first_col = df.pop("date")
    df.insert(0, "date", first_col)
    
    df.set_index(keys='date', inplace=True)
    
    return df

final_transformation_pipe = make_pipeline(
    FunctionTransformer(final_transformation)
)

final_processing = Pipeline(
    [
        ("final_transformation", final_transformation_pipe)
    ]
)

final_processing

In [11]:
final_pipe = Pipeline(
    [
        ("imputer", imputer),
        ("preprocessor", preprocessor),
        ("scaler", scaler),
        ("transformation", final_processing)
    ]
)

final_pipe

In [12]:
final_pipe.fit_transform(data[0])

Unnamed: 0_level_0,open,high,low,close,volume,vwap,sma25,sma100,sma200,rsi,macd,signal,histogram,stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-08-31,0.444012,0.421922,0.462278,0.473667,0.209187,0.455858,0.456303,0.465804,0.465002,0.540959,0.611511,0.537096,0.616544,CSCO
2018-09-04,0.474197,0.430002,0.485071,0.472367,0.153442,0.460232,0.456303,0.465804,0.465002,0.540959,0.611511,0.537096,0.616544,CSCO
2018-09-05,0.465109,0.415598,0.460833,0.457412,0.169792,0.442263,0.456303,0.465804,0.465002,0.540959,0.611511,0.537096,0.616544,CSCO
2018-09-06,0.461214,0.424381,0.470945,0.457737,0.128526,0.445783,0.456303,0.465804,0.465002,0.540959,0.611511,0.537096,0.616544,CSCO
2018-09-07,0.447257,0.409977,0.464846,0.45026,0.135195,0.439334,0.456303,0.465804,0.465002,0.540959,0.611511,0.537096,0.616544,CSCO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-24,0.749432,0.725628,0.732262,0.716515,0.138634,0.71926,0.662749,0.572735,0.513275,0.739923,0.792342,0.633891,0.788116,CSCO
2023-08-25,0.714703,0.704374,0.733546,0.731469,0.120753,0.721001,0.667199,0.575065,0.517136,0.783763,0.797109,0.625552,0.796904,CSCO
2023-08-28,0.735475,0.718777,0.747351,0.747724,0.115529,0.738791,0.671667,0.577695,0.521415,0.827635,0.806973,0.630943,0.80622,CSCO
2023-08-29,0.750406,0.737045,0.761155,0.759428,0.141263,0.753317,0.677407,0.580872,0.525211,0.857198,0.81839,0.638927,0.816318,CSCO


In [13]:
dataframes = []

for df in data:
    preprocessed_df = final_pipe.fit_transform(df)
    
    dataframes.append(preprocessed_df)

In [14]:
filtered_dataframes = []

for df in dataframes:
    if df.shape == (1257, 14):
        filtered_dataframes.append(df)

In [15]:
unequal_shape = []

for df in dataframes:
    if df.shape != (1257, 14):
        unequal_shape.append(df)

In [16]:
# Check
num_dataframes = len(dataframes)
print("Number of dataframes:", num_dataframes)

unequal = len(unequal_shape)
print(f"Length of unequal dataframe: {unequal}")

f = len(filtered_dataframes)
print(f"Length of filtered dataframe: {f}")

Number of dataframes: 100
Length of unequal dataframe: 3
Length of filtered dataframe: 97


In [17]:
# Step 5: Put in tensor
# Stack the dataframes into a 3D tensor
tensor = np.stack(filtered_dataframes, axis=0)

In [18]:
# Check
# Display the shape of the resulting tensor
print("Shape of the 3D tensor:", tensor.shape)

Shape of the 3D tensor: (97, 1257, 14)


In [28]:
tensor[76]

array([[0.0649826766578036, 0.0432785102623352, 0.08078394523122359, ...,
        0.4490223824282578, 0.38774047415727075, 'AVGO'],
       [0.06842048720221311, 0.04459698246567895, 0.08250218135445334,
        ..., 0.4490223824282578, 0.38774047415727075, 'AVGO'],
       [0.07024682405393068, 0.0466494495038739, 0.08579099268407278,
        ..., 0.4490223824282578, 0.38774047415727075, 'AVGO'],
       ...,
       [0.9249187548679936, 0.9197361696343619, 0.9281696758171688, ...,
        0.4826278201679331, 0.27099517839734527, 'AVGO'],
       [0.9276179733032526, 0.9587535680304472, 0.9406670246325256, ...,
        0.5519569608727248, 0.2819850343937384, 'AVGO'],
       [0.9732629656487526, 0.9592904716596439, 0.9711255788979126, ...,
        0.5843711214517824, 0.2964586707409861, 'AVGO']], dtype=object)