In [12]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer 
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
import os 
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Extracting Data from a Single Excel File Containing Multiple Stock Index Information
def data_cleaning(original_file_name, saved_file_name, security_id=102434):
    """Extract one security's rows from a multi-index workbook and save them.

    Reads ``original_file_name`` from ``<cwd>/data/feature_data/unfinish data``,
    keeps the rows whose 'Security ID' equals ``security_id``, rewrites
    'The Date of this Option Price' as an unpadded ``D/M/YYYY`` string and
    writes the result to ``saved_file_name`` in the same directory.

    Parameters
    ----------
    original_file_name : str
        Name of the source Excel file inside the data directory.
    saved_file_name : str
        Name of the Excel file to create inside the data directory.
    security_id : int, default 102434
        Value matched against the 'Security ID' column.
        Secid reference: RUT 102434, DJX 102456, SPX 108105.

    Returns
    -------
    None
        The filtered rows are written to disk; nothing is returned.
    """
    index_data_dir = os.path.join(os.getcwd(), 'data', 'feature_data', 'unfinish data')
    original_file = pd.read_excel(os.path.join(index_data_dir, original_file_name))

    # Take an explicit copy so the column assignment below targets an
    # independent frame rather than a slice of ``original_file``.
    filtered_data = original_file[original_file['Security ID'] == security_id].copy()

    # Build the D/M/YYYY text from the timestamp fields: the '%-d' / '%-m'
    # strftime flags are a platform extension and are rejected on Windows.
    # Missing dates are passed through untouched.
    date_column = 'The Date of this Option Price'
    filtered_data[date_column] = filtered_data[date_column].map(
        lambda ts: ts if pd.isna(ts) else f'{ts.day}/{ts.month}/{ts.year}'
    )

    # Define the path for the new Excel file
    output_excel_path = os.path.join(index_data_dir, saved_file_name)

    # Write the filtered data to a new Excel file
    filtered_data.to_excel(output_excel_path, index=False)


In [14]:
def data_clearning_for_option_v1(original_file_name, saved_file_name):
    """Keep only the put rows of a workbook and save them as a new file.

    Both files live in ``<cwd>/data/feature_data/unfinish data``. A row is
    kept when its 'C=Call, P=Put' column equals 'P'; the result is written
    to ``saved_file_name`` without the index column.

    Parameters
    ----------
    original_file_name : str
        Name of the source Excel file inside the data directory.
    saved_file_name : str
        Name of the Excel file to create inside the data directory.
    """
    workbook_dir = os.path.join(os.getcwd(), 'data', 'feature_data', 'unfinish data')
    source_path = os.path.join(workbook_dir, original_file_name)
    all_options = pd.read_excel(source_path)

    # The 'C=Call, P=Put' column flags each row; retain the 'P' rows only.
    is_put = all_options['C=Call, P=Put'] == 'P'
    puts_only = all_options[is_put]

    # Persist the selection next to the source workbook.
    target_path = os.path.join(workbook_dir, saved_file_name)
    puts_only.to_excel(target_path, index=False)

In [15]:
# data cleaning for OptionMetrics - Standardized Options (CALL)
def data_clearning_for_option_v2(original_file_name, saved_file_name, security_id=108105):
    """Extract one security's option rows from a workbook and save them.

    Reads ``original_file_name`` from ``<cwd>/data/feature_data/unfinish data``,
    keeps the rows whose 'Security ID' equals ``security_id``, rewrites
    'The Date of this Option Price' as an unpadded ``D/M/YYYY`` string and
    writes the result to ``saved_file_name`` in the same directory.

    Parameters
    ----------
    original_file_name : str
        Name of the source Excel file inside the data directory.
    saved_file_name : str
        Name of the Excel file to create inside the data directory.
    security_id : int, default 108105
        Value matched against the 'Security ID' column.
        Secid reference: RUT 102434, DJX 102456, SPX 108105.

    Returns
    -------
    None
        The filtered rows are written to disk; nothing is returned.
    """
    index_data_dir = os.path.join(os.getcwd(), 'data', 'feature_data', 'unfinish data')
    original_file = pd.read_excel(os.path.join(index_data_dir, original_file_name))

    # Take an explicit copy so the column assignment below targets an
    # independent frame rather than a slice of ``original_file``.
    filtered_data = original_file[original_file['Security ID'] == security_id].copy()

    # Build the D/M/YYYY text from the timestamp fields: the '%-d' / '%-m'
    # strftime flags are a platform extension and are rejected on Windows.
    # Missing dates are passed through untouched.
    date_column = 'The Date of this Option Price'
    filtered_data[date_column] = filtered_data[date_column].map(
        lambda ts: ts if pd.isna(ts) else f'{ts.day}/{ts.month}/{ts.year}'
    )

    # Define the path for the new Excel file
    output_excel_path = os.path.join(index_data_dir, saved_file_name)

    # Write the filtered data to a new Excel file
    filtered_data.to_excel(output_excel_path, index=False)

In [16]:
def data_cleaning_withZeroContent(original_file_name, saved_file_name):
    """Drop the rows whose strike price is zero and save the remainder.

    Reads ``original_file_name`` from ``<cwd>/data/feature_data/unfinish data``,
    keeps the rows where 'Strike Price, Currently Always Equal to Forward
    Price' is not 0, and writes them to ``saved_file_name`` inside the
    ``temp`` subdirectory of that folder (created if it does not exist).

    Parameters
    ----------
    original_file_name : str
        Name of the source Excel file inside the data directory.
    saved_file_name : str
        Name of the Excel file to create inside the ``temp`` subdirectory.

    Returns
    -------
    None
        The filtered rows are written to disk; nothing is returned.
    """
    index_data_dir = os.path.join(os.getcwd(), 'data', 'feature_data', 'unfinish data')

    # Read the original Excel file
    original_file = pd.read_excel(os.path.join(index_data_dir, original_file_name))

    # Keep only the rows whose strike price column is non-zero
    filtered_data = original_file[original_file['Strike Price, Currently Always Equal to Forward Price'] != 0]

    # The output goes to a 'temp' subfolder; make sure it exists so the
    # write below does not fail on a fresh checkout.
    output_dir = os.path.join(index_data_dir, 'temp')
    os.makedirs(output_dir, exist_ok=True)

    # Define the path for the new Excel file
    output_excel_path = os.path.join(output_dir, saved_file_name)

    # Write the filtered data to a new Excel file
    filtered_data.to_excel(output_excel_path, index=False)

In [17]:
# data cleaning for OptionMetrics - Security Prices
# Extracting Data from a Single Excel File Containing Multiple Stock Index Information
def data_clearning_for_security_price(original_file_name, saved_file_name, security_id=108105):
    """Extract one security's price rows from a workbook and save them.

    Reads ``original_file_name`` from ``<cwd>/data/index_data/unfinish data``,
    keeps the rows whose 'Security ID' equals ``security_id``, rewrites
    'The Date for this Price Record' as an unpadded ``D/M/YYYY`` string and
    writes the result to ``saved_file_name`` in the same directory.

    Parameters
    ----------
    original_file_name : str
        Name of the source Excel file inside the data directory.
    saved_file_name : str
        Name of the Excel file to create inside the data directory.
    security_id : int, default 108105
        Value matched against the 'Security ID' column.
        Secid reference: RUT 102434, DJX 102456, SPX 108105.

    Returns
    -------
    None
        The filtered rows are written to disk; nothing is returned.
    """
    index_data_dir = os.path.join(os.getcwd(), 'data', 'index_data', 'unfinish data')
    original_file = pd.read_excel(os.path.join(index_data_dir, original_file_name))

    # Take an explicit copy so the column assignment below targets an
    # independent frame rather than a slice of ``original_file``.
    filtered_data = original_file[original_file['Security ID'] == security_id].copy()

    # Build the D/M/YYYY text from the timestamp fields: the '%-d' / '%-m'
    # strftime flags are a platform extension and are rejected on Windows.
    # Missing dates are passed through untouched.
    date_column = 'The Date for this Price Record'
    filtered_data[date_column] = filtered_data[date_column].map(
        lambda ts: ts if pd.isna(ts) else f'{ts.day}/{ts.month}/{ts.year}'
    )

    # Define the path for the new Excel file
    output_excel_path = os.path.join(index_data_dir, saved_file_name)

    # Write the filtered data to a new Excel file
    filtered_data.to_excel(output_excel_path, index=False)