# Data Preprocessing

### Importing data

In [56]:
import pandas as pd

# Read the stock price CSV into a single DataFrame
df = pd.read_csv('Dataset/Tasla_Stock_Updated_V2.csv')

# Preview the first rows to confirm the file was parsed as expected
print("Dataset Head:", df.head(), sep="\n")

# Note: despite the "Shape" label, only the number of rows is printed
print("Dataset Shape:", len(df), sep="\n")

Dataset Head:
   Unnamed: 0        Date       Open       High        Low      Close  \
0           0  2015-01-02  14.858000  14.883333  14.217333  14.620667   
1           1  2015-01-05  14.303333  14.433333  13.810667  14.006000   
2           2  2015-01-06  14.004000  14.280000  13.614000  14.085333   
3           3  2015-01-07  14.223333  14.318667  13.985333  14.063333   
4           4  2015-01-08  14.187333  14.253333  14.000667  14.041333   

     Volume  
0  71466000  
1  80527500  
2  93928500  
3  44526000  
4  51637500  
Dataset Shape:
2274


### Checking null values

In [57]:
# Per-column count of null entries
missing_values = df.isna().sum()
print("\nMissing Values count:", missing_values)

# Grand total of null entries across every column
total_missing_values = missing_values.sum()
print("\nTotal Missing Values:", total_missing_values)


Missing Values count: Unnamed: 0    0
Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
dtype: int64

Total Missing Values: 0


### Converting Date to datetime format and setting it as the index

In [58]:
# Parse 'Date' as datetime and promote it to the index.
# Guard on the index name so this cell can be re-run safely: after the first run
# 'Date' is the index rather than a column, and df['Date'] would raise KeyError.
if df.index.name != 'Date':
    df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

print("\nDataset after setting Date as index:")
print(df.head())


Dataset after setting Date as index:
            Unnamed: 0       Open       High        Low      Close    Volume
Date                                                                        
2015-01-02           0  14.858000  14.883333  14.217333  14.620667  71466000
2015-01-05           1  14.303333  14.433333  13.810667  14.006000  80527500
2015-01-06           2  14.004000  14.280000  13.614000  14.085333  93928500
2015-01-07           3  14.223333  14.318667  13.985333  14.063333  44526000
2015-01-08           4  14.187333  14.253333  14.000667  14.041333  51637500


### Splitting dataset into train and test set

In [59]:
# Chronological hold-out: rows dated before the cutoff are used for training,
# rows on or after the cutoff for testing.
split_date = '2021-01-01'

df_train_raw = df.loc[df.index < split_date].copy()
df_test_raw = df.loc[df.index >= split_date].copy()

### Generalized function for feature engineering

In [60]:
def create_features(df: pd.DataFrame, windows=(5, 10, 20)) -> pd.DataFrame:
    """
    Derive return, moving-average and rolling-volatility features from 'Close'.

    The input frame is NOT modified; a new frame is returned. This keeps the
    calling cell idempotent: previously the features were written into the
    caller's frame and rows were dropped in place, so re-running the call on
    the same raw frame trimmed additional warm-up rows each time.

    Parameters:
        df (pd.DataFrame): Frame containing at least a 'Close' column, ordered
            so that consecutive rows are consecutive observations.
        windows (iterable of int): Rolling window lengths. For each window w
            the columns 'MA{w}' and 'Volatility_{w}' are produced. Defaults to
            (5, 10, 20), matching the original hard-coded columns.

    Returns:
        pd.DataFrame: Copy of `df` with the added columns 'Monthly_Return',
            'MA{w}' and 'Volatility_{w}', with every row containing a NaN in
            any column removed (this drops the rolling warm-up rows).

    Note:
        'Monthly_Return' is the percent change versus the previous row, not a
        calendar-month return. The column name is kept because callers refer to
        it by that name.
    """
    close = df['Close']

    # Build all new columns first so the caller's frame is never written to.
    new_columns = {'Monthly_Return': close.pct_change() * 100}
    for window in windows:
        new_columns[f'MA{window}'] = close.rolling(window=window).mean()
    # Second pass keeps the original column order: all MAs, then all volatilities.
    for window in windows:
        new_columns[f'Volatility_{window}'] = close.rolling(window=window).std()

    # assign() and dropna() both return new frames, leaving `df` untouched.
    return df.assign(**new_columns).dropna()

### Creating features for train and test set

In [61]:
# Build the engineered feature frames for each split independently
df_train = create_features(df_train_raw)
df_test = create_features(df_test_raw)

# Report the resulting sizes, then preview each frame
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print(f"Train head:\n {df_train.head()}")
print(f"Test head:\n {df_test.head()}")

Train shape: (1492, 13)
Test shape: (744, 13)
Train head:
             Unnamed: 0       Open       High        Low      Close    Volume  \
Date                                                                           
2015-01-30          19  13.597333  13.831333  13.533333  13.573333  45105000   
2015-02-02          20  13.598000  14.130000  13.553333  14.062667  62238000   
2015-02-03          21  14.214667  14.691333  14.084667  14.557333  72393000   
2015-02-04          22  14.552667  14.765333  14.453333  14.570000  49581000   
2015-02-05          23  14.658667  15.032000  14.642667  14.732667  52843500   

            Monthly_Return        MA5       MA10       MA20  Volatility_5  \
Date                                                                        
2015-01-30       -0.779733  13.609333  13.367867  13.550367      0.192533   
2015-02-02        3.605114  13.667867  13.487000  13.522467      0.278766   
2015-02-03        3.517584  13.832933  13.663200  13.550033      0.49031

In [63]:
from sklearn.preprocessing import MinMaxScaler

def scale_features_minmax(df_train, df_test, feature_cols):
    """
    Min-max scale the selected columns of a train/test pair.

    The scaler is fitted on the training frame only and then applied to both
    frames, so no statistics from the test frame leak into the scaling.

    Parameters:
        df_train (pd.DataFrame): Training dataset
        df_test (pd.DataFrame): Testing dataset
        feature_cols (list): Names of the columns to scale

    Returns:
        df_train_scaled (pd.DataFrame): Copy of the training data with scaled columns
        df_test_scaled (pd.DataFrame): Copy of the testing data with scaled columns
        scaler (MinMaxScaler): The scaler fitted on the training data
    """
    # Learn the scaling parameters from the training data only
    fitted = MinMaxScaler().fit(df_train[feature_cols])

    scaled_frames = []
    for frame in (df_train, df_test):
        # Work on a copy so the caller's frame keeps its original values
        scaled = frame.copy()
        # Transform only (never re-fit) so the test split cannot influence the scaler
        scaled[feature_cols] = fitted.transform(frame[feature_cols])
        scaled_frames.append(scaled)

    train_scaled, test_scaled = scaled_frames
    return train_scaled, test_scaled, fitted


# Columns that are passed through the min-max scaler
features_to_scale = [
    'Close',
    'Monthly_Return',
    'MA5',
    'MA10',
    'MA20',
    'Volatility_5',
    'Volatility_10',
    'Volatility_20',
]

# Scale both splits; the fitted scaler is kept for later use
df_train_scaled, df_test_scaled, fitted_scaler = scale_features_minmax(
    df_train, df_test, features_to_scale
)
print(f"Scaled Train shape: {df_train_scaled.shape}")
print(f"Scaled Test shape: {df_test_scaled.shape}")

Scaled Train shape: (1492, 13)
Scaled Test shape: (744, 13)
