# Data Preprocessing

### Importing data

In [31]:
import pandas as pd

# Load the datasets
df= pd.read_csv('Dataset/Tasla_Stock_Updated_V2.csv')

# Check the first few rows to confirm successful import
print("Dataset Head:")
print(df.head())

print("Dataset Shape:")
print(df.shape[0])

Dataset Head:
   Unnamed: 0        Date       Open       High        Low      Close  \
0           0  2015-01-02  14.858000  14.883333  14.217333  14.620667   
1           1  2015-01-05  14.303333  14.433333  13.810667  14.006000   
2           2  2015-01-06  14.004000  14.280000  13.614000  14.085333   
3           3  2015-01-07  14.223333  14.318667  13.985333  14.063333   
4           4  2015-01-08  14.187333  14.253333  14.000667  14.041333   

     Volume  
0  71466000  
1  80527500  
2  93928500  
3  44526000  
4  51637500  
Dataset Shape:
2274


### Checking null values

In [32]:
missing_values = df.isnull().sum()
print(f"\nMissing Values count: {missing_values}")

total_missing_values = missing_values.sum()
print(f"\nTotal Missing Values: {total_missing_values}")


Missing Values count: Unnamed: 0    0
Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
dtype: int64

Total Missing Values: 0


### Converting date to datetime format and selecting as index

In [33]:
df['Date'] = pd.to_datetime(df['Date'])

df.set_index('Date', inplace=True)

print("\nDataset after setting Date as index:")
print(df.head()) 


Dataset after setting Date as index:
            Unnamed: 0       Open       High        Low      Close    Volume
Date                                                                        
2015-01-02           0  14.858000  14.883333  14.217333  14.620667  71466000
2015-01-05           1  14.303333  14.433333  13.810667  14.006000  80527500
2015-01-06           2  14.004000  14.280000  13.614000  14.085333  93928500
2015-01-07           3  14.223333  14.318667  13.985333  14.063333  44526000
2015-01-08           4  14.187333  14.253333  14.000667  14.041333  51637500


### Splitting dataset into train and test set

In [45]:
split_date = '2021-01-01'

df_train_raw = df[df.index < split_date].copy()
df_test_raw = df[df.index >= split_date].copy()

### Generalized function for feature engineering

In [50]:
def create_features(df):
    df['Monthly_Return'] = df['Close'].pct_change() * 100
    df['MA5'] = df['Close'].rolling(window=5).mean()
    df['MA10'] = df['Close'].rolling(window=10).mean()
    df['MA20'] = df['Close'].rolling(window=20).mean()
    df['Volatility_5'] = df['Close'].rolling(window=5).std()
    df['Volatility_10'] = df['Close'].rolling(window=10).std()
    df['Volatility_20'] = df['Close'].rolling(window=20).std()
    
    df.dropna(inplace=True) 
    return df

### Creating features for train and test set

In [54]:
df_train = create_features(df_train_raw)
df_test = create_features(df_test_raw)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
print("Train head:\n", df_train.head())
print("Test head:\n", df_test.head())

Train shape: (1138, 15)
Test shape: (489, 15)
Train head:
             Unnamed: 0       Open       High        Low      Close     Volume  \
Date                                                                            
2016-06-27         373  12.724000  13.254000  12.524667  13.236667  108081000   
2016-06-28         374  13.459333  13.603333  13.294000  13.452667   93186000   
2016-06-29         375  13.675333  14.118667  13.533333  14.012667   89923500   
2016-06-30         376  14.198000  14.233333  13.934667  14.152000   72646500   
2016-07-01         377  13.742667  14.549333  13.733333  14.433333   81000000   

            Monthly_Return        MA5       MA10       MA20  Volatility_5  \
Date                                                                        
2016-06-27        2.795752  13.391600  13.934200  14.418100      0.710143   
2016-06-28        1.631835  13.154000  13.846400  14.346633      0.211251   
2016-06-29        4.162739  13.334400  13.796334  14.315400      