In [6]:
# Install the analysis dependencies into the environment of the running kernel.
%pip install -q pandas numpy statsmodels scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   ------- -------------------------------- 2.1/11.0 MB 9.0 MB/s eta 0:00:01
   ----------------- ---------------------- 4.7/11.0 MB 12.4 MB/s eta 0:00:01
   ----------------- ---------------------- 4.7/11.0 MB 12.4 MB/s eta 0:00:01
   ----------------------------------- ---- 9.7/11.0 MB 11.2 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 11.4 MB/s eta 0:00:00
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading threadpoolctl


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import StandardScaler

# Load data: read each CSV, parse its 'Date' column into timestamps and use it
# as the index (adjust the column name if a file uses a different one).
# The chained form builds each frame without in-place mutation and without
# leaving a stray loop variable behind at module level.
tesla_data, sp500_data, sales_data = (
    pd.read_csv(csv_name)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
    for csv_name in (
        'Tesla_Close_2019_2024.csv',
        'S&P500_Close_2019_2024.csv',
        'Daily_Sales_2019_2024.csv',
    )
)

# Task 1: Handle missing values and outliers
def clean_data(df, z_threshold=3):
    """Fill gaps and smooth out extreme values in a time-indexed frame.

    Only numeric columns are processed; any other column is passed through
    unchanged. The steps are:

    1. Fill missing values by time-weighted interpolation.
    2. Blank out every value whose absolute per-column Z-score exceeds
       ``z_threshold``.
    3. Interpolate again so the blanked outliers are replaced.

    The input frame is not modified; a cleaned copy is returned, so re-running
    a cell that calls this function always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. ``method='time'`` interpolation requires a
        DatetimeIndex.
    z_threshold : float, default 3
        Absolute Z-score above which a value is treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df`` with the same index and columns.

    Notes
    -----
    Interpolation fills forward by default, so missing values located before
    the first valid observation of a column are left as NaN.
    """
    cleaned = df.copy()
    numeric_cols = cleaned.select_dtypes(include='number').columns
    if numeric_cols.empty:
        # Nothing to interpolate or score.
        return cleaned

    values = cleaned[numeric_cols].interpolate(method='time')

    # Z-scores are computed after the first fill so gaps do not distort the
    # column statistics; a constant column yields NaN scores, which never
    # exceed the threshold.
    z_scores = ((values - values.mean()) / values.std()).abs()
    values = values.mask(z_scores > z_threshold).interpolate(method='time')

    cleaned[numeric_cols] = values
    return cleaned

# Run the cleaning step over each of the three datasets, in the same order.
tesla_data, sp500_data, sales_data = map(
    clean_data, (tesla_data, sp500_data, sales_data)
)

# Task 2: Standardize Tesla trade volume, the S&P 500 data and the sales data
# (assumes these columns are present in the data).
# Each dataset gets its own scaler: refitting a single shared scaler would
# overwrite the parameters learned for the previous dataset, making it
# impossible to inverse-transform those values later.
volume_scaler = StandardScaler()
sp500_scaler = StandardScaler()
sales_scaler = StandardScaler()
# Backward-compatible alias: `scaler` used to end up fitted on the sales data.
scaler = sales_scaler

if 'Volume' in tesla_data.columns:
    # fit_transform returns an (n, 1) array; flatten it to fit a single column.
    tesla_data['Volume'] = volume_scaler.fit_transform(tesla_data[['Volume']]).ravel()

sp500_data = pd.DataFrame(
    sp500_scaler.fit_transform(sp500_data),
    columns=sp500_data.columns,
    index=sp500_data.index,
)
sales_data = pd.DataFrame(
    sales_scaler.fit_transform(sales_data),
    columns=sales_data.columns,
    index=sales_data.index,
)

# Task 3: Check stationarity with ADF test
def adf_test(series, name, alpha=0.05):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a summary.

    Missing values are dropped before testing. The test statistic, p-value
    and critical values are printed, followed by a verdict: the series is
    reported as stationary when the p-value is below ``alpha``.

    Parameters
    ----------
    series : pandas.Series
        Observations to test.
    name : str
        Label used in the printed report.
    alpha : float, default 0.05
        Significance level for the stationarity verdict.
    """
    result = adfuller(series.dropna())
    test_statistic, p_value = result[0], result[1]
    # Cast to built-in floats so the dict prints plain numbers instead of
    # `np.float64(...)` wrappers.
    critical_values = {level: float(value) for level, value in result[4].items()}

    print(f'ADF Test for {name}:')
    print(f'  Test Statistic: {test_statistic}')
    print(f'  p-value: {p_value}')
    print(f'  Critical Values: {critical_values}')
    if p_value < alpha:
        print(f'  {name} is stationary')
    else:
        print(f'  {name} is not stationary')

# Report stationarity for the main series of every dataset
for _frame, _column, _label in (
    (tesla_data, 'Close', 'Tesla Close Price'),
    (sp500_data, 'Close', 'S&P 500 Close Price'),
    (sales_data, 'Sales', 'Daily Sales'),
):
    adf_test(_frame[_column], _label)

# Task 4: Apply differencing if data is not stationary
def make_stationary(df, column_name):
    """Add a first-differenced column when ``column_name`` is not stationary.

    The ADF report for the column is printed first. If the ADF p-value is at
    least 0.05, a ``<column_name>_diff`` column holding the first difference
    is added and its ADF report is printed as well.

    The input frame is not modified; the (possibly extended) copy is
    returned, so re-running a cell that calls this function is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column to test and, if needed, difference.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df``, with the ``_diff`` column added when differencing was
        applied.
    """
    out = df.copy()
    series = out[column_name]

    adf_test(series, column_name)
    # adf_test only prints its verdict, so the p-value is computed here for
    # the decision; the 0.05 cut-off matches the one used in that report.
    p_value = adfuller(series.dropna())[1]

    if p_value >= 0.05:
        diff_name = f'{column_name}_diff'
        # The first difference has no value for the first row; it stays NaN
        # because column assignment aligns on the frame's index.
        out[diff_name] = series.diff()
        adf_test(out[diff_name], f'{column_name} Differenced')
    return out

# Difference the main series of each dataset where the ADF test calls for it.
tesla_data, sp500_data, sales_data = (
    make_stationary(frame, column)
    for frame, column in (
        (tesla_data, 'Close'),
        (sp500_data, 'Close'),
        (sales_data, 'Sales'),
    )
)

# Save cleaned and transformed data (disabled: uncomment and replace the '/path/to/' placeholders to enable)
#tesla_data.to_csv('/path/to/cleaned_Tesla_Close.csv')
#sp500_data.to_csv('/path/to/cleaned_SP500_Close.csv')
#sales_data.to_csv('/path/to/cleaned_Daily_Sales.csv')


ADF Test for Tesla Close Price:
  Test Statistic: -1.8361730014156261
  p-value: 0.3626563685999685
  Critical Values: {'1%': np.float64(-3.434886677803751), '5%': np.float64(-2.8635436366589673), '10%': np.float64(-2.5678367211155533)}
  Tesla Close Price is not stationary
ADF Test for S&P 500 Close Price:
  Test Statistic: -0.6044408180584503
  p-value: 0.8699754232637843
  Critical Values: {'1%': np.float64(-3.434843038681996), '5%': np.float64(-2.8635243777871286), '10%': np.float64(-2.56782646544019)}
  S&P 500 Close Price is not stationary
ADF Test for Daily Sales:
  Test Statistic: -2.3193049996859947
  p-value: 0.16581729431908165
  Critical Values: {'1%': np.float64(-3.4335099516667325), '5%': np.float64(-2.8629359231480236), '10%': np.float64(-2.5675131253354153)}
  Daily Sales is not stationary
ADF Test for Close:
  Test Statistic: -1.8361730014156261
  p-value: 0.3626563685999685
  Critical Values: {'1%': np.float64(-3.434886677803751), '5%': np.float64(-2.8635436366589673)