# Исследование и моделирование на данных NASDAQ
## Symbol SBUX - Starbacks 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('SBUX.csv', decimal=',')
df.head()

Unnamed: 0,Date,Open,Close,Low,High,Volume,Volatility,Daily Return,Day_Type,DayOfWeek
0,2020-01-02 00:00:00,78.2149,79.3067,78.1528,79.3067,6473800,0.0148,0.014,Bull,4
1,2020-01-03 00:00:00,78.6942,78.8451,78.1972,79.0848,4874100,0.0113,0.0019,Bull,5
2,2020-01-06 00:00:00,78.206,78.2238,77.638,78.4723,5409800,0.0107,0.0002,Bull,1
3,2020-01-07 00:00:00,78.0729,77.9842,77.3362,78.1351,6514700,0.0102,-0.0011,Bear,2
4,2020-01-08 00:00:00,78.0552,78.8895,77.9131,79.3067,7296300,0.0179,0.0107,Bull,3


In [3]:
df.columns

Index(['Date', 'Open', 'Close', 'Low', 'High', 'Volume', 'Volatility',
       'Daily Return', 'Day_Type', 'DayOfWeek'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1370 entries, 0 to 1369
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          1370 non-null   object 
 1   Open          1370 non-null   float64
 2   Close         1370 non-null   float64
 3   Low           1370 non-null   float64
 4   High          1370 non-null   float64
 5   Volume        1370 non-null   int64  
 6   Volatility    1370 non-null   float64
 7   Daily Return  1370 non-null   float64
 8   Day_Type      1370 non-null   object 
 9   DayOfWeek     1370 non-null   int64  
dtypes: float64(6), int64(2), object(2)
memory usage: 107.2+ KB


In [5]:
df['Date'] = pd.to_datetime(df['Date'])
df['Date'].head()

0   2020-01-02
1   2020-01-03
2   2020-01-06
3   2020-01-07
4   2020-01-08
Name: Date, dtype: datetime64[ns]

In [6]:
df = df.sort_values(by='Date', ascending=True)
df['Target'] = np.where(df['Close'].shift(-1)>df['Close'], 1, 0)
df[['Target','Close']].head(20)

Unnamed: 0,Target,Close
0,0,79.3067
1,0,78.8451
2,0,78.2238
3,1,77.9842
4,1,78.8895
5,0,80.354
6,1,80.0256
7,0,80.8422
8,1,80.7623
9,1,81.2505


In [7]:
df['Close_t-1'] = df['Close'].shift(1)
df['Volume_t-1'] = df['Volume'].shift(1)
df['Daily Return_t-1'] = df['Daily Return'].shift(1)

In [8]:
df['SMA_3'] = df['Close'].rolling(window=3).mean()
df['SMA_7'] = df['Close'].rolling(window=7).mean()
df['Diff'] = df['Close'] - df['Open']

In [16]:
df.head(20)

Unnamed: 0,Date,Open,Close,Low,High,Volume,Volatility,Daily Return,Day_Type,DayOfWeek,Target,Close_t-1,Volume_t-1,Daily Return_t-1,SMA_3,SMA_7,Diff
0,2020-01-02,78.2149,79.3067,78.1528,79.3067,6473800,0.0148,0.014,Bull,4,0,,,,,,1.0918
1,2020-01-03,78.6942,78.8451,78.1972,79.0848,4874100,0.0113,0.0019,Bull,5,0,79.3067,6473800.0,0.014,,,0.1509
2,2020-01-06,78.206,78.2238,77.638,78.4723,5409800,0.0107,0.0002,Bull,1,0,78.8451,4874100.0,0.0019,78.791867,,0.0178
3,2020-01-07,78.0729,77.9842,77.3362,78.1351,6514700,0.0102,-0.0011,Bear,2,1,78.2238,5409800.0,0.0002,78.351033,,-0.0887
4,2020-01-08,78.0552,78.8895,77.9131,79.3067,7296300,0.0179,0.0107,Bull,3,1,77.9842,6514700.0,-0.0011,78.365833,,0.8343
5,2020-01-09,80.1055,80.354,79.9457,80.9132,9443500,0.0121,0.0031,Bull,4,0,78.8895,7296300.0,0.0107,79.0759,,0.2485
6,2020-01-10,80.5937,80.0256,79.5552,80.6736,7097900,0.0139,-0.007,Bear,5,1,80.354,9443500.0,0.0031,79.756367,79.089843,-0.5681
7,2020-01-13,80.4339,80.8422,80.07,81.4103,7382500,0.0167,0.0051,Bull,1,0,80.0256,7097900.0,-0.007,80.407267,79.3092,0.4083
8,2020-01-14,80.9399,80.7623,80.647,81.1618,5855200,0.0064,-0.0022,Bear,2,1,80.8422,7382500.0,0.0051,80.543367,79.583086,-0.1776
9,2020-01-15,80.7712,81.2505,80.7712,81.7121,6145300,0.0116,0.0059,Bull,3,1,80.7623,5855200.0,-0.0022,80.951667,80.015471,0.4793


In [9]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns for scaling
numerical_columns = ['Open', 'Close', 'Low', 'High', 'Volume', 'Volatility', 
                     'Daily Return', 'Close_t-1', 'Volume_t-1', 'Daily Return_t-1', 
                     'SMA_3', 'SMA_7', 'Diff']

# Apply StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_columns])

# Create a new DataFrame with scaled features
scaled_df = pd.DataFrame(scaled_features, columns=numerical_columns)
scaled_df.head()

Unnamed: 0,Open,Close,Low,High,Volume,Volatility,Daily Return,Close_t-1,Volume_t-1,Daily Return_t-1,SMA_3,SMA_7,Diff
0,-0.846329,-0.758749,-0.769701,-0.836864,-0.334218,-0.510575,0.891166,,,,,,0.83526
1,-0.807891,-0.795679,-0.766169,-0.854733,-0.588348,-0.762606,0.102637,-0.758247,-0.334272,0.890714,,,0.10039
2,-0.847043,-0.845386,-0.810656,-0.904056,-0.503246,-0.805811,-0.008148,-0.795165,-0.588314,0.102462,-0.803268,,-0.003565
3,-0.857717,-0.864555,-0.834667,-0.93121,-0.32772,-0.841816,-0.092866,-0.844856,-0.503242,-0.008285,-0.838671,,-0.086745
4,-0.859136,-0.792127,-0.788771,-0.836864,-0.203554,-0.287347,0.676113,-0.864018,-0.327776,-0.092973,-0.837482,,0.634145


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Create a pipeline with StandardScaler and LogisticRegression
log_reg = Pipeline([
    ('scaler', scaler),
    ('logistic_regression', LogisticRegression(random_state=42))
])

In [11]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



In [15]:
from eval_models import eval_classifiers
models = {
    'Logistic Regression': log_reg,
    'Random Forest': RandomForestClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0)
}
results = eval_classifiers(models, 'Target', 
          df.drop(columns=['Date', 'Day_Type']), test_size=0.2, random_state=42)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values