In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from numpy import mean
from numpy import std

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# extras
import warnings
# Suppress every warning for the rest of the session (applies to all libraries).
warnings.filterwarnings('ignore')
# None lifts the limit on how many DataFrame columns pandas will display.
pd.set_option('display.max_columns', None)

In [None]:
# Path of the cleaned stock CSV (absolute path; adjust when running elsewhere).
STOCK_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/stock/clean_stock_final"
df = pd.read_csv(STOCK_CSV_PATH)

In [None]:
# Class balance of the Consensus label (value_counts skips NaN by default).
df['Consensus'].value_counts()

1.0    9584
0.0    3261
2.0    2137
3.0     128
Name: Consensus, dtype: int64

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index column — confirm).
df = df.drop(columns=["Unnamed: 0"])

In [None]:
# Work on an independent (deep) copy so `df` itself is left unchanged.
df_testing = df.copy(deep=True)

In [None]:
# Column dtypes and non-null counts, to see which columns contain missing values.
df_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15254 entries, 0 to 15253
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ticker        15254 non-null  object 
 1   Price         15125 non-null  float64
 2   Price_Target  14638 non-null  float64
 3   Consensus     15110 non-null  float64
 4   created_date  15254 non-null  object 
dtypes: float64(3), object(2)
memory usage: 596.0+ KB


In [None]:
# Parse the date strings (format YYYY-MM-DD) into pandas datetimes.
df_testing['created_date'] = pd.to_datetime(df_testing['created_date'], format='%Y-%m-%d')

# Convert to a Unix timestamp in whole seconds.
# The cast names 'int64' explicitly: the underlying epoch values are
# nanoseconds and do not fit in 32 bits, while plain `int` can resolve to a
# 32-bit integer on some platforms.
df_testing['created_date_unix'] = df_testing['created_date'].astype('int64') // 10**9

In [None]:
# The raw datetime column is dropped; only `created_date_unix` is kept as a feature.
df_testing = df_testing.drop(columns='created_date')

In [None]:
# Number of rows per ticker, sorted from the rarest to the most frequent.
df['Ticker'].value_counts().sort_values(ascending=True)

AMAT      5
GE        9
C        13
TRC      15
SYK      40
       ... 
NFLX    148
SBUX    148
SPGI    148
HDB     148
LLY     148
Name: Ticker, Length: 112, dtype: int64

In [None]:
# 1 where Consensus is missing, 0 otherwise (recorded before the value is imputed).
df_testing['Consensus_NA'] = np.where(df_testing['Consensus'].isna(), 1, 0)

In [None]:
# Replace missing Consensus values with the column median.
# NOTE(review): Consensus is the target of the train/test split later in this
# notebook, so this assigns an invented label to the rows that had none —
# consider dropping those rows instead of imputing them.
consensus_median = df_testing['Consensus'].median()
df_testing['Consensus'] = df_testing['Consensus'].fillna(consensus_median)

**Split `df_testing` into train and test sets**

In [None]:
# Features are every column except the Consensus label; 20% of rows are held out.
features = df_testing.drop(columns=['Consensus'])
target = df_testing['Consensus']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=98,
)

In [None]:
# Sanity-check the sizes of the four split outputs.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((12203, 5), (3051, 5), (12203,), (3051,))

**Handling null values**

In [None]:
# Flag rows whose Price is missing (1 = missing) in both splits, before imputation.
for frame in (X_train, X_test):
    frame['Price_NA'] = np.where(frame['Price'].isna(), 1, 0)

In [None]:
# Impute missing Price values with the median learned from the TRAINING split.
# The same training median is applied to the test split so that no test-set
# statistic leaks into preprocessing and both splits are transformed identically.
price_median = X_train['Price'].median()
X_train['Price'] = X_train['Price'].fillna(price_median)
X_test['Price'] = X_test['Price'].fillna(price_median)

In [None]:
# Flag rows whose Price_Target is missing (1 = missing) in both splits, before imputation.
for frame in (X_train, X_test):
    frame['Price_Target_NA'] = np.where(frame['Price_Target'].isna(), 1, 0)

In [None]:
# Impute missing Price_Target values with the median learned from the TRAINING
# split; the test split reuses that value so no test-set statistic is used in
# preprocessing.
price_target_median = X_train['Price_Target'].median()
X_train['Price_Target'] = X_train['Price_Target'].fillna(price_target_median)
X_test['Price_Target'] = X_test['Price_Target'].fillna(price_target_median)

TRANSFORMATION

In [None]:
# Square-root transform of Price, applied to the train split and then the test split.
for frame in (X_train, X_test):
    frame['Price'] = frame['Price'].pow(1/2)

In [None]:
# Square-root transform of Price_Target, applied to the train split and then the test split.
for frame in (X_train, X_test):
    frame['Price_Target'] = frame['Price_Target'].pow(1/2)

OUTLIER HANDLING

In [None]:
def handlingOuliersIQRExe(feature,X_train,X_test):
    """Cap outliers of one column in both splits using IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where the
    quartiles are computed from ``X_train[feature]`` only. The same fences are
    then applied to ``X_test`` so that the test split is transformed with
    parameters learned from training data rather than from its own
    distribution.

    Parameters
    ----------
    feature : column label present in both frames.
    X_train : DataFrame used to compute the fences; modified in place.
    X_test : DataFrame capped with the training fences; modified in place.

    Returns
    -------
    None. Values above the upper fence are set to the upper fence, values
    below the lower fence are set to the lower fence; NaN values are left
    as they are.
    """
    # Quartiles and fences come from the training split only.
    percentile25 = X_train[feature].quantile(0.25)
    percentile75 = X_train[feature].quantile(0.75)
    iqr = percentile75 - percentile25
    upper_limit = percentile75 + 1.5 * iqr
    lower_limit = percentile25 - 1.5 * iqr

    # Apply the identical capping rule to both splits.
    for frame in (X_train, X_test):
        values = frame[feature]
        frame[feature] = np.where(values > upper_limit, upper_limit,
                                  np.where(values < lower_limit, lower_limit, values))


In [None]:
# Numeric columns whose extreme values are capped with the IQR rule.
OutlierFeature = ["Price", "Price_Target"]
for feature in OutlierFeature:
    handlingOuliersIQRExe(feature=feature, X_train=X_train, X_test=X_test)

for target columns

In [None]:
# OutlierFeature = ["Consensus"]
# for feature in OutlierFeature:
#     handlingOuliersIQRExe(feature,y_train,y_test)

ENCODING

In [None]:
# One-hot encode Ticker on the training split; the first category is dropped.
X_train = pd.get_dummies(data=X_train, columns=['Ticker'], drop_first=True)

In [None]:
# One-hot encode Ticker on the test split and align it to the training columns.
# Encoding the two splits independently only yields matching columns when both
# contain exactly the same tickers; several tickers have very few rows, so that
# is not guaranteed. Reindexing to X_train.columns (already encoded in the
# previous cell) guarantees the same columns in the same order: dummies absent
# from the test split are added as all-zero columns, and dummies the training
# split does not have (including its dropped first category) are removed.
X_test = pd.get_dummies(X_test, columns=['Ticker']).reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Row/column counts of both splits after one-hot encoding.
X_train.shape,X_test.shape

((12203, 117), (3051, 117))

**Working with imbalanced data**

RandomOverSampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Seeded random over-sampler; only the training split is resampled.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)


In [None]:
# Put the resampled features and labels side by side in one DataFrame.
resampled_parts = [pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)]
df_resampled = pd.concat(resampled_parts, axis=1)

# Class counts of the label after over-sampling.
freq_dist = df_resampled['Consensus'].value_counts()
print(freq_dist)


0.0    7796
1.0    7796
2.0    7796
3.0    7796
Name: Consensus, dtype: int64


In [None]:
# Shapes of the over-sampled features and labels.
X_resampled.shape, y_resampled.shape

((31184, 117), (31184,))

In [None]:
# Random forest trained on the randomly over-sampled training data.
# A fixed random_state makes the fit reproducible across runs.
Imbalanced_model = RandomForestClassifier(random_state=42)
Imbalanced_model.fit(X_resampled, y_resampled)

# Predict on the (un-resampled) test split.
y_pred = Imbalanced_model.predict(X_test)

# Performance metrics. classification_report takes (y_true, y_pred) in that
# order; passing them the other way round swaps precision with recall and
# reports the support of the predictions instead of the true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.92      0.93       650
         1.0       0.97      0.97      0.97      1928
         2.0       0.99      0.98      0.99       439
         3.0       0.91      0.94      0.93        34

    accuracy                           0.96      3051
   macro avg       0.95      0.96      0.95      3051
weighted avg       0.96      0.96      0.96      3051



**under sampling technique**

In [None]:
from imblearn.under_sampling import RandomUnderSampler,InstanceHardnessThreshold,NearMiss

In [None]:
# Seeded random under-sampler that draws rows without replacement;
# only the training split is resampled.
rus = RandomUnderSampler(sampling_strategy='auto', replacement=False, random_state=92)
X_uresampled, y_uresampled = rus.fit_resample(X_train, y_train)

In [None]:
# Put the under-sampled features and labels side by side in one DataFrame.
uresampled_parts = [pd.DataFrame(X_uresampled), pd.DataFrame(y_uresampled)]
df_uresampled = pd.concat(uresampled_parts, axis=1)

# Class counts of the label after under-sampling.
freq_dist1 = df_uresampled['Consensus'].value_counts()
print(freq_dist1)

0.0    93
1.0    93
2.0    93
3.0    93
Name: Consensus, dtype: int64


In [None]:
# Random forest trained on the randomly under-sampled training data.
# A fixed random_state makes the fit reproducible across runs.
Imbalanced_model = RandomForestClassifier(random_state=42)
Imbalanced_model.fit(X_uresampled, y_uresampled)

# Predict on the (un-resampled) test split.
y_pred = Imbalanced_model.predict(X_test)

# Performance metrics. classification_report takes (y_true, y_pred) in that
# order; passing them the other way round swaps precision with recall and
# reports the support of the predictions instead of the true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.47      0.59      1098
         1.0       0.62      0.89      0.73      1350
         2.0       0.86      0.69      0.76       541
         3.0       1.00      0.56      0.72        62

    accuracy                           0.70      3051
   macro avg       0.82      0.65      0.70      3051
weighted avg       0.73      0.70      0.69      3051



 The minority classes have very low frequency, so I am using `version=2` in NearMiss.

In [None]:
# NearMiss (version 2) under-sampling of the training split.
# sampling_strategy='auto' resamples every class except the minority one;
# n_jobs=4 requests four parallel workers.
nm2 = NearMiss(sampling_strategy='auto', version=2, n_neighbors=3, n_jobs=4)
X_Near_resampled, y_Near_resampled = nm2.fit_resample(X_train, y_train)

In [None]:
# Put the NearMiss-resampled features and labels side by side in one DataFrame.
near_parts = [pd.DataFrame(X_Near_resampled), pd.DataFrame(y_Near_resampled)]
df_Near_resampled = pd.concat(near_parts, axis=1)

# Class counts of the label after NearMiss under-sampling.
freq_dist_near = df_Near_resampled['Consensus'].value_counts()
print(freq_dist_near)

0.0    93
1.0    93
2.0    93
3.0    93
Name: Consensus, dtype: int64


**SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE over-sampling of the training split.
# `n_jobs` is no longer passed: it is deprecated on SMOTE in imbalanced-learn
# and only affected parallelism, not the generated samples.
sm = SMOTE(
    sampling_strategy='auto',  # over-samples every class except the majority one
    random_state=76,  # for reproducibility
    k_neighbors=5,
)

X_res, y_res = sm.fit_resample(X_train,y_train)

In [None]:
# Put the SMOTE-resampled features and labels side by side in one DataFrame.
smote_parts = [pd.DataFrame(X_res), pd.DataFrame(y_res)]
df_SMOTE_resampled = pd.concat(smote_parts, axis=1)

# Class counts of the label after SMOTE.
freq_dist_SMOTE = df_SMOTE_resampled['Consensus'].value_counts()
print(freq_dist_SMOTE)

0.0    7796
1.0    7796
2.0    7796
3.0    7796
Name: Consensus, dtype: int64


In [None]:
# Random forest trained on the SMOTE-resampled training data.
# A fixed random_state makes the fit reproducible across runs.
Imbalanced_model = RandomForestClassifier(random_state=42)
Imbalanced_model.fit(X_res, y_res)

# Predict on the (un-resampled) test split.
y_pred1 = Imbalanced_model.predict(X_test)

# Performance metrics. classification_report takes (y_true, y_pred) in that
# order; passing them the other way round swaps precision with recall and
# reports the support of the predictions instead of the true labels.
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93       657
         1.0       0.97      0.98      0.97      1919
         2.0       0.99      0.98      0.98       439
         3.0       0.94      0.92      0.93        36

    accuracy                           0.97      3051
   macro avg       0.96      0.95      0.96      3051
weighted avg       0.97      0.97      0.97      3051



SCALING (the scaled data is not used by any model yet)

In [None]:
# Separate copies for scaling, so X_train / X_test keep their unscaled values.
X_train_lg = X_train.copy()
X_test_lg = X_test.copy()

In [None]:
# Summary statistics of the training features before scaling.
X_train_lg.describe()

Unnamed: 0,Price,Price_Target,created_date_unix,Consensus_NA,Price_NA,Price_Target_NA,Ticker_ABBV,Ticker_ABT,Ticker_ACN,Ticker_ADBE,Ticker_ADP,Ticker_AMAT,Ticker_AMD,Ticker_AMGN,Ticker_AMT,Ticker_AMZN,Ticker_ASML,Ticker_AVGO,Ticker_AXP,Ticker_AZN,Ticker_BA,Ticker_BABA,Ticker_BAC,Ticker_BHP,Ticker_BLK,Ticker_BMY,Ticker_BP,Ticker_BRK.B,Ticker_BTI,Ticker_BUD,Ticker_C,Ticker_CAT,Ticker_CI,Ticker_CMCSA,Ticker_COP,Ticker_COST,Ticker_CRM,Ticker_CSCO,Ticker_CVS,Ticker_CVX,Ticker_DE,Ticker_DEO,Ticker_DHR,Ticker_DIS,Ticker_ELV,Ticker_EQNR,Ticker_GE,Ticker_GILD,Ticker_GOOGL,Ticker_GS,Ticker_HD,Ticker_HDB,Ticker_HON,Ticker_HSBC,Ticker_IBM,Ticker_INTC,Ticker_INTU,Ticker_JNJ,Ticker_JPM,Ticker_KO,Ticker_LIN,Ticker_LLY,Ticker_LMT,Ticker_LOW,Ticker_MA,Ticker_MCD,Ticker_MDT,Ticker_META,Ticker_MRK,Ticker_MS,Ticker_MSFT,Ticker_NEE,Ticker_NFLX,Ticker_NKE,Ticker_NVDA,Ticker_NVO,Ticker_NVS,Ticker_ORCL,Ticker_PBR,Ticker_PDD,Ticker_PEP,Ticker_PFE,Ticker_PG,Ticker_PLD,Ticker_PM,Ticker_PYPL,Ticker_QCOM,Ticker_RIO,Ticker_RTX,Ticker_RY,Ticker_SAP,Ticker_SBUX,Ticker_SCHW,Ticker_SHEL,Ticker_SNY,Ticker_SONY,Ticker_SPGI,Ticker_SYK,Ticker_T,Ticker_TD,Ticker_TM,Ticker_TMO,Ticker_TMUS,Ticker_TRC,Ticker_TSLA,Ticker_TSM,Ticker_TTE,Ticker_TXN,Ticker_UL,Ticker_UNH,Ticker_UNP,Ticker_UPS,Ticker_V,Ticker_VZ,Ticker_WFC,Ticker_WMT,Ticker_XOM
count,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0,12203.0
mean,12.324233,13.385253,1673557000.0,0.009506,0.008604,0.040728,0.009424,0.009424,0.00967,0.009014,0.006228,0.000328,0.009752,0.00926,0.005081,0.009916,0.009834,0.010079,0.009998,0.010243,0.009424,0.009834,0.009424,0.010243,0.009506,0.009424,0.00967,0.009916,0.003442,0.00549,0.000983,0.010325,0.004917,0.009424,0.009752,0.009424,0.009588,0.010079,0.009752,0.010161,0.009752,0.010325,0.009834,0.009588,0.009506,0.007129,0.000574,0.009178,0.009916,0.009916,0.009588,0.010407,0.010161,0.00967,0.009588,0.008932,0.009834,0.009752,0.009752,0.009424,0.009834,0.009752,0.009588,0.010407,0.009506,0.009506,0.008195,0.009014,0.009998,0.009096,0.009834,0.00967,0.009916,0.009998,0.010407,0.009998,0.009916,0.00885,0.004343,0.006638,0.010079,0.009916,0.010079,0.009916,0.010243,0.003196,0.009834,0.00885,0.009998,0.009834,0.009998,0.008686,0.009506,0.010161,0.009752,0.008932,0.00967,0.002704,0.009752,0.00967,0.00926,0.009916,0.00967,0.000901,0.009916,0.009506,0.009506,0.009752,0.009588,0.009096,0.00967,0.009998,0.009588,0.009506,0.009096,0.009752,0.009178
std,4.848139,5.125189,3560564.0,0.097037,0.092364,0.197667,0.096622,0.096622,0.097862,0.094518,0.078675,0.018103,0.098272,0.095786,0.071101,0.099086,0.09868,0.099893,0.099491,0.100694,0.096622,0.09868,0.096622,0.100694,0.097037,0.096622,0.097862,0.099086,0.058568,0.073897,0.031345,0.101092,0.06995,0.096622,0.098272,0.096622,0.097451,0.099893,0.098272,0.100295,0.098272,0.101092,0.09868,0.097451,0.097037,0.084138,0.023945,0.095366,0.099086,0.099086,0.097451,0.101488,0.100295,0.097862,0.097451,0.094091,0.09868,0.098272,0.098272,0.096622,0.09868,0.098272,0.097451,0.101488,0.097037,0.097037,0.090157,0.094518,0.099491,0.094943,0.09868,0.097862,0.099086,0.099491,0.101488,0.099491,0.099086,0.093663,0.065762,0.081205,0.099893,0.099086,0.099893,0.099086,0.100694,0.056445,0.09868,0.093663,0.099491,0.09868,0.099491,0.092799,0.097037,0.100295,0.098272,0.094091,0.097862,0.051934,0.098272,0.097862,0.095786,0.099086,0.097862,0.030011,0.099086,0.097037,0.097037,0.098272,0.097451,0.094943,0.097862,0.099491,0.097451,0.097037,0.094943,0.098272,0.095366
min,3.042203,3.720215,1666051000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.899719,9.684007,1670371000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11.29646,12.339773,1674000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15.091222,15.948668,1676333000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,24.378477,25.345658,1679530000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Standardizer with default settings; it is fitted and applied in the next cells.
sl = StandardScaler()

In [None]:
# Learn the scaling statistics from the training frame only.
sl.fit(X_train_lg)

In [None]:
# Scale the training features. `transform` returns a bare array, so the frame
# is rebuilt with the original column names AND the original row index;
# without the index the rows would be relabelled 0..n-1 and would no longer
# align with y_train, which keeps the index produced by the split.
X_train_lg = pd.DataFrame(sl.transform(X_train_lg),
                          columns=X_train_lg.columns,
                          index=X_train_lg.index)

In [None]:
# Scale the test features with the scaler fitted on the training data. The
# frame is rebuilt with the original column names AND the original row index;
# without the index the rows would be relabelled 0..n-1 and would no longer
# align with y_test, which keeps the index produced by the split.
X_test_lg = pd.DataFrame(sl.transform(X_test_lg),
                         columns=X_test_lg.columns,
                         index=X_test_lg.index)

In [None]:
# Show 5 randomly chosen scaled rows (no seed is set, so the rows differ between runs).
X_train_lg.sample(5)

Unnamed: 0,Price,Price_Target,created_date_unix,Consensus_NA,Price_NA,Price_Target_NA,Ticker_ABBV,Ticker_ABT,Ticker_ACN,Ticker_ADBE,Ticker_ADP,Ticker_AMAT,Ticker_AMD,Ticker_AMGN,Ticker_AMT,Ticker_AMZN,Ticker_ASML,Ticker_AVGO,Ticker_AXP,Ticker_AZN,Ticker_BA,Ticker_BABA,Ticker_BAC,Ticker_BHP,Ticker_BLK,Ticker_BMY,Ticker_BP,Ticker_BRK.B,Ticker_BTI,Ticker_BUD,Ticker_C,Ticker_CAT,Ticker_CI,Ticker_CMCSA,Ticker_COP,Ticker_COST,Ticker_CRM,Ticker_CSCO,Ticker_CVS,Ticker_CVX,Ticker_DE,Ticker_DEO,Ticker_DHR,Ticker_DIS,Ticker_ELV,Ticker_EQNR,Ticker_GE,Ticker_GILD,Ticker_GOOGL,Ticker_GS,Ticker_HD,Ticker_HDB,Ticker_HON,Ticker_HSBC,Ticker_IBM,Ticker_INTC,Ticker_INTU,Ticker_JNJ,Ticker_JPM,Ticker_KO,Ticker_LIN,Ticker_LLY,Ticker_LMT,Ticker_LOW,Ticker_MA,Ticker_MCD,Ticker_MDT,Ticker_META,Ticker_MRK,Ticker_MS,Ticker_MSFT,Ticker_NEE,Ticker_NFLX,Ticker_NKE,Ticker_NVDA,Ticker_NVO,Ticker_NVS,Ticker_ORCL,Ticker_PBR,Ticker_PDD,Ticker_PEP,Ticker_PFE,Ticker_PG,Ticker_PLD,Ticker_PM,Ticker_PYPL,Ticker_QCOM,Ticker_RIO,Ticker_RTX,Ticker_RY,Ticker_SAP,Ticker_SBUX,Ticker_SCHW,Ticker_SHEL,Ticker_SNY,Ticker_SONY,Ticker_SPGI,Ticker_SYK,Ticker_T,Ticker_TD,Ticker_TM,Ticker_TMO,Ticker_TMUS,Ticker_TRC,Ticker_TSLA,Ticker_TSM,Ticker_TTE,Ticker_TXN,Ticker_UL,Ticker_UNH,Ticker_UNP,Ticker_UPS,Ticker_V,Ticker_VZ,Ticker_WFC,Ticker_WMT,Ticker_XOM
9853,0.583135,0.494793,0.512719,-0.097965,-0.093162,-0.206051,-0.097538,-0.097538,-0.098814,-0.095374,-0.079164,-0.018108,-0.099236,-0.096678,-0.071461,-0.100074,-0.099656,-0.100906,-0.100491,-0.101732,-0.097538,-0.099656,-0.097538,-0.101732,-0.097965,-0.097538,-0.098814,-0.100074,-0.058768,-0.074302,-0.031374,-0.102142,-0.070293,-0.097538,-0.099236,-0.097538,-0.09839,-0.100906,-0.099236,-0.10132,-0.099236,-0.102142,-0.099656,-0.09839,-0.097965,-0.084738,-0.023957,-0.096245,-0.100074,-0.100074,-0.09839,-0.102551,-0.10132,-0.098814,-0.09839,-0.094935,-0.099656,-0.099236,-0.099236,-0.097538,-0.099656,-0.099236,-0.09839,-0.102551,-0.097965,-0.097965,-0.090898,-0.095374,-0.100491,-0.09581,-0.099656,-0.098814,-0.100074,-0.100491,-0.102551,-0.100491,-0.100074,-0.094495,-0.066046,-0.081744,-0.100906,-0.100074,-0.100906,-0.100074,-0.101732,-0.056623,-0.099656,-0.094495,-0.100491,-0.099656,-0.100491,-0.093608,-0.097965,-0.10132,-0.099236,-0.094935,-0.098814,-0.052073,-0.099236,-0.098814,-0.096678,-0.100074,-0.098814,-0.030037,-0.100074,-0.097965,-0.097965,-0.099236,-0.09839,-0.09581,-0.098814,-0.100491,10.163619,-0.097965,-0.09581,-0.099236,-0.096245
9998,-0.4676,-0.58935,0.270051,-0.097965,-0.093162,-0.206051,-0.097538,-0.097538,-0.098814,-0.095374,-0.079164,-0.018108,-0.099236,-0.096678,-0.071461,-0.100074,-0.099656,-0.100906,-0.100491,-0.101732,-0.097538,-0.099656,-0.097538,-0.101732,-0.097965,-0.097538,-0.098814,-0.100074,-0.058768,-0.074302,-0.031374,-0.102142,-0.070293,-0.097538,-0.099236,-0.097538,-0.09839,-0.100906,-0.099236,-0.10132,-0.099236,-0.102142,-0.099656,-0.09839,-0.097965,-0.084738,-0.023957,-0.096245,-0.100074,-0.100074,-0.09839,-0.102551,-0.10132,-0.098814,-0.09839,-0.094935,-0.099656,-0.099236,-0.099236,-0.097538,-0.099656,-0.099236,-0.09839,-0.102551,-0.097965,-0.097965,-0.090898,-0.095374,-0.100491,-0.09581,-0.099656,-0.098814,-0.100074,-0.100491,-0.102551,-0.100491,-0.100074,-0.094495,-0.066046,-0.081744,-0.100906,-0.100074,-0.100906,-0.100074,9.829751,-0.056623,-0.099656,-0.094495,-0.100491,-0.099656,-0.100491,-0.093608,-0.097965,-0.10132,-0.099236,-0.094935,-0.098814,-0.052073,-0.099236,-0.098814,-0.096678,-0.100074,-0.098814,-0.030037,-0.100074,-0.097965,-0.097965,-0.099236,-0.09839,-0.09581,-0.098814,-0.100491,-0.09839,-0.097965,-0.09581,-0.099236,-0.096245
10687,0.207394,0.037091,-0.530754,-0.097965,-0.093162,-0.206051,-0.097538,-0.097538,-0.098814,-0.095374,-0.079164,-0.018108,-0.099236,-0.096678,-0.071461,-0.100074,-0.099656,-0.100906,-0.100491,-0.101732,-0.097538,-0.099656,-0.097538,-0.101732,-0.097965,-0.097538,-0.098814,-0.100074,-0.058768,-0.074302,-0.031374,-0.102142,-0.070293,-0.097538,-0.099236,-0.097538,-0.09839,-0.100906,-0.099236,-0.10132,-0.099236,-0.102142,-0.099656,-0.09839,-0.097965,-0.084738,-0.023957,-0.096245,-0.100074,-0.100074,-0.09839,-0.102551,-0.10132,-0.098814,-0.09839,-0.094935,-0.099656,10.077014,-0.099236,-0.097538,-0.099656,-0.099236,-0.09839,-0.102551,-0.097965,-0.097965,-0.090898,-0.095374,-0.100491,-0.09581,-0.099656,-0.098814,-0.100074,-0.100491,-0.102551,-0.100491,-0.100074,-0.094495,-0.066046,-0.081744,-0.100906,-0.100074,-0.100906,-0.100074,-0.101732,-0.056623,-0.099656,-0.094495,-0.100491,-0.099656,-0.100491,-0.093608,-0.097965,-0.10132,-0.099236,-0.094935,-0.098814,-0.052073,-0.099236,-0.098814,-0.096678,-0.100074,-0.098814,-0.030037,-0.100074,-0.097965,-0.097965,-0.099236,-0.09839,-0.09581,-0.098814,-0.100491,-0.09839,-0.097965,-0.09581,-0.099236,-0.096245
9988,1.213486,1.08595,-0.894756,-0.097965,-0.093162,-0.206051,-0.097538,-0.097538,-0.098814,-0.095374,-0.079164,-0.018108,-0.099236,-0.096678,-0.071461,-0.100074,-0.099656,-0.100906,-0.100491,-0.101732,-0.097538,-0.099656,-0.097538,-0.101732,-0.097965,-0.097538,-0.098814,-0.100074,-0.058768,-0.074302,-0.031374,-0.102142,-0.070293,-0.097538,-0.099236,-0.097538,-0.09839,-0.100906,-0.099236,-0.10132,-0.099236,-0.102142,-0.099656,-0.09839,-0.097965,-0.084738,-0.023957,-0.096245,-0.100074,-0.100074,-0.09839,-0.102551,-0.10132,-0.098814,-0.09839,-0.094935,-0.099656,-0.099236,-0.099236,-0.097538,10.034524,-0.099236,-0.09839,-0.102551,-0.097965,-0.097965,-0.090898,-0.095374,-0.100491,-0.09581,-0.099656,-0.098814,-0.100074,-0.100491,-0.102551,-0.100491,-0.100074,-0.094495,-0.066046,-0.081744,-0.100906,-0.100074,-0.100906,-0.100074,-0.101732,-0.056623,-0.099656,-0.094495,-0.100491,-0.099656,-0.100491,-0.093608,-0.097965,-0.10132,-0.099236,-0.094935,-0.098814,-0.052073,-0.099236,-0.098814,-0.096678,-0.100074,-0.098814,-0.030037,-0.100074,-0.097965,-0.097965,-0.099236,-0.09839,-0.09581,-0.098814,-0.100491,-0.09839,-0.097965,-0.09581,-0.099236,-0.096245
11067,0.103611,0.082976,0.391385,-0.097965,-0.093162,-0.206051,-0.097538,-0.097538,-0.098814,-0.095374,-0.079164,-0.018108,-0.099236,-0.096678,-0.071461,-0.100074,-0.099656,-0.100906,-0.100491,-0.101732,-0.097538,-0.099656,-0.097538,-0.101732,-0.097965,-0.097538,-0.098814,-0.100074,-0.058768,-0.074302,-0.031374,-0.102142,-0.070293,-0.097538,-0.099236,-0.097538,10.163619,-0.100906,-0.099236,-0.10132,-0.099236,-0.102142,-0.099656,-0.09839,-0.097965,-0.084738,-0.023957,-0.096245,-0.100074,-0.100074,-0.09839,-0.102551,-0.10132,-0.098814,-0.09839,-0.094935,-0.099656,-0.099236,-0.099236,-0.097538,-0.099656,-0.099236,-0.09839,-0.102551,-0.097965,-0.097965,-0.090898,-0.095374,-0.100491,-0.09581,-0.099656,-0.098814,-0.100074,-0.100491,-0.102551,-0.100491,-0.100074,-0.094495,-0.066046,-0.081744,-0.100906,-0.100074,-0.100906,-0.100074,-0.101732,-0.056623,-0.099656,-0.094495,-0.100491,-0.099656,-0.100491,-0.093608,-0.097965,-0.10132,-0.099236,-0.094935,-0.098814,-0.052073,-0.099236,-0.098814,-0.096678,-0.100074,-0.098814,-0.030037,-0.100074,-0.097965,-0.097965,-0.099236,-0.09839,-0.09581,-0.098814,-0.100491,-0.09839,-0.097965,-0.09581,-0.099236,-0.096245


**Model Selection and evaluation** 

In [None]:
# Baseline random forest trained on the original (not resampled) training split.
# A fixed random_state makes the fit reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test split.
y_pred = model.predict(X_test)

# Performance metrics. classification_report takes (y_true, y_pred) in that
# order; passing them the other way round swaps precision with recall and
# reports the support of the predictions instead of the true labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95       651
         1.0       0.99      0.97      0.98      1942
         2.0       0.98      0.99      0.98       438
         3.0       0.91      1.00      0.95        20

    accuracy                           0.97      3051
   macro avg       0.95      0.98      0.97      3051
weighted avg       0.97      0.97      0.97      3051



Random forest with hyperparameter tuning

In [None]:
# Search space for the random forest (also reused by the randomized search below).
param_grid = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)

# Exhaustive search over every combination, with GridSearchCV's default CV and scoring.
rf_estimator = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_estimator, param_grid=param_grid)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

RandomForestClassifier(max_depth=9, max_features=None, max_leaf_nodes=9,
                       n_estimators=50)


In [None]:
# Random forest rebuilt with the configuration reported by the grid search.
# max_features=None is part of that reported configuration and is set
# explicitly, because the constructor default is a different value.
# random_state makes the fit reproducible across runs.
model_grid = RandomForestClassifier(max_depth=9,
                                    max_features=None,
                                    max_leaf_nodes=9,
                                    n_estimators=50,
                                    random_state=42)
model_grid.fit(X_train, y_train)

# Predict with the tuned model itself (`model_grid`), not with the untuned
# baseline `model` from the earlier cell.
y_pred_grid = model_grid.predict(X_test)

# classification_report takes (y_true, y_pred) in that order.
print(classification_report(y_test, y_pred_grid))

              precision    recall  f1-score   support

         1.0       0.94      0.94      0.94       637
         2.0       0.98      0.98      0.98      1938
         3.0       0.98      0.98      0.98       449
         4.0       0.93      1.00      0.96        27

    accuracy                           0.97      3051
   macro avg       0.96      0.97      0.96      3051
weighted avg       0.97      0.97      0.97      3051



Hyperparameter Tuning- RandomizedSearchCV

In [None]:
# Randomized search over the same space as the grid search (`param_grid`),
# using RandomizedSearchCV's default number of sampled candidates.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_grid,
)
random_search.fit(X_train, y_train)
print(random_search.best_estimator_)


RandomForestClassifier(max_depth=6, max_features=None, max_leaf_nodes=9,
                       n_estimators=25)


In [None]:
# Random forest rebuilt with the configuration reported by the randomized search.
# max_features=None is part of that reported configuration and is set
# explicitly, because the constructor default is a different value.
# random_state makes the fit reproducible across runs.
model_random = RandomForestClassifier(max_depth=6,
                                      max_features=None,
                                      max_leaf_nodes=9,
                                      n_estimators=25,
                                      random_state=42)
model_random.fit(X_train, y_train)

# Predict with the tuned model itself (`model_random`), not with the untuned
# baseline `model` from the earlier cell.
y_pred_rand = model_random.predict(X_test)

# classification_report takes (y_true, y_pred) in that order.
print(classification_report(y_test, y_pred_rand))

              precision    recall  f1-score   support

         1.0       0.94      0.94      0.94       637
         2.0       0.98      0.98      0.98      1938
         3.0       0.98      0.98      0.98       449
         4.0       0.93      1.00      0.96        27

    accuracy                           0.97      3051
   macro avg       0.96      0.97      0.96      3051
weighted avg       0.97      0.97      0.97      3051



Hyperparameter tuning: RandomizedSearchCV gives results faster because it takes less time to train the models.

**XGBOOST**

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Base XGBoost classifier: multi-class softmax objective with L1/L2 regularisation.
xgb_model = xgb.XGBClassifier(objective='multi:softmax', reg_alpha=0.1, reg_lambda=0.1)

# Hyperparameter grid (the number of trees is fixed at 100).
params = dict(
    learning_rate=[0.1, 0.01],
    n_estimators=[100],
    max_depth=[3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 5-fold grid search scored on accuracy, using all available cores.
grid_search = GridSearchCV(xgb_model, params, scoring='accuracy', n_jobs=-1, cv=5)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print('Best hyperparameters:', grid_search.best_params_)


Best hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}


In [None]:
# XGBoost model rebuilt with the best hyperparameters printed by the grid search.
# n_estimators is 100 (the value that was searched and selected, not 50), and
# the objective and regularisation terms of the searched estimator are
# repeated here so this model matches the configuration that was tuned.
xgb_model_grid = xgb.XGBClassifier(objective='multi:softmax',
                                   reg_alpha=0.1,
                                   reg_lambda=0.1,
                                   max_depth=5,
                                   colsample_bytree=1.0,
                                   learning_rate=0.1,
                                   subsample=0.8,
                                   n_estimators=100)
xgb_model_grid.fit(X_train, y_train)
y_pred_grid = xgb_model_grid.predict(X_test)

# classification_report takes (y_true, y_pred) in that order; the reverse
# order swaps precision with recall and reports the wrong support.
print(classification_report(y_test, y_pred_grid))

              precision    recall  f1-score   support

           0       0.30      0.96      0.45       199
           1       1.00      0.78      0.88      2492
           2       0.77      0.99      0.87       336
           3       1.00      0.96      0.98        24

    accuracy                           0.82      3051
   macro avg       0.77      0.92      0.79      3051
weighted avg       0.93      0.82      0.85      3051



In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Set up the XGBoost classifier for the second search
xgb_model1 = xgb.XGBClassifier(objective='multi:softmax', reg_alpha=0.1, reg_lambda=0.1)

# Define hyperparameters and their ranges (200 trees this time)
params = {
    'learning_rate': [0.1, 0.01],
    'n_estimators': [200],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Set up grid search on the estimator defined in this cell (`xgb_model1`),
# so the cell does not depend on `xgb_model` from the earlier search.
grid_search1 = GridSearchCV(estimator=xgb_model1, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)

# Train the model and find the optimal hyperparameters
grid_search1.fit(X_train, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', grid_search1.best_params_)


Best hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}


In [None]:
# XGBoost model rebuilt with the best hyperparameters printed by the second
# grid search (200 trees). The objective and regularisation terms of the
# searched estimator are repeated here so this model matches the
# configuration that was tuned.
xgb_model_grid1 = xgb.XGBClassifier(objective='multi:softmax',
                                    reg_alpha=0.1,
                                    reg_lambda=0.1,
                                    max_depth=5,
                                    colsample_bytree=1.0,
                                    learning_rate=0.1,
                                    subsample=0.8,
                                    n_estimators=200)
xgb_model_grid1.fit(X_train, y_train)
y_pred_grid = xgb_model_grid1.predict(X_test)

# classification_report takes (y_true, y_pred) in that order; the reverse
# order swaps precision with recall and reports the wrong support.
print(classification_report(y_test, y_pred_grid))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       600
           1       0.96      0.93      0.95      2023
           2       0.92      0.99      0.95       404
           3       1.00      0.96      0.98        24

    accuracy                           0.93      3051
   macro avg       0.93      0.94      0.93      3051
weighted avg       0.93      0.93      0.93      3051

