In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from numpy import mean
from numpy import std
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# extras
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
     

In [3]:
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OneHotEncoder,OrdinalEncoder
from feature_engine.imputation import DropMissingData
from feature_engine.imputation import MeanMedianImputer,AddMissingIndicator,CategoricalImputer
from feature_engine.transformation import LogTransformer

In [4]:
df=pd.read_csv("merge_yahooquery_wallstreet (1)",parse_dates=['created_date'])

In [5]:
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'splits', 'dividends'], axis=1)

In [6]:
df.sample(2)

Unnamed: 0,Ticker,open,low,close,volume,high,adjclose,created_date,Price,Price_Target,Consensus
4161,PM,86.839996,85.970001,86.309998,3480800,87.339996,84.09288,2022-10-18,84.0,105.2,1.0
6351,SBUX,110.040001,108.0,109.150002,9852900,110.830002,108.605797,2023-02-02,109.14,105.26,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7223 entries, 0 to 7222
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Ticker        7223 non-null   object        
 1   open          7223 non-null   float64       
 2   low           7223 non-null   float64       
 3   close         7223 non-null   float64       
 4   volume        7223 non-null   int64         
 5   high          7223 non-null   float64       
 6   adjclose      7223 non-null   float64       
 7   created_date  7223 non-null   datetime64[ns]
 8   Price         7223 non-null   float64       
 9   Price_Target  6788 non-null   float64       
 10  Consensus     7217 non-null   float64       
dtypes: datetime64[ns](1), float64(8), int64(1), object(1)
memory usage: 620.9+ KB


In [8]:
# Parent class
class Test_handler():
    
  def cleanTest(self, df):  
    
#     df['Price'] = pd.to_numeric(df['Price'].replace({'$': '', 'nan': np.nan}), errors='coerce')

    # check if 'Price' column contains 'nan' or '$' values
    if 'nan' in df['Price'].values or '$' in df['Price'].values:
        # replace 'nan' values with NaN
        df['Price'] = df['Price'].replace('nan', np.nan)
        # remove the '$' sign from the 'Price' column and convert to float
        df['Price'] = df['Price'].str.replace('$', '').astype(float)
    
#     df['Price_Target'] =pd.to_numeric(df['Price_Target'].replace({'$': '', 'nan': np.nan}), errors='coerce')

    # check if 'Price_Target' column contains 'nan' or '$' values
    if 'nan' in df['Price_Target'].values or '$' in df['Price_Target'].values:
        # replace 'nan' values with NaN
        df['Price_Target'] = df['Price_Target'].replace('nan', np.nan)
        # remove the '$' sign from the 'Price' column and convert to float
        df['Price_Target'] = df['Price_Target'].str.replace('$', '').astype(float)
        
    # replace values in the 'Price_Target' column that contain 'k' with their numeric value in thousands
    if 'Price_Target' in df.columns and df['Price_Target'].dtype == 'object':
        df.loc[df['Price_Target'].str.contains('k', na=False, regex=True), 'Price_Target'] = df.loc[df['Price_Target'].str.contains('k', na=False, regex=True), 'Price_Target'].apply(lambda x: float(x.replace('k', '')) * 1000)

    df['Price_Target'] = df['Price_Target'].astype(float)
    
    return df

In [9]:

class ModelPipeline(Test_handler):
    
    def __init__(self):
      self.__AddMissingIndicatorVariables = ['Price','Price_Target']
      self.__MeanMedianImputerVarables = ['Price_Target']
      self.__WinsorizerCappingVarables = ['open', 'low', 'close', 'volume', 'high', 'adjclose', 'Price', 'Price_Target']
      self.__LogTransformerVarables = ['open', 'low', 'close', 'volume', 'high', 'adjclose', 'Price', 'Price_Target']
      self.__OneHotEncoderVariables = ['Ticker']
      

      self.pipe = Pipeline([
          
          #  Missing indicator
          ('Add missing indicator',AddMissingIndicator(
            variables=self.__AddMissingIndicatorVariables)),
    
          #   Median Missing Imputation
          ('Median Missing Imputation',MeanMedianImputer(
            imputation_method='median', variables=self.__MeanMedianImputerVarables)),

          #   Outlier Imputation
          ('Winsorizer_CAPPING',Winsorizer(
            capping_method='iqr', variables=self.__WinsorizerCappingVarables)),        

          # Feature Transformation
          ('LogTransformer',LogTransformer(
            variables=self.__LogTransformerVarables)),

          #  OneHotEncoder
          ('OneHotEncoder',OneHotEncoder(
            drop_last=True,variables=self.__OneHotEncoderVariables)),

          # Model Train
          ('Model', xgb.XGBClassifier(colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8))
      ])
    
    def handleData(self, df):
        
        df = super().cleanTest(df)  # call parent class's cleanTest() method       
        df['created_date'] = pd.to_datetime(df['created_date'], format='%Y-%m-%d').astype('int64') // 10**9
        df['Ticker'] = np.where(df['Ticker'].map(df['Ticker'].value_counts()) >= 10, df['Ticker'], 'Other')
#         df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'splits', 'dividends'], axis=1).iloc[:, :]
        df['Consensus'] = df['Consensus'].apply(lambda x: np.where(x == 'none', np.nan, x))
        # df['Consensus'] = df['Consensus'].replace({'none': np.nan}).map({'Strong Buy': 0, 'Buy': 1, 'Hold': 2, 'Sell': 3, 'Strong Sell': 3}.get)
        df = df.dropna(subset=['Consensus'])
        return df
    
    def train(self, df):
        df = self.handleData(df)

        X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Consensus']),
                                                            df['Consensus'],
                                                            test_size=0.2,
                                                            random_state=44)

        self.pipe.fit(X_train, y_train)
    def predict(self, df):
        df = super().cleanTest(df) # call parent class's cleanTest() method
        df['created_date'] = pd.to_datetime(df['created_date'], format='%Y-%m-%d').astype('int64') // 10**9
        df['Ticker'] = np.where(df['Ticker'].map(df['Ticker'].value_counts()) >= 10, df['Ticker'], 'Other')
#         df['Consensus'] = np.nan # set the target column to nan
#         X = self.pipe.transform(df)
        y_pred = self.pipe.predict(df)
        return y_pred

In [10]:
model = ModelPipeline()
model.train(df)

In [14]:
model.predict(df1)

array([1, 0], dtype=int64)

In [15]:
# make predictions on the test data
predictions=model.predict(df1)

# print the predictions
print(predictions)

[1 0]


In [11]:
df1=df.iloc[[6577,9]]
df1.sample(2)

Unnamed: 0,Ticker,open,low,close,volume,high,adjclose,created_date,Price,Price_Target,Consensus
6577,CAT,227.0,225.779999,228.979996,3044000,229.839996,227.893158,1670371200,228.29,222.42,1.0
9,AAPL,138.5,134.589996,134.869995,74917800,138.550003,134.664383,1667952000,139.5,182.59,0.0


In [12]:
df1 = df1.drop(columns=['Consensus'], axis=1)

In [13]:
df1.head(1)

Unnamed: 0,Ticker,open,low,close,volume,high,adjclose,created_date,Price,Price_Target
6577,CAT,227.0,225.779999,228.979996,3044000,229.839996,227.893158,1670371200,228.29,222.42


In [18]:
# save model to pickle file
with open('model1.pk1', 'wb') as f:
    pickle.dump(model, f)

In [36]:
# import numpy as np

# # create a NumPy array with the data you want to predict on
# data = np.array([
#     ["CAT", 227.0, 225.779999, 228.979996, 3044000, 229.839996, 227.893158, 1670371200, 228.29, 222.42]
# ])

# # make predictions on the data using the trained model
# predictions = model.predict(data[:, 1:])

# # print the predictions
# print(predictions)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices