# DATA 6545 Project 1 Evaluation Code
- ver. 1.1
Developed by: Dr. Jie Tao

This is the sample evaluation code provided for your project 1. 
- You should evaluate your processed data using this code whenever possible, and record the results;
- Do not modify this code here - create a __copy__ if you decide to do so.
- Note that due to randomness, although I will use the same code to evaluate your final submissions, the results might be slightly different.

In [None]:
# import required package for data ingestion
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

# balance the data
from imblearn.over_sampling import SMOTE

## Mount Google Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the data file
# referenced later by `data_path` can be read from Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read in the Data

In [None]:
#### you should change data_path to point to your OWN data file
data_path = '/content/drive/MyDrive/Colab Notebooks/data/Process 3A.csv'
# index_col=0: use the first CSV column as the row index so it is not
# treated as a feature column
data_df = pd.read_csv(data_path, index_col=0)
# preview the first rows to confirm the file loaded as expected
data_df.head()

Unnamed: 0,T2_prime,T3_prime,S2_prime,Positive_EPS_Dummy,Pre_IPO_Price_Revision,Post_IPO_Initial_Return
0,0.05425,0.908876,0.009199,1.0,0.0,1.0
1,0.051395,0.898724,0.011105,0.0,1.0,0.0
2,0.061764,0.90935,0.006271,0.0,1.0,0.0
3,0.06163,0.91706,0.007144,0.0,1.0,1.0
4,0.04855,0.888469,0.010047,1.0,0.0,1.0


In [None]:
# list all column names (features and target); the cell that defines X and y
# takes the LAST column as the target, so check the order here
data_df.columns

Index(['T2_prime', 'T3_prime', 'S2_prime', 'Positive_EPS_Dummy',
       'Pre_IPO_Price_Revision', 'Post_IPO_Initial_Return'],
      dtype='object')

### NOTE:

1. This code only includes 1 target here - you can only evaluate 1 target at a time. If you want to evaluate another target, define another `y`.
2. It is the norm to arrange your columns in the order *continuous* features, *categorical* features, then the *target*. If your data is not arranged this way, you should select the columns by name, similar to below:
```python
y = data_df['Y']
X = data_df.drop(columns=['Y'])
```

In [None]:
# define features and target
# X: every column except the last one, as a NumPy array
X = data_df.iloc[:,:-1].values
# y: the last column only, as a 1-D NumPy array
y = data_df.iloc[:,-1].values
# if you want a secondary target
### y1 = ...

In [None]:
### y should be binary
# Raise an explicit error (instead of a bare `assert`) so the check cannot be
# stripped by `python -O` and so the failure message shows the labels found.
if len(np.unique(y)) != 2:
  raise ValueError(
      'target y must be binary (exactly 2 unique values), but found {}: {}'.format(
          len(np.unique(y)), np.unique(y)))

In [None]:
# display the dimensions of X and y; the number of rows must match
X.shape, y.shape

((660, 5), (660,))

In [None]:
# resample/balance the data
# note: although we would not normally balance the data this way,
# this works the best for this project
# NOTE(review): the resampling is applied to the full data set before any
# cross-validation split, so resampled rows can end up in both the training
# and the test folds - confirm this is intended for the project.
# random_state is fixed to 2022 for the resampler
sm = SMOTE(random_state = 2022) 
# X_res / y_res are the resampled features and target used for evaluation
X_res, y_res = sm.fit_resample(X, y) 

In [None]:
# display the dimensions after resampling, for comparison with X and y
X_res.shape, y_res.shape

((922, 5), (922,))

In [None]:
# define the model
# logistic regression with the iteration limit set to 2000
# (presumably raised so that the solver converges on this data - TODO confirm)
clf = LogisticRegression(max_iter=2000)

# Evaluation

In [None]:
def my_eval(X, y, classifer = clf, k=10, scoring = 'f1', n_repeats=100, seed=2021):
  '''
  Return the (bias, variance) of a classifier under repeated k-fold
  cross validation.

  INPUTS:
  ----
  - X: features; DataFrame or Numpy ndarray;
  - y: target; DataFrame or Numpy ndarray;
  - classifer: any sklearn (or its add-on) based classifier. The parameter
    name keeps its original spelling so that existing keyword calls still work;
  - k: number of folds in each cross validation run;
  - scoring: evaluation metric ('f1' default or 'roc_auc');
  - n_repeats: number of times the k-fold cross validation is repeated, each
    time with a different shuffle of the data (default 100);
  - seed: seed of the private random generator that draws the shuffle seed
    of each repetition (default 2021).

  OUTPUT:
  ----
  (bias, variance) of the selected metric. For both, lower is better.
  - bias: 1 - mean of the metric over all folds and repetitions, rounded to
    4 decimals; measures the (lack of) accuracy;
  - variance: standard deviation of the metric over all folds and
    repetitions, rounded to 4 decimals; measures the consistency.

  RAISES:
  ----
  ValueError if n_repeats is smaller than 1.
  '''
  if n_repeats < 1:
    raise ValueError('n_repeats must be at least 1')
  #### seed a private generator ONCE, outside the loop: re-seeding inside the
  #### loop would make every repetition draw the same fold seed (i.e. the same
  #### split every time), and seeding the global numpy generator would
  #### silently change the random state of the rest of the notebook
  rng = np.random.RandomState(seed)
  scores = []
  for _ in range(n_repeats):
    #### draw a fresh random number to shuffle the data for this repetition
    fold_seed = rng.randint(0, 3000)
    #### create cross validation folds
    kfold = model_selection.KFold(n_splits=k, random_state=fold_seed, shuffle=True)
    #### record the scores of the classifier that was passed in
    #### (not the global `clf`)
    score = model_selection.cross_val_score(classifer, X=X, y=y, cv=kfold, scoring=scoring)
    scores.append(score)
  scores = np.array(scores)
  #### bias is 1 - average score, variance is the std; round AFTER the
  #### subtraction so the result is cleanly rounded to 4 decimals
  bias = round(1 - scores.mean(), 4)
  variance = round(scores.std(), 4)
  return (bias, variance)

In [None]:
# getting averaged f1_score from 10-fold CV (default)
# the result is (1 - mean f1, std of f1), so lower is better for both values
my_eval(X_res, y_res, clf, 10)

(0.3882, 0.045)

In [None]:
# getting averaged ROC_AUC from 10-fold CV
# the result is (1 - mean ROC_AUC, std of ROC_AUC), so lower is better for both values
my_eval(X_res, y_res, clf, 10, 'roc_auc')

(0.3748, 0.0504)