In [1]:
import csv
import glob, os
from datetime import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import SVR

import warnings; warnings.simplefilter('ignore')

In [2]:
import warnings; warnings.simplefilter('ignore')

In [3]:
def pre_process_data(data,null_threshold):
    """
    Cleans a raw stock dataframe and returns a new, fully numeric dataframe.

    The dataframe passed in is never modified, so the cell that calls this
    function can be re-run on the same input without errors.

    Steps, in order:
    1. Drops the 'Unix Date' and 'Date' columns.
    2. Drops every column whose null count is greater than
       null_threshold percent of the row count.
    3. Converts the remaining columns to numeric; values that cannot be
       parsed become NaN.
    4. Replaces infinite values with NaN.
    5. Drops every row that contains a NaN.

    Parameters
    ----------
    data : dataframe
        must contain the 'Unix Date' and 'Date' columns.

    null_threshold : numeric
        maximum percentage (0-100) of null values a column may contain
        and still be kept.

    Returns
    -------
    data : dataframe
        a new dataframe after performing all the operations.

    Raises
    ------
    KeyError
        if 'Unix Date' or 'Date' is missing from data.
    """
    # drop() without inplace returns a new frame; the caller's data is untouched.
    cleaned = data.drop(columns=['Unix Date', 'Date'])

    # Null counts are taken on the raw values, before numeric coercion.
    max_nulls = null_threshold * cleaned.shape[0] / 100
    keep_mask = (cleaned.isnull().sum() <= max_nulls).to_numpy()
    cleaned = cleaned.loc[:, keep_mask]

    cleaned = cleaned.apply(pd.to_numeric, errors='coerce')
    # Infinities are removed after coercion so that any infinite value
    # produced by the conversion itself is caught as well.
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)
    return cleaned.dropna(axis=0)

In [4]:
def dependent_column(data,column):
    """
    Selects the feature columns plus the dependent (target) column.

    A column is kept as a feature when its name ends with "gr"
    (case-insensitive) and does not contain "next" (case-insensitive).
    The dependent column is then added to the selection, unless it is
    already part of it, so it never appears twice.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the dependent (target) column

    Returns
    -------
    data : dataframe
        a new dataframe (a copy, not a view of the input) holding only
        the selected columns.
    column : string
        name of the dependent (target) column, returned unchanged

    Raises
    ------
    KeyError
        if column is not present in data.
    """
    cols = [
        col for col in data.columns
        if "next" not in str(col).lower() and str(col).lower().endswith("gr")
    ]
    # Appending unconditionally would duplicate a target that already passes
    # the filter, and data[column] would then return a DataFrame.
    if column not in cols:
        cols.append(column)
    # Copy so that callers can add columns to the result without writing
    # into a slice of the original frame.
    return (data[cols].copy(), column)

In [5]:
def create_confusion_matrix(y_pred,y_true):
    """
    Computes classification metrics for a set of predictions.

    Note the argument order: predictions come first, true labels second.
    Every scikit-learn call below receives them as (y_true, y_pred).

    Parameters
    ----------
    y_pred : array-like
        predicted labels

    y_true : array-like
        ground-truth labels

    Returns
    -------
    result : dict
        with the keys "accuracy", "precision", "recall", "f1_score" and
        "confusion matrix". Precision, recall and F1 use scikit-learn's
        default settings.
    """
    # All metrics are reached through the `metrics` namespace so the function
    # depends on a single imported name.
    return {
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "precision": metrics.precision_score(y_true, y_pred),
        "recall": metrics.recall_score(y_true, y_pred),
        "f1_score": metrics.f1_score(y_true, y_pred),
        "confusion matrix": metrics.confusion_matrix(y_true, y_pred),
    }

In [6]:
def SVM_classification(df,column = "Next Day Close Price GR",threshold = 0.01):
    """
    Tunes an SVM classifier that predicts whether `column` reaches `threshold`.

    The target is 1 where df[column] >= threshold and 0 otherwise. The
    remaining columns are the features. The data is split 70/30 with a fixed
    random_state, a grid search (scored on precision) is fitted on the
    training part, and the refitted best estimator is evaluated on the
    test part.

    The input dataframe is not modified.

    Parameters
    ----------
    df : dataframe
        feature columns plus the `column` to derive the target from.

    column : string
        name of the column the binary target is derived from.

    threshold : numeric
        minimum value of `column` for a row to be labelled 1.

    Returns
    -------
    myres : dict
        the metrics from create_confusion_matrix, the best grid-search
        parameters, and the "threshold" that was used.
    """
    # Vectorised labelling; NaN compares False and is therefore labelled 0.
    target = (df[column] >= threshold).astype(int).rename("Target")
    # A pre-existing "Target" column is excluded from the features as well,
    # so frames labelled by an earlier run do not leak the label.
    features = df.drop(columns=[column, "Target"], errors="ignore")

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=0)

    # NOTE: gamma has no effect on the linear kernel, so the linear
    # candidates are evaluated once per gamma value with identical results.
    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma':[0.1,0.01,0.001,0.0001,0.00001],
                  'kernel': ['sigmoid','rbf','linear']}
    grid = GridSearchCV(SVC(),param_grid, refit = True, verbose = 3,scoring = "precision")

    # Fit once: refit=True already retrains the best estimator on the full
    # training split, so a second fit only repeats the whole search.
    grid.fit(x_train, y_train)
    y_pred = grid.predict(x_test)

    myres = create_confusion_matrix(y_pred,y_test)
    myres.update(grid.best_params_)
    myres.update({"threshold":threshold})
    return myres


In [7]:
# Read top.csv from the current working directory. The processing loop reads
# its 'security id' column to build each per-stock file name.
top = pd.read_csv("top.csv")

In [8]:
# Directory holding the per-stock CSV files, relative to the working directory.
# Each component is passed separately so os.path.join inserts the platform's
# own separator instead of mixing "\\" and "/" on Windows.
path = os.path.join(os.getcwd(), "Data", "Stock")
path

'f:\\PracticumProject\\StockAnalysisTool\\Data/Stock'

In [24]:
# One flat record per (security, threshold) pair. `reseach` below is a single
# flat dict that every result overwrites, so this list is what preserves the
# individual results once more than one threshold or security is evaluated.
research_records = []

for _, row in top.iterrows():
    name =str(row['security id'])
    df = pd.read_csv(os.path.join(path,"gr" + name+".csv"))
    df = pre_process_data(df,60)
    column = "Next Day Close Price GR"
    df,column = dependent_column(df,column)
    reseach = {}
    # The [:1] slice limits the run to the first threshold only.
    for thresold in [0.01,0.02,0.03,0.04,0.05][:1]:
        result = SVM_classification(df,column=column,threshold=thresold)
        reseach.update(result)
        record = {"security id": name}
        record.update(result)
        research_records.append(record)
    # Stops after the first security; remove to process every row of `top`.
    break

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.274, total=   0.2s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.307, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.322, total=   0.2s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.250, total=   0.2s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=0.1, gamma=0.1, kernel=sigmoid, score=0.300, t

In [20]:
# Display the metrics dict from the most recent SVM_classification call
# (assigned to `result` inside the processing loop).
result

{'accuracy': 0.5957683741648107,
 'precision': 0.29435483870967744,
 'recall': 0.2796934865900383,
 'f1_score': 0.2868369351669941,
 'confusion matrix': array([[462, 175],
        [188,  73]], dtype=int64),
 'C': 0.1,
 'gamma': 0.1,
 'kernel': 'sigmoid',
 'threshold': 0.01}