In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier 
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier
import warnings            
warnings.filterwarnings("ignore")

In [2]:
def drop_none(data, col_name):
    """Remove, in place, every row whose ``col_name`` value equals the string 'NONE'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is modified in place.
    col_name : str
        Name of the column compared against the literal 'NONE'.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after the matching rows were dropped.
    """
    none_mask = (data[col_name] == 'NONE').to_numpy()
    labels_to_remove = data.index[none_mask]
    data.drop(index=labels_to_remove, inplace=True)
    return data
# data_result.drop(index = data_result[data_result['30_signal']=='NONE'].index, inplace = True)

In [3]:
def prepare_data(data):
    """Clean the raw frame and split it into three positional column groups.

    Every "?" cell is treated as missing and any row containing a missing
    value is dropped. The remaining rows are then sliced by column position.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, df_first, df_result)`` where ``df`` holds columns 5..429,
        ``df_first`` holds columns 1..4 and ``df_result`` holds columns from
        430 up to (but excluding) the last one, cast to ``category``.
    """
    # "?" marks a missing value; rows containing any of them are discarded.
    cleaned = data.replace("?", np.nan).dropna()
    # Positional split of the remaining columns.
    features = cleaned.iloc[:, 5:430]
    leading = cleaned.iloc[:, 1:5]
    results = cleaned.iloc[:, 430:-1].astype('category')
    return features, leading, results

In [4]:
def y_ortalama():
    """Return the row-wise most frequent value of the global ``df_result``.

    Reads the notebook-level ``df_result`` frame (it is not passed in as an
    argument), computes the mode along each row and keeps only the first
    mode column.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) holding the first row-wise mode.
    """
    row_modes = df_result.mode(axis=1)
    # Column 0 is the first mode of each row; further columns only exist on ties.
    return row_modes[0].to_frame()

In [5]:
# parametrik fonk. tanımlaması
# 1 : 
# 2 :

def kategorikleri_dummy_yap(df, cat_column_names=None):
    """Dummy-encode the categorical indicator columns and cast the rest to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing both categorical and numeric columns.
        It is not modified.
    cat_column_names : sequence of str, optional
        Columns to dummy-encode. When omitted, the indicator list that used
        to be hard-coded in this function is used, so existing calls behave
        as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_all, df_noncategoric, dms)``: the numeric columns cast to
        float64 concatenated with the dummies, the numeric columns alone,
        and the dummy columns alone.
    """
    if cat_column_names is None:
        cat_column_names = [
            'ind_7', 'ind_11', 'ind_24', 'ind_38', 'ind_54', 'ind_57', 'ind_60', 'ind_63', 'ind_66', 'ind_69',
            'ind_72', 'ind_75', 'ind_78', 'ind_81', 'ind_84', 'ind_87', 'ind_89', 'ind_91', 'ind_93', 'ind_95',
            'ind_97', 'ind_99', 'ind_101', 'ind_103', 'ind_105', 'ind_107', 'ind_109', 'ind_111', 'ind_113',
            'ind_115', 'ind_138', 'ind_141', 'ind_144', 'ind_157', 'ind_159', 'ind_161', 'ind_163', 'ind_165',
            'ind_167', 'ind_169', 'ind_171', 'ind_173', 'ind_175', 'ind_177', 'ind_182', 'ind_184', 'ind_187',
            'ind_190', 'ind_193', 'ind_196', 'ind_199', 'ind_202', 'ind_205', 'ind_208', 'ind_211', 'ind_213',
            'ind_384', 'ind_386', 'ind_388', 'ind_390',
        ]
    else:
        cat_column_names = list(cat_column_names)

    # Replace the categorical columns by dummy variables.
    dms = pd.get_dummies(df[cat_column_names])
    # Discard every dummy whose name contains '_NONE' or '_RED'
    # (presumably to leave one level out as the reference - TODO confirm).
    dms = dms.drop(columns=dms.filter(regex='_NONE|_RED').columns)

    # Everything that is not categorical is treated as numeric.
    df_noncategoric = df.drop(columns=cat_column_names).astype("float64")
    df_all = pd.concat([df_noncategoric, dms], axis=1)
    return df_all, df_noncategoric, dms

In [6]:
# 1.1 - dropping correlations
def corr_df(df, corr_val):
    """Drop, in place, columns that are highly correlated with an earlier column.

    The absolute correlation matrix of ``df`` is computed and only its strict
    upper triangle is inspected, so for every correlated pair the column that
    appears later in ``df`` is the one removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame. It is modified in place.
    corr_val : float
        Absolute-correlation threshold; a column is dropped when its
        correlation with any earlier column is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the correlated columns removed.
    """
    # Use the frame that was passed in, not a notebook-level global.
    corr_matrix = df.corr().abs()

    # Keep only the strict upper triangle (k=1 excludes the diagonal).
    # The builtin bool is used because the np.bool alias no longer exists in recent NumPy.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Columns whose correlation with some earlier column exceeds corr_val.
    to_high = [column for column in upper.columns if (upper[column] > corr_val).any()]
    df.drop(columns=to_high, inplace=True)
    return df

In [7]:
# 1.2 RandomForest
# bütün değişkenlerle yapılan random forest sonucu importance değeri verilen parametreden büyük olan değişkenleri döner
def rand_forest(X, y, imp_value):
    """Keep only the columns a random forest rates above an importance threshold.

    A default ``RandomForestClassifier`` is fitted on all of ``X``. Feature
    importances are expressed in percent and columns whose importance is
    strictly greater than ``imp_value`` are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : array-like
        Target labels.
    imp_value : float
        Importance threshold, in percent.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns, ordered by ascending importance.
    """
    forest = RandomForestClassifier().fit(X, y)
    # No random_state is set, so the selection can differ between runs.
    importance_pct = pd.DataFrame(
        {'Importance': forest.feature_importances_ * 100},
        index=X.columns,
    )
    ranked = importance_pct.sort_values(by='Importance', axis=0, ascending=True)
    selected = ranked.loc[ranked['Importance'] > imp_value].index
    return X[selected]

In [8]:
# 1.3 - pca
def pca_fon(X, threshold):
    """Project ``X`` onto its leading principal components.

    ``X`` is passed through ``scale`` and a full PCA is fitted. The number of
    components kept is one more than the count of components whose cumulative
    explained variance (rounded to 4 decimals, in percent) is still below
    ``threshold * 100``. That number is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    threshold : float
        Target share of explained variance, as a fraction (e.g. 0.99).

    Returns
    -------
    pandas.DataFrame
        The kept component scores, indexed like ``X``.
    """
    model = PCA()
    scores = model.fit_transform(scale(X))
    cumulative_pct = (np.round(model.explained_variance_ratio_, decimals=4) * 100).cumsum()
    # The "+ 1" includes the component at which the threshold is reached.
    n_keep = int(np.count_nonzero(cumulative_pct < threshold * 100)) + 1
    print('pca sonrası değişken sayısı: ', n_keep)
    return pd.DataFrame(scores[:, :n_keep], index=X.index)

In [9]:
def splitting(X, y, test_size):
    """Split ``X`` and ``y`` into train and test parts without shuffling.

    Because ``shuffle=False`` the row order is preserved and the test part
    is taken from the end of the data.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, shuffle=False)
    return tuple(parts)

In [10]:
# 2.1.1 - multi lojistik
def multi_logit(X_train, X_test, y_train, y_test):
    """Fit a multinomial logistic regression and print its test-set metrics.

    The model is trained on ``(X_train, y_train)``. Accuracy, the confusion
    matrix and the classification report for ``(X_test, y_test)`` are
    printed. Nothing is returned.
    """
    rule = '-------------------------------'
    classifier = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
    predicted = classifier.fit(X_train, y_train).predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [11]:
# 2.1.2 - decision tree
def dec_tree(X_train, X_test, y_train, y_test):
    """Fit a default decision tree and print its test-set metrics.

    The tree is trained on ``(X_train, y_train)``. Accuracy, the confusion
    matrix and the classification report for ``(X_test, y_test)`` are
    printed. Nothing is returned.
    """
    rule = '-------------------------------'
    fitted_tree = DecisionTreeClassifier().fit(X_train, y_train)
    predicted = fitted_tree.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [12]:
def grad_boost(X_train, X_test, y_train, y_test):
    """Fit a default ``GradientBoostingClassifier`` and print its test-set metrics.

    A header line is printed first, then the model is trained on
    ``(X_train, y_train)`` and accuracy, confusion matrix and classification
    report for ``(X_test, y_test)`` are printed. Nothing is returned.
    """
    # GradientBoostingClassifier comes from the notebook's import cell.
    rule = '-------------------------------'
    print('grad_boost----------------')
    booster = GradientBoostingClassifier().fit(X_train, y_train)
    predicted = booster.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [13]:
def xgb_boost(X_train, X_test, y_train, y_test):
    """Fit a default ``XGBClassifier`` and print its test-set metrics.

    A header line is printed first, then the model is trained on
    ``(X_train, y_train)`` and accuracy, confusion matrix and classification
    report for ``(X_test, y_test)`` are printed. Nothing is returned.
    """
    # XGBClassifier comes from the notebook's import cell.
    rule = '-------------------------------'
    print('xgb_boost----------------')
    booster = XGBClassifier().fit(X_train, y_train)
    predicted = booster.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [14]:
def lightGBM(X_train, X_test, y_train, y_test):
    """Fit an ``LGBMClassifier`` (``verbose=-1``) and print its test-set metrics.

    A header line is printed first, then the model is trained on
    ``(X_train, y_train)`` and accuracy, confusion matrix and classification
    report for ``(X_test, y_test)`` are printed. Nothing is returned.
    """
    # LGBMClassifier comes from the notebook's import cell.
    rule = '-------------------------------'
    print('lightGBM----------------')
    booster = LGBMClassifier(verbose=-1).fit(X_train, y_train)
    predicted = booster.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [15]:
def catBoost(X_train, X_test, y_train, y_test):
    """Fit a default ``CatBoostClassifier`` and print its test-set metrics.

    A header line is printed first, then the model is trained on
    ``(X_train, y_train)`` and accuracy, confusion matrix and classification
    report for ``(X_test, y_test)`` are printed. Nothing is returned.
    """
    # CatBoostClassifier comes from the notebook's import cell.
    rule = '-------------------------------'
    print('CatBoost----------------')
    booster = CatBoostClassifier().fit(X_train, y_train)
    predicted = booster.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [16]:
# 2.1.3 - Boosting
def boostings(X_train, X_test, y_train, y_test):
    """Run the enabled boosting evaluations on the same train/test split.

    Calls ``grad_boost`` and then ``xgb_boost``; each prints its own metrics.
    Nothing is returned.
    """
    # The lightGBM and catBoost evaluations are currently disabled.
    for evaluate in (grad_boost, xgb_boost):
        evaluate(X_train, X_test, y_train, y_test)

In [17]:
# 2.1.2 - decision tree
def dec_tree_withcv(X_train, X_test, y_train, y_test):
    """Fit a default decision tree and print its test-set metrics.

    NOTE(review): despite the name, no cross-validation is performed here;
    the body does the same work as ``dec_tree``.

    The tree is trained on ``(X_train, y_train)``. Accuracy, the confusion
    matrix and the classification report for ``(X_test, y_test)`` are
    printed. Nothing is returned.
    """
    rule = '-------------------------------'
    fitted_tree = DecisionTreeClassifier().fit(X_train, y_train)
    predicted = fitted_tree.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predicted)
    acc = accuracy_score(y_test, predicted)
    print("Accuracy: ", acc)
    print(rule)
    print("Counfusion matrix: \n", conf_matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [18]:
# Load the data and split it into column groups.
# df: the feature columns (everything except the leading columns and the result columns),
# df_first: the leading columns (prepare_data takes iloc[:, 1:5]), df_result: the result columns.
data = pd.read_csv('EurUsd.csv')
# Rows whose '80_signal' is 'NONE' are removed before cleaning.
data = drop_none(data, '80_signal')
df, df_first, df_result = prepare_data(data)
# Target used by every model below.
y = df_result['80_signal']

In [19]:
# Feature set 1.1: numeric columns with the highly correlated ones removed, plus dummies.
df_all, df_noncategoric, dms = kategorikleri_dummy_yap(df)
# corr_df drops columns in place, so df_noncategoric itself is reduced as well.
df_noncorr = corr_df(df_noncategoric, 0.50)
X1_1 = pd.concat([df_first, df_noncorr, dms], axis=1)
X1_1.shape  # columns left after dropping the correlated ones (206 in the recorded run)

(16872, 206)

In [20]:
# Feature set 1.2: columns selected by random-forest importance.
# Notes:
# 1 - iterating over different y columns could be tried; y is the '80_signal' column here.
# 2 - the importance threshold is set to 0.05; it could be tuned with cross-validation.
X_raw = pd.concat([df_first,df_all], axis=1) 
X1_2 = rand_forest(X_raw, y, 0.05)    
X1_2.shape # the forest is unseeded, so the column count varies between runs (268 in the recorded run)

(16872, 268)

In [21]:
# Feature set 1.3: principal components covering 99% of the explained variance.
# X_raw2 is built from the same inputs as X_raw in the previous cell.
X_raw2 = pd.concat([df_first,df_all], axis=1) 
X1_3 = pca_fon(X_raw2, 0.99)
X_raw2.shape, X1_3.shape

pca sonrası değişken sayısı:  189


((16872, 486), (16872, 189))

In [22]:
# Multinomial logistic regression on feature set 1.1 (unshuffled 70/30 split).
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.30)
print('X1_1 için multi log')
multi_logit(X_train1, X_test1, y_train, y_test)
print('*************************************************************************************')

X1_1 için multi log
Accuracy:  0.4685894903200316
-------------------------------
Counfusion matrix: 
 [[ 841 1345]
 [1345 1531]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.38      0.38      0.38      2186
        SELL       0.53      0.53      0.53      2876

    accuracy                           0.47      5062
   macro avg       0.46      0.46      0.46      5062
weighted avg       0.47      0.47      0.47      5062

*************************************************************************************


In [23]:
# Multinomial logistic regression on feature set 1.2 (unshuffled 70/30 split).
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.30)
print('X1_2 için multi log')
multi_logit(X_train2, X_test2, y_train, y_test)
print('*************************************************************************************')

X1_2 için multi log
Accuracy:  0.46483603318846306
-------------------------------
Counfusion matrix: 
 [[ 717 1469]
 [1240 1636]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.37      0.33      0.35      2186
        SELL       0.53      0.57      0.55      2876

    accuracy                           0.46      5062
   macro avg       0.45      0.45      0.45      5062
weighted avg       0.46      0.46      0.46      5062

*************************************************************************************


In [24]:
# Multinomial logistic regression on feature set 1.3 (unshuffled 70/30 split).
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.30)
print('X1_3 için multi log')
multi_logit(X_train3, X_test3, y_train, y_test)
print('*************************************************************************************')

X1_3 için multi log
Accuracy:  0.46286052943500594
-------------------------------
Counfusion matrix: 
 [[1429  757]
 [1962  914]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.42      0.65      0.51      2186
        SELL       0.55      0.32      0.40      2876

    accuracy                           0.46      5062
   macro avg       0.48      0.49      0.46      5062
weighted avg       0.49      0.46      0.45      5062

*************************************************************************************


In [25]:
# Default decision tree on feature set 1.1 (unshuffled 70/30 split).
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.30)
print('X1_1 için dec tree')
dec_tree(X_train1, X_test1, y_train, y_test)
print('*************************************************************************************')

X1_1 için dec tree
Accuracy:  0.5517581983405768
-------------------------------
Counfusion matrix: 
 [[1488  698]
 [1571 1305]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.49      0.68      0.57      2186
        SELL       0.65      0.45      0.53      2876

    accuracy                           0.55      5062
   macro avg       0.57      0.57      0.55      5062
weighted avg       0.58      0.55      0.55      5062

*************************************************************************************


In [26]:
# Default decision tree on feature set 1.2 (unshuffled 70/30 split).
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.30)
print('X1_2 için dec tree')
dec_tree(X_train2, X_test2, y_train, y_test)
print('*************************************************************************************')

X1_2 için dec tree
Accuracy:  0.5539312524693797
-------------------------------
Counfusion matrix: 
 [[1110 1076]
 [1182 1694]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.48      0.51      0.50      2186
        SELL       0.61      0.59      0.60      2876

    accuracy                           0.55      5062
   macro avg       0.55      0.55      0.55      5062
weighted avg       0.56      0.55      0.56      5062

*************************************************************************************


In [27]:
# Default decision tree on feature set 1.3 (unshuffled 70/30 split).
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.30)
print('X1_3 için dec tree')
dec_tree(X_train3, X_test3, y_train, y_test)
print('*************************************************************************************')

#for i in y.columns:
    #print(i, ' kolonu için sonuçlar:')
    #dec_tree(X_train3, X_test3, y_train[i], y_test[i])
    #print('*************************************************************************************')3

X1_3 için dec tree
Accuracy:  0.5005926511260371
-------------------------------
Counfusion matrix: 
 [[1083 1103]
 [1425 1451]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.43      0.50      0.46      2186
        SELL       0.57      0.50      0.53      2876

    accuracy                           0.50      5062
   macro avg       0.50      0.50      0.50      5062
weighted avg       0.51      0.50      0.50      5062

*************************************************************************************


# yol haritası

1. verilerin sadeleştirilmesi 

    1.1 correlatedları atarak non correlated ları bul 
 
    1.2 RandomForest'dan important değişkenleri bul 
 
    1.3 pca  
 
 
2. algoritmalar 

    2.1 algoritmaları fonk. olarak yaz
 
        2.1.1 loj reg
  
        2.1.2 decision tree
  
        2.1.3 boosting
      
    2.2 cross validations
 
    2.3 1'de bulduğun verilerle bütün algoritmaları çalıştır, sonuçları kıyasla 
 
 Notlar:
 - ilk 5 sütun correlation a koyulmadı. bunların da koyulması gerekir mi?

#### Decision Tree

#### Non-correlated X'ler için

In [29]:
# Baseline decision tree on feature set 1.1.
# cart1 is kept as a notebook-level name because the grid search in the next cell reuses it.
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.30)
cart1 = DecisionTreeClassifier()
cart_model1 = cart1.fit(X_train1, y_train)
y_pred = cart_model1.predict(X_test1)
confusion_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ",accuracy)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

Accuracy:  0.5242986961675227
-------------------------------
Counfusion matrix: 
 [[1804  382]
 [2026  850]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.47      0.83      0.60      2186
        SELL       0.69      0.30      0.41      2876

    accuracy                           0.52      5062
   macro avg       0.58      0.56      0.51      5062
weighted avg       0.60      0.52      0.49      5062



In [30]:
# Grid search (10-fold CV on the training part) over tree depth and minimum split size.
cart_grid1 = {"max_depth":[50,100,200,300], "min_samples_split":[100,150,200,300]}
cart_cv1 = GridSearchCV(cart1, cart_grid1, cv=10, n_jobs =-1, verbose = 2)
cart_cv_model1 = cart_cv1.fit(X_train1, y_train)
# Prints "best parameters: ..." (the label is in Turkish).
print('En iyi parametreler : ' + str(cart_cv_model1.best_params_))

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    9.9s finished


En iyi parametreler : {'max_depth': 50, 'min_samples_split': 200}


In [40]:
# Tuned decision tree on feature set 1.1.
# The hyper-parameters are hard-coded copies of the best parameters printed by the
# grid search in the recorded run ({'max_depth': 50, 'min_samples_split': 200});
# they are not read from cart_cv_model1, so update them if the search is re-run.
cart_1 = DecisionTreeClassifier(max_depth =50 , min_samples_split=200 )
cart_tuned1 = cart_1.fit(X_train1, y_train)
y_pred = cart_tuned1.predict(X_test1)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print(classification_report(y_test, y_pred))

accuracy :  0.5655867246147768
-------------------------------
Counfusion matrix: 
 [[1584  602]
 [1597 1279]]
              precision    recall  f1-score   support

         BUY       0.50      0.72      0.59      2186
        SELL       0.68      0.44      0.54      2876

    accuracy                           0.57      5062
   macro avg       0.59      0.58      0.56      5062
weighted avg       0.60      0.57      0.56      5062



#### Random Forest ile bulduğumuz importance X'ler için

In [32]:
# Baseline decision tree on feature set 1.2.
# cart2 is kept as a notebook-level name because the grid search in the next cell reuses it.
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.30)
cart2 = DecisionTreeClassifier()
cart_model2 = cart2.fit(X_train2, y_train)
y_pred = cart_model2.predict(X_test2)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1073 1113]
 [1238 1638]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.46      0.49      0.48      2186
        SELL       0.60      0.57      0.58      2876

    accuracy                           0.54      5062
   macro avg       0.53      0.53      0.53      5062
weighted avg       0.54      0.54      0.54      5062



In [33]:
# Grid search (10-fold CV on the training part) over tree depth and minimum split size.
cart_grid2 = {"max_depth":[50,100,200,300], "min_samples_split":[100,150,200,300]}
cart_cv2 = GridSearchCV(cart2, cart_grid2, cv=10, n_jobs =-1, verbose = 2)
cart_cv_model2 = cart_cv2.fit(X_train2, y_train)
# Prints "best parameters: ..." (the label is in Turkish).
print('En iyi parametreler : ' + str(cart_cv_model2.best_params_))

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   39.9s finished


En iyi parametreler : {'max_depth': 100, 'min_samples_split': 300}


In [39]:
# Tuned decision tree on feature set 1.2.
# The hyper-parameters are hard-coded copies of the best parameters printed by the
# grid search in the recorded run ({'max_depth': 100, 'min_samples_split': 300}).
cart_2 = DecisionTreeClassifier(max_depth = 100, min_samples_split=300)
cart_tuned_2 = cart_2.fit(X_train2, y_train)
y_pred = cart_tuned_2.predict(X_test2)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1091 1095]
 [1283 1593]]
              precision    recall  f1-score   support

         BUY       0.46      0.50      0.48      2186
        SELL       0.59      0.55      0.57      2876

    accuracy                           0.53      5062
   macro avg       0.53      0.53      0.53      5062
weighted avg       0.54      0.53      0.53      5062



#### PCA 

In [35]:
# Baseline decision tree on feature set 1.3 (PCA components).
# cart3 is kept as a notebook-level name because the grid search in the next cell reuses it.
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.30)
cart3 = DecisionTreeClassifier()
cart_model3 = cart3.fit(X_train3, y_train)
y_pred = cart_model3.predict(X_test3)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1072 1114]
 [1430 1446]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.43      0.49      0.46      2186
        SELL       0.56      0.50      0.53      2876

    accuracy                           0.50      5062
   macro avg       0.50      0.50      0.49      5062
weighted avg       0.51      0.50      0.50      5062



In [36]:
# Grid search (10-fold CV on the training part) over tree depth and minimum split size.
cart_grid3 = {"max_depth":[50,100,200,300], "min_samples_split":[100,150,200,300]}
cart_cv3 = GridSearchCV(cart3, cart_grid3, cv=10, n_jobs =-1, verbose = 2)
cart_cv_model3 = cart_cv3.fit(X_train3, y_train)
# Prints "best parameters: ..." (the label is in Turkish).
print('En iyi parametreler : ' + str(cart_cv_model3.best_params_))

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   36.4s finished


En iyi parametreler : {'max_depth': 50, 'min_samples_split': 200}


In [38]:
# Tuned decision tree on feature set 1.3.
# The hyper-parameters are hard-coded copies of the best parameters printed by the
# grid search in the recorded run ({'max_depth': 50, 'min_samples_split': 200}).
cart_3 = DecisionTreeClassifier(max_depth = 50, min_samples_split=200)
cart_tuned_3 = cart_3.fit(X_train3, y_train)
y_pred = cart_tuned_3.predict(X_test3)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1154 1032]
 [1534 1342]]
-------------------------------
              precision    recall  f1-score   support

         BUY       0.43      0.53      0.47      2186
        SELL       0.57      0.47      0.51      2876

    accuracy                           0.49      5062
   macro avg       0.50      0.50      0.49      5062
weighted avg       0.51      0.49      0.49      5062



#### Random Forest

#### Non-correlated X'ler ile

In [41]:
# Baseline random forest on feature set 1.1.
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.30)
# This import repeats the one in the notebook's import cell.
from sklearn.ensemble import RandomForestClassifier
rf_model1 = RandomForestClassifier().fit(X_train1, y_train)
y_pred = rf_model1.predict(X_test1)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1938  248]
 [2431  445]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.44      0.89      0.59      2186
        SELL       0.64      0.15      0.25      2876

    accuracy                           0.47      5062
   macro avg       0.54      0.52      0.42      5062
weighted avg       0.56      0.47      0.40      5062



In [42]:
# Grid search (10-fold CV on the training part) for the random forest on feature set 1.1.
rf_params1 = {"max_depth":[5,7,8,10],
             "min_samples_split":[100,150,200,300],
             "max_features": [2,5,8], 
             "n_estimators": [10,500,1000]}
# Rebinds rf_model1 to a fresh, unfitted estimator; the fitted baseline model is no longer reachable by this name.
rf_model1 = RandomForestClassifier()
rf_cv_model1 = GridSearchCV(rf_model1,
                            rf_params1,
                            cv=10,
                            n_jobs=-1,
                            verbose=5)
rf_cv_model1.fit(X_train1, y_train)
# Prints "best parameters: ..." (the label is in Turkish).
print("En iyi parametreler: "+str(rf_cv_model1.best_params_))

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  7.0min


En iyi parametreler: {'max_depth': 7, 'max_features': 2, 'min_samples_split': 150, 'n_estimators': 10}


[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  9.5min finished


In [43]:
# Tuned random forest on feature set 1.1.
# The hyper-parameters are hard-coded copies of the best parameters printed by the grid search
# in the recorded run ({'max_depth': 7, 'max_features': 2, 'min_samples_split': 150, 'n_estimators': 10}).
rf_tuned_1 = RandomForestClassifier(max_depth=7, max_features=2, min_samples_split=150, n_estimators=10)
rf_tuned_1.fit(X_train1, y_train)
y_pred = rf_tuned_1.predict(X_test1)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1834  352]
 [2277  599]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.45      0.84      0.58      2186
        SELL       0.63      0.21      0.31      2876

    accuracy                           0.48      5062
   macro avg       0.54      0.52      0.45      5062
weighted avg       0.55      0.48      0.43      5062



#### Random Forest ile bulduğumuz importance X'ler için

In [44]:
# Baseline random forest on feature set 1.2.
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.30)
# This import repeats the one in the notebook's import cell.
from sklearn.ensemble import RandomForestClassifier
rf_model2 = RandomForestClassifier().fit(X_train2, y_train)
y_pred = rf_model2.predict(X_test2)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1892  294]
 [2441  435]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.44      0.87      0.58      2186
        SELL       0.60      0.15      0.24      2876

    accuracy                           0.46      5062
   macro avg       0.52      0.51      0.41      5062
weighted avg       0.53      0.46      0.39      5062



In [45]:
# Grid search (10-fold CV on the training part) for the random forest on feature set 1.2.
rf_params2 = {"max_depth":[5,7,8,10],
             "min_samples_split":[100,150,200,300],
             "max_features": [2,5,8], 
             "n_estimators": [10,500,1000]}
# Rebinds rf_model2 to a fresh, unfitted estimator; the fitted baseline model is no longer reachable by this name.
rf_model2 = RandomForestClassifier()
rf_cv_model2 = GridSearchCV(rf_model2,
                            rf_params2,
                            cv=10,
                            n_jobs=-1,
                            verbose=5)
rf_cv_model2.fit(X_train2, y_train)
# Prints "best parameters: ..." (the label is in Turkish).
print("En iyi parametreler: "+str(rf_cv_model2.best_params_))

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 17.8min


En iyi parametreler: {'max_depth': 5, 'max_features': 2, 'min_samples_split': 300, 'n_estimators': 10}


[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 25.0min finished


In [46]:
# Tuned random forest on feature set 1.2.
# The hyper-parameters are hard-coded copies of the best parameters printed by the grid search
# in the recorded run ({'max_depth': 5, 'max_features': 2, 'min_samples_split': 300, 'n_estimators': 10}).
rf_tuned_2 = RandomForestClassifier(max_depth=5, max_features=2, min_samples_split=300, n_estimators=10)
rf_tuned_2.fit(X_train2, y_train)
y_pred = rf_tuned_2.predict(X_test2)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1388  798]
 [2016  860]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.41      0.63      0.50      2186
        SELL       0.52      0.30      0.38      2876

    accuracy                           0.44      5062
   macro avg       0.46      0.47      0.44      5062
weighted avg       0.47      0.44      0.43      5062



#### PCA

In [47]:
# Baseline random forest on feature set 1.3 (PCA components).
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.30)
# This import repeats the one in the notebook's import cell.
from sklearn.ensemble import RandomForestClassifier
rf_model3 = RandomForestClassifier().fit(X_train3, y_train)
y_pred = rf_model3.predict(X_test3)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[1297  889]
 [1773 1103]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.42      0.59      0.49      2186
        SELL       0.55      0.38      0.45      2876

    accuracy                           0.47      5062
   macro avg       0.49      0.49      0.47      5062
weighted avg       0.50      0.47      0.47      5062



In [48]:
# Grid search (10-fold CV on the training part) for the random forest on feature set 1.3.
rf_params3 = {"max_depth":[5,7,8,10],
             "min_samples_split":[100,150,200,300],
             "max_features": [2,5,8], 
             "n_estimators": [10,500,1000]}
# Rebinds rf_model3 to a fresh, unfitted estimator; the fitted baseline model is no longer reachable by this name.
rf_model3 = RandomForestClassifier()
rf_cv_model3 = GridSearchCV(rf_model3,
                            rf_params3,
                            cv=10,
                            n_jobs=-1,
                            verbose=5)
rf_cv_model3.fit(X_train3, y_train)
# Prints "best parameters: ..." (the label is in Turkish).
print("En iyi parametreler: "+str(rf_cv_model3.best_params_))

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 28.7min


En iyi parametreler: {'max_depth': 5, 'max_features': 5, 'min_samples_split': 100, 'n_estimators': 10}


[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 39.7min finished


In [49]:
# Tuned random forest on feature set 1.3.
# The hyper-parameters are hard-coded copies of the best parameters printed by the grid search
# in the recorded run ({'max_depth': 5, 'max_features': 5, 'min_samples_split': 100, 'n_estimators': 10}).
rf_tuned_3 = RandomForestClassifier(max_depth=5, max_features=5, min_samples_split=100, n_estimators=10)
rf_tuned_3.fit(X_train3, y_train)
y_pred = rf_tuned_3.predict(X_test3)
confusion_mat = confusion_matrix(y_test, y_pred)
print('-------------------------------')
print("Counfusion matrix: \n",confusion_mat)
print('-------------------------------')
print('Classification report')
print(classification_report(y_test, y_pred))

-------------------------------
Counfusion matrix: 
 [[ 673 1513]
 [1039 1837]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.39      0.31      0.35      2186
        SELL       0.55      0.64      0.59      2876

    accuracy                           0.50      5062
   macro avg       0.47      0.47      0.47      5062
weighted avg       0.48      0.50      0.48      5062

