# KNN

### Part 2 - 80_20

##### Find the best model and compare the results of each model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('yuanta_vector.csv')
data

Unnamed: 0,資產出,總營,滿手金融,深市,鴻智原,沖心得,詮彩晶,益巴菲特,面像,聚愛,...,解惑請,粉紅,趨勢以上,腦正,爆真心,股息宣告,重設,溜滑,label,post_time
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2022-03-09
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2022-03-09
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2022-03-09
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2022-03-09
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2022-09-03
8588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2022-09-03
8589,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2022-09-03
8590,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2023-11-18


In [4]:
# label 不為 0 
nonzero_indices = data['label'] != 0
data_nonzero = data[nonzero_indices]

X = data_nonzero.drop(['label', 'post_time'], axis=1) 
y = data_nonzero['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# pip install imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


# parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform'],
    # 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [3, 5, 10, 20, 30],
    'p': [1,2]  
}

resampling_strategies = ['normal','over', 'under']

thresholds = [0.6, 0.8, 0.9]

models_list = []
# Iterate over parameter combinations, resampling strategies, and different thresholds
for n_neighbors in param_grid['n_neighbors']:
    for weights in param_grid['weights']:
        for leaf_size in param_grid['leaf_size']:
            for p in param_grid['p']:
                for strategy in resampling_strategies:
                    for threshold in thresholds:
                        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, leaf_size=leaf_size, p=p)
                        # Define resampling technique
                        if strategy == 'over':
                            sampler = RandomOverSampler()
                        elif strategy == 'under':
                            sampler = RandomUnderSampler() 
                        else:
                            # No resampling
                            X_resampled, y_resampled = X_train, y_train

                        # Apply resampling if applicable
                        if strategy in ['over', 'under']:
                            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)     
                                              
                        # Current parameter values
                        print("Model Parameters:")
                        print("n_neighbors:", n_neighbors)
                        print("weights:", weights)
                        print("leaf_size:", leaf_size)
                        print("p:", p)
                        print("resampling strategy:", strategy)
                        print("threshold:", threshold)
                        print()
                        
                        model.fit(X_resampled, y_resampled)
                        y_pred = model.predict(X_test)
        
                        KNN_accuracy = accuracy_score(y_test, y_pred)
                        print('KNN Accuracy: ', KNN_accuracy)
                        print()
                        
                        # Predicted label statistics
                        y_pred_series = pd.Series(y_pred)
                        value_counts = y_pred_series.value_counts()
                        print('Predicted label statistics: ')
                        print('# 1:', value_counts.get(1, 0))
                        print('# 0:', value_counts.get(0, 0))
                        print('# -1:', value_counts.get(-1, 0))
                        print()
                            
                        # Add predicted labels to DataFrame
                        y_pred_df = pd.DataFrame(y_pred, columns=['predicted_label'], index=X_test.index)
                        y_test_df = pd.DataFrame(y_test, columns=['label'], index=X_test.index)
                        y_pred_df['post_time'] = data.loc[X_test.index, 'post_time']
                        result_df = pd.concat([y_pred_df, y_test_df], axis=1)
                        # print('Result DataFrame:')
                        # print(result_df)
                            
                        # Calculate predicted label counts
                        predicted_label_count = result_df.groupby('post_time')['predicted_label'].value_counts().unstack(fill_value=0)
                        # print('Predicted label counts:')
                        # print(predicted_label_count)                
                            
                        # Calculate actual label counts
                        actual_label_count = data.groupby('post_time')['label'].value_counts().unstack(fill_value=0)
                        merged_counts = pd.concat([predicted_label_count, actual_label_count], axis=1, keys=['Predicted', 'Actual']).dropna()
                        # print('Merged label counts:')
                        # print(merged_counts)
                            
                        # Calculate predicted and actual label expectations
                        predicted_expectation = (merged_counts['Predicted'][1] - merged_counts['Predicted'][-1]) / merged_counts['Predicted'].sum(axis=1)
                        actual_expectation = (merged_counts['Actual'][1] - merged_counts['Actual'][-1]) / merged_counts['Actual'].sum(axis=1)
                            
                        # Convert expectations to rise/fall labels
                        predicted_movement = pd.Series(['Rise' if exp > threshold else 'Fall' if exp < -threshold else 'X' for exp in predicted_expectation], index=merged_counts.index)
                        actual_movement = pd.Series(['Rise' if exp > threshold else 'Fall' if exp < -threshold else 'X' for exp in actual_expectation], index=merged_counts.index)
                        
                        # Merge predicted labels and t+1 actual labels
                        merged_movements = pd.concat([predicted_movement, actual_movement], axis=1, keys=['predicted_movement', 'actual_movement'])
                        merged_movements = merged_movements.dropna()
                            
                        # Calculate elements of confusion matrix
                        predicted_rise_actual_rise = ((predicted_movement == 'Rise') & (actual_movement == 'Rise')).sum()
                        predicted_rise_actual_fall = ((predicted_movement == 'Rise') & (actual_movement == 'Fall')).sum()
                        predicted_fall_actual_rise = ((predicted_movement == 'Fall') & (actual_movement == 'Rise')).sum()
                        predicted_fall_actual_fall = ((predicted_movement == 'Fall') & (actual_movement == 'Fall')).sum()

                        # Put results into DataFrame
                        conf_matrix_df = pd.DataFrame({
                                'Predicted Rise': [predicted_rise_actual_rise, predicted_fall_actual_rise],
                                'Predicted Fall': [predicted_rise_actual_fall, predicted_fall_actual_fall]
                            }, index=['Actual Rise', 'Actual Fall'])

                        # Print results
                        print("Confusion Matrix: ")
                        print(conf_matrix_df)
                        print()
                            
                        # Calculate accuracy
                        accuracy = (predicted_rise_actual_rise + predicted_fall_actual_fall) / (predicted_rise_actual_rise + predicted_rise_actual_fall + predicted_fall_actual_rise + predicted_fall_actual_fall)

                        # Print results
                        print("Accuracy: ", accuracy)
                            

                        # Calculate trade rate
                        true_total_trades = (merged_movements['actual_movement'] != 'X').sum()
                        total_trades = (merged_movements['predicted_movement'] != 'X').sum()
                        total_days = len(merged_movements)
                        true_trade_rate = true_total_trades / total_days
                        trade_rate = total_trades / total_days
                        # print("True Active", true_trade_rate)
                        # print("Active: ", trade_rate)
                            
                        models_list.append({
                                "n_neighbors": n_neighbors,
                                "weights": weights,
                                "leaf_size": leaf_size,
                                "p": p,
                                "resampling_strategy": strategy,
                                "threshhold": threshold,
                                "Accuracy": accuracy,
                                "Active": trade_rate,
                                "True Active": true_trade_rate
                            })
                            
                        if accuracy > 0.6 and trade_rate > 0.4:
                            print(" !!!! Good model !!!! ")
                                
                        print("==================================================")


Model Parameters:
n_neighbors: 3
weights: uniform
leaf_size: 3
p: 1
resampling strategy: normal
threshold: 0.6

KNN Accuracy:  0.5233798195242002

Predicted label statistics: 
# 1: 284
# 0: 0
# -1: 935

Confusion Matrix: 
             Predicted Rise  Predicted Fall
Actual Rise              14               6
Actual Fall              71              79

Accuracy:  0.5470588235294118
Model Parameters:
n_neighbors: 3
weights: uniform
leaf_size: 3
p: 1
resampling strategy: normal
threshold: 0.8

KNN Accuracy:  0.5233798195242002

Predicted label statistics: 
# 1: 284
# 0: 0
# -1: 935

Confusion Matrix: 
             Predicted Rise  Predicted Fall
Actual Rise              12               6
Actual Fall              62              69

Accuracy:  0.5436241610738255
Model Parameters:
n_neighbors: 3
weights: uniform
leaf_size: 3
p: 1
resampling strategy: normal
threshold: 0.9

KNN Accuracy:  0.5233798195242002

Predicted label statistics: 
# 1: 284
# 0: 0
# -1: 935

Confusion Matrix: 
        

#### KNN Results

In [26]:
models_df = pd.DataFrame(models_list)

columns_to_print = ['n_neighbors', 'weights', 'leaf_size','p','resampling_strategy','threshhold','Accuracy']
models_df[columns_to_print]

Unnamed: 0,n_neighbors,weights,leaf_size,p,resampling_strategy,threshhold,Accuracy
0,3,uniform,3,1,normal,0.6,0.547059
1,3,uniform,3,1,normal,0.8,0.543624
2,3,uniform,3,1,normal,0.9,0.537931
3,3,uniform,3,1,over,0.6,0.544379
4,3,uniform,3,1,over,0.8,0.537415
...,...,...,...,...,...,...,...
445,11,uniform,30,2,over,0.8,0.483444
446,11,uniform,30,2,over,0.9,0.463087
447,11,uniform,30,2,under,0.6,0.514793
448,11,uniform,30,2,under,0.8,0.526316


In [8]:
models_df.to_csv('KNN_results.csv', index=False) 

#### Good Models

In [25]:
good_models_df = models_df[(models_df["Accuracy"] > 0.54) & (models_df["Active"] > 0.4)]
sorted_good_models_df = good_models_df.sort_values(by = 'Accuracy', ascending=False)

columns_to_print = ['n_neighbors', 'weights', 'leaf_size','p','resampling_strategy','threshhold','Accuracy']
sorted_good_models_df[columns_to_print]

Unnamed: 0,n_neighbors,weights,leaf_size,p,resampling_strategy,threshhold,Accuracy
394,11,uniform,5,2,under,0.8,0.576389
304,9,uniform,5,2,under,0.8,0.571429
123,5,uniform,5,2,under,0.6,0.570423
57,3,uniform,20,1,over,0.6,0.562500
213,7,uniform,5,2,under,0.6,0.561404
...,...,...,...,...,...,...,...
55,3,uniform,20,1,normal,0.8,0.543624
330,9,uniform,20,1,under,0.6,0.542553
14,3,uniform,3,2,over,0.9,0.541667
260,7,uniform,30,1,under,0.9,0.541667


-------------------------------------------