In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the pre-processed CSV files. The *_DT pair feeds the Decision Tree
# section of this notebook, the *_KNN pair feeds the KNN section.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ("TRAIN_PREPROCESED_DT.csv", "TEST_PREPROCESED_DT.csv")
)

df_train_KNN, df_test_KNN = (
    pd.read_csv(csv_name)
    for csv_name in ("TRAIN_PREPROCESED_KNN.csv", "TEST_PREPROCESED_KNN.csv")
)

# Outlier Detection

In [4]:
# Label column; every other column of df_train is treated as a feature.
target_name = 'emotional_intensity'

attributes = df_train.columns[df_train.columns != target_name].tolist()

In [5]:
# Separate the feature columns from the label, for the train and the test frame.
X_train, y_train = df_train[attributes], df_train[target_name]
X_test, y_test = df_test[attributes], df_test[target_name]

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training data as a validation split.
# The split is taken straight from df_train rather than from the current
# X_train / y_train: the original form fed its own output back in, so running
# the cell a second time split the already reduced training set again.
# With the fixed random_state the first run yields the same partition as before.
X_train, X_val, y_train, y_val = train_test_split(
    df_train[attributes], df_train[target_name], test_size=0.25, random_state=42
)

## Decision Tree

### Checking performance of classification Before outlier removal

In [8]:
# Checking performance of classification Before outlier removal
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Define the hyperparameters to search over
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5]
}

# Create a decision tree classifier object.
# random_state is pinned (same seed as the train/validation split) because
# scikit-learn permutes the features at every split, so an unseeded tree can
# differ between runs when several splits are equally good.
clf = DecisionTreeClassifier(random_state=42)

# Create a grid search object (5-fold cross-validation on the training split)
grid_search = GridSearchCV(clf, param_grid, cv=5)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Use the best estimator to make predictions on the validation split
# (X_val / y_val); the test set is not touched here.
y_pred = grid_search.best_estimator_.predict(X_val)

# Print the evaluation metrics
print('Decision Tree Classifier Performance BEFORE removing outliers:')
print('Accuracy %s' % accuracy_score(y_val, y_pred))
print('F1-score %s' % f1_score(y_val, y_pred, average=None))
print(classification_report(y_val, y_pred))

Decision Tree Classifier Performance BEFORE removing outliers:
Accuracy 0.7308533916849015
F1-score [0.73202614 0.72967033]
              precision    recall  f1-score   support

           0       0.79      0.69      0.73       245
           1       0.68      0.78      0.73       212

    accuracy                           0.73       457
   macro avg       0.73      0.73      0.73       457
weighted avg       0.74      0.73      0.73       457



### Finding the best K and T

In [9]:
# Finding the best K and T
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# define a range of values for k and T
k_values = [5, 10, 15, 20]
T_values = [2, 5, 8, 10]

# initialize variables to store the best hyperparameters and performance
max_f1_score = 0
max_accuracy = 0
best_k = None
best_T = None

# Plain float array of the training features: row arithmetic on a NumPy array
# avoids the pandas alignment overhead inside the distance loop.
X_train_values = np.asarray(X_train, dtype=float)
n = X_train_values.shape[0] # number of vertices in graph

# iterate over all possible combinations of k and T
for k in k_values:

    # The kNN graph (and therefore the in-degree) depends only on k, so it is
    # built once per k instead of once per (k, T) pair.
    graph = np.zeros((n, n))
    for i in range(n):
        distances = np.linalg.norm(X_train_values - X_train_values[i], axis=1)
        # Exclude the point itself explicitly. Dropping "the first sorted index"
        # is only correct when no other row is at distance 0; with duplicate
        # rows the point could end up as its own neighbour.
        distances[i] = np.inf
        neighbors = np.argsort(distances)[:min(k, n - 1)]
        graph[i, neighbors] = 1 # add directed edges from i to its neighbours
    in_degree = np.sum(graph, axis=0)

    for T in T_values:

        # detect outliers based on in-degree threshold
        outliers = np.where(in_degree <= T)[0]
        inliers = np.where(in_degree > T)[0]

        # Nothing left to train on for this threshold: skip the combination.
        if inliers.size == 0:
            continue

        # remove the outliers from the training set
        X_train_filtered = X_train.iloc[inliers]
        y_train_filtered = y_train.iloc[inliers]

        # train a decision tree classifier
        dtc = DecisionTreeClassifier(random_state=42)
        dtc.fit(X_train_filtered, y_train_filtered)

        # make predictions on the validation set
        y_pred = dtc.predict(X_val)

        # evaluate the performance of the decision tree classifier
        f1 = f1_score(y_val, y_pred, average='weighted')
        acc = accuracy_score(y_val, y_pred)

        # Rank configurations by weighted F1 and break ties on accuracy.
        # Requiring BOTH metrics to improve at once would discard a
        # configuration with a higher F1 but an equal or lower accuracy.
        if (f1, acc) > (max_f1_score, max_accuracy):
            max_f1_score = f1
            max_accuracy = acc
            best_k = k
            best_T = T

print('Best k =', best_k, ', Best T =', best_T)
print('Accuracy:', max_accuracy)
print('F1 Score:', max_f1_score)

Best k = 20 , Best T = 5
Accuracy: 0.7067307692307693
F1 Score: 0.7071046377075165


In [17]:
# Using the found values for K=20 and T=5
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# print the shape of the dataset before removing outliers
print('Shape of X_train before removing outliers:', X_train.shape)
print('Shape of y_train before removing outliers:', y_train.shape)

# calculate the in-degree of each vertex in the kNN graph
k = 20 # number of neighbors for kNN graph
T = 5 # in-degree threshold for outlier detection

# Plain float array of the features: keeps the per-row distance computation
# in NumPy instead of going through pandas alignment on every iteration.
X_train_values = np.asarray(X_train, dtype=float)
n = X_train_values.shape[0] # number of vertices in graph
graph = np.zeros((n, n))
for i in range(n):
    distances = np.linalg.norm(X_train_values - X_train_values[i], axis=1)
    # Exclude the point itself explicitly; relying on it being the first
    # sorted index breaks when another row lies at distance 0 (duplicates).
    distances[i] = np.inf
    neighbors = np.argsort(distances)[:min(k, n - 1)]
    graph[i, neighbors] = 1 # add directed edges from i to its neighbours
in_degree = np.sum(graph, axis=0)

# detect outliers based on in-degree threshold:
# a point that at most T other points list among their k nearest neighbours
# is treated as an outlier.
outliers = np.where(in_degree <= T)[0]
inliers = np.where(in_degree > T)[0]

# remove the outliers from the training set
X_train_rem = X_train.iloc[inliers]
y_train_rem = y_train.iloc[inliers]

# train a decision tree classifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train_rem, y_train_rem)

Shape of X_train before removing outliers: (1371, 386)
Shape of X_val before removing outliers: (457, 386)


DecisionTreeClassifier(random_state=42)

In [26]:
# print the shape of the training set after removing outliers
print('Shape of X_train after removing outliers:', X_train_rem.shape)
print('Shape of y_train after removing outliers:', y_train_rem.shape)

Shape of X_train after removing outliers: (1279, 386)
Shape of y_train after removing outliers: (1279,)


### Checking performance of classification after outlier removal

In [16]:
# Checking performance of classification after outlier removal
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Define the hyperparameters to search over
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5]
}

# Create a decision tree classifier object.
# random_state is pinned (same seed as the train/validation split) because
# scikit-learn permutes the features at every split, so an unseeded tree can
# differ between runs; without it the before/after comparison is noisy.
clf = DecisionTreeClassifier(random_state=42)

# Create a grid search object (5-fold cross-validation)
grid_search = GridSearchCV(clf, param_grid, cv=5)

# Fit the grid search object to the outlier-filtered training data
grid_search.fit(X_train_rem, y_train_rem)

# Use the best estimator to make predictions on the validation split
# (X_val / y_val); the validation data itself is not filtered.
y_pred = grid_search.best_estimator_.predict(X_val)

# Print the evaluation metrics
print('Decision Tree Classifier Performance AFTER removing outliers:')
print('Accuracy %s' % accuracy_score(y_val, y_pred))
print('F1-score %s' % f1_score(y_val, y_pred, average=None))
print(classification_report(y_val, y_pred))

Decision Tree Classifier Performance AFTER removing outliers:
Accuracy 0.7133479212253829
F1-score [0.77452668 0.60660661]
              precision    recall  f1-score   support

           0       0.67      0.92      0.77       245
           1       0.83      0.48      0.61       212

    accuracy                           0.71       457
   macro avg       0.75      0.70      0.69       457
weighted avg       0.75      0.71      0.70       457



In [None]:
# Decision Tree Classifier Performance AFTER removing outliers: with X_val
# Accuracy 0.7155361050328227
# F1-score [0.77430556 0.61538462]
#               precision    recall  f1-score   support

#          0.0       0.67      0.91      0.77       245
#          1.0       0.83      0.49      0.62       212

#     accuracy                           0.72       457
#    macro avg       0.75      0.70      0.69       457
# weighted avg       0.74      0.72      0.70       457

In [None]:
# Decision Tree Classifier Performance AFTER removing outliers: with X_test
# Accuracy 0.5384615384615384
# F1-score [0.7 0. ]
#               precision    recall  f1-score   support

#            0       0.54      1.00      0.70       336
#            1       0.00      0.00      0.00       288

#     accuracy                           0.54       624
#    macro avg       0.27      0.50      0.35       624
# weighted avg       0.29      0.54      0.38       624


## KNN

In [19]:
from sklearn.model_selection import train_test_split

# Label column and feature columns for the KNN-specific frames.
target_name = 'emotional_intensity'
attributes = df_train_KNN.columns[df_train_KNN.columns != target_name].tolist()

# Features / label for the training and the test frame.
X_train_KNN, y_train_KNN = df_train_KNN[attributes], df_train_KNN[target_name]
X_test_KNN, y_test_KNN = df_test_KNN[attributes], df_test_KNN[target_name]

# Hold out 25% of the training rows as a validation split (fixed seed).
X_train_KNN, X_val_KNN, y_train_KNN, y_val_KNN = train_test_split(
    X_train_KNN, y_train_KNN, test_size=0.25, random_state=42
)

### Checking performance of classification Before outlier removal

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Hyperparameter grid explored by the cross-validated search
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# Base k-NN estimator handed to the grid search
clf = KNeighborsClassifier()

# 5-fold grid search, fitted on the (unfiltered) KNN training split;
# fit() returns the search object itself.
grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X_train_KNN, y_train_KNN)

# Predict the validation split with the best estimator found by the search
y_pred_KNN = grid_search.best_estimator_.predict(X_val_KNN)

# Report accuracy, per-class F1 and the full classification report
print('k-NN Classifier Performance BEFORE removing outliers:')
print('Accuracy %s' % accuracy_score(y_val_KNN, y_pred_KNN))
print('F1-score %s' % f1_score(y_val_KNN, y_pred_KNN, average=None))
print(classification_report(y_val_KNN, y_pred_KNN))


k-NN Classifier Performance BEFORE removing outliers:
Accuracy 0.7636761487964989
F1-score [0.78740157 0.73399015]
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       245
           1       0.77      0.70      0.73       212

    accuracy                           0.76       457
   macro avg       0.76      0.76      0.76       457
weighted avg       0.76      0.76      0.76       457



### Finding the best K and T

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# define a range of values for k and T
k_values = [5, 10, 15, 20]
T_values = [2, 5, 8, 10]

# initialize variables to store the best hyperparameters and performance
max_f1_score_KNN = 0
max_accuracy_KNN = 0
best_k_KNN = None
best_T_KNN = None

# Plain float array of the training features: row arithmetic on a NumPy array
# avoids the pandas alignment overhead inside the distance loop.
X_train_values_KNN = np.asarray(X_train_KNN, dtype=float)
n = X_train_values_KNN.shape[0] # number of vertices in graph

# iterate over all possible combinations of k and T
for k in k_values:

    # The kNN graph (and therefore the in-degree) depends only on k, so it is
    # built once per k instead of once per (k, T) pair.
    graph = np.zeros((n, n))
    for i in range(n):
        distances = np.linalg.norm(X_train_values_KNN - X_train_values_KNN[i], axis=1)
        # Exclude the point itself explicitly. Dropping "the first sorted index"
        # is only correct when no other row is at distance 0; with duplicate
        # rows the point could end up as its own neighbour.
        distances[i] = np.inf
        neighbors = np.argsort(distances)[:min(k, n - 1)]
        graph[i, neighbors] = 1 # add directed edges from i to its neighbours
    in_degree = np.sum(graph, axis=0)

    for T in T_values:

        # detect outliers based on in-degree threshold
        outliers = np.where(in_degree <= T)[0]
        inliers = np.where(in_degree > T)[0]

        # Nothing left to train on for this threshold: skip the combination.
        if inliers.size == 0:
            continue

        # remove the outliers from the training set
        X_train_filtered_KNN = X_train_KNN.iloc[inliers]
        y_train_filtered_KNN = y_train_KNN.iloc[inliers]

        # train a KNN classifier
        # NOTE(review): the graph's k is reused as the classifier's n_neighbors,
        # so the search tunes both at once - confirm this coupling is intended.
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train_filtered_KNN, y_train_filtered_KNN)

        # make predictions on the validation set
        y_pred_KNN = knn.predict(X_val_KNN)

        # evaluate the performance of the KNN classifier
        f1_KNN = f1_score(y_val_KNN, y_pred_KNN, average='weighted')
        acc_KNN = accuracy_score(y_val_KNN, y_pred_KNN)

        # Rank configurations by weighted F1 and break ties on accuracy.
        # Requiring BOTH metrics to improve at once would discard a
        # configuration with a higher F1 but an equal or lower accuracy.
        if (f1_KNN, acc_KNN) > (max_f1_score_KNN, max_accuracy_KNN):
            max_f1_score_KNN = f1_KNN
            max_accuracy_KNN = acc_KNN
            best_k_KNN = k
            best_T_KNN = T

print('Best k_KNN =', best_k_KNN, ', Best T_KNN =', best_T_KNN)
print('Accuracy_KNN:', max_accuracy_KNN)
print('F1 Score_KNN:', max_f1_score_KNN)

Best k_KNN = 15 , Best T_KNN = 5
Accuracy_KNN: 0.7549234135667396
F1 Score_KNN: 0.7488443120637964


In [24]:
# Using the found values for K=15 and T=5
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# print the shape of the dataset before removing outliers
print('Shape of X_train before removing outliers:', X_train_KNN.shape)
print('Shape of y_train before removing outliers:', y_train_KNN.shape)

# calculate the in-degree of each vertex in the kNN graph
k = 15 # number of neighbors for kNN graph
T = 5 # in-degree threshold for outlier detection

# Plain float array of the features: keeps the per-row distance computation
# in NumPy instead of going through pandas alignment on every iteration.
X_train_values_KNN = np.asarray(X_train_KNN, dtype=float)
n = X_train_values_KNN.shape[0] # number of vertices in graph
graph = np.zeros((n, n))
for i in range(n):
    distances = np.linalg.norm(X_train_values_KNN - X_train_values_KNN[i], axis=1)
    # Exclude the point itself explicitly; relying on it being the first
    # sorted index breaks when another row lies at distance 0 (duplicates).
    distances[i] = np.inf
    neighbors = np.argsort(distances)[:min(k, n - 1)]
    graph[i, neighbors] = 1 # add directed edges from i to its neighbours
in_degree = np.sum(graph, axis=0)

# detect outliers based on in-degree threshold:
# a point that at most T other points list among their k nearest neighbours
# is treated as an outlier.
outliers = np.where(in_degree <= T)[0]
inliers = np.where(in_degree > T)[0]

# remove the outliers from the training set
X_train_rem_KNN = X_train_KNN.iloc[inliers]
y_train_rem_KNN = y_train_KNN.iloc[inliers]


Shape of X_train before removing outliers: (1371, 50)
Shape of y_train before removing outliers: (1371,)


In [23]:
# print the shape of the KNN training set after removing outliers
print('Shape of X_train after removing outliers:', X_train_rem_KNN.shape)
print('Shape of y_train after removing outliers:', y_train_rem_KNN.shape)

Shape of X_train after removing outliers: (1205, 50)
Shape of y_train after removing outliers: (1205,)


### Checking performance of classification after outlier removal using KNN

In [25]:
### KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Define the hyperparameters to search over
# NOTE(review): this grid is wider than the one used before outlier removal
# (extra n_neighbors=11 and the 'algorithm' axis), so the before/after numbers
# are not tuned over the same search space - confirm this is intended.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# Create a KNN classifier object
clf = KNeighborsClassifier()

# Create a grid search object (5-fold cross-validation)
grid_search = GridSearchCV(clf, param_grid, cv=5)

# Fit the grid search object to the outlier-filtered training data
grid_search.fit(X_train_rem_KNN, y_train_rem_KNN)

# Use the best estimator to make predictions on the validation split
# (X_val_KNN / y_val_KNN); the validation data itself is not filtered.
y_pred_KNN = grid_search.best_estimator_.predict(X_val_KNN)

# Print the evaluation metrics.
# The header names the k-NN model: it previously said "Decision Tree", which
# mislabelled these results in the notebook output.
print('k-NN Classifier Performance AFTER removing outliers:')
print('Accuracy %s' % accuracy_score(y_val_KNN, y_pred_KNN))
print('F1-score %s' % f1_score(y_val_KNN, y_pred_KNN, average=None))
print(classification_report(y_val_KNN, y_pred_KNN))

Decision Tree Classifier Performance AFTER removing outliers:
Accuracy 0.7636761487964989
F1-score [0.79545455 0.72020725]
              precision    recall  f1-score   support

           0       0.74      0.86      0.80       245
           1       0.80      0.66      0.72       212

    accuracy                           0.76       457
   macro avg       0.77      0.76      0.76       457
weighted avg       0.77      0.76      0.76       457

