**Installing libraries**

In [None]:
pip install -r requirements.txt

**Importing libraries**

In [None]:
import pandas as pd
import numpy as np

#Modelling
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import randint

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

**Import the CSV file in Python and display the data**

In [None]:
# Load the semicolon-delimited dataset and preview its first five rows.
data = pd.read_csv('Data/data.csv', delimiter=';')
data.head(n=5)

**Display target values and their counts**

In [None]:
# Number of rows for each distinct label in the 'Target' column.
data.loc[:, 'Target'].value_counts()

**Divide the data into inputs and output**

In [None]:
# Pull out the label column, then keep every other column as model input.
target = data['Target']
inputs = data.drop(columns=['Target'])

**Split the data into training and testing data**

In [None]:
# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both splits, and the fixed seed makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=365, stratify=target
)

**Encoding target labels**

In [None]:
# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both splits.
enc_t = LabelEncoder().fit(y_train)
y_train, y_test = enc_t.transform(y_train), enc_t.transform(y_test)

# Show how many training rows fall into each encoded class.
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Original class label -> integer code assigned by the fitted encoder.
integer_mapping = dict(zip(enc_t.classes_, range(len(enc_t.classes_))))
integer_mapping

In [None]:

# Ask which model to run. Later cells compare this value with exact string
# equality, so surrounding whitespace is stripped and unknown names are flagged early.
SelectedClassifier = input("Select your classifier, KNN-Baseline or RandomForest").strip()
if SelectedClassifier not in ("KNN-Baseline", "RandomForest"):
    print("Warning: unrecognised classifier; expected 'KNN-Baseline' or 'RandomForest'")
print(SelectedClassifier)

**Using the test data to predict the labels. This will be used to find the performance of the model**

In [None]:
def run_classifier(x_train, x_test, y_train, y_test, SelectedClassifier):
    """Fit the selected baseline model and evaluate it on the test split.

    Prints the micro-averaged precision, recall and F1 score followed by the
    summary banner, and returns the per-class report so the caller can
    display or store it.

    Parameters
    ----------
    x_train, y_train
        Features and encoded labels used to fit the model.
    x_test, y_test
        Features and encoded labels used to score the model.
    SelectedClassifier : str
        ``"KNN-Baseline"`` fits ``KNeighborsClassifier(n_neighbors=5)``;
        ``"RandomForest"`` fits
        ``RandomForestClassifier(random_state=365, ccp_alpha=0.001)``.

    Returns
    -------
    str or None
        The text produced by ``classification_report(y_test, y_pred)``, or
        ``None`` (after printing a notice) when ``SelectedClassifier`` is not
        one of the two supported names.
    """
    if SelectedClassifier == "KNN-Baseline":
        clf = KNeighborsClassifier(n_neighbors=5)
    elif SelectedClassifier == "RandomForest":
        clf = RandomForestClassifier(random_state=365, ccp_alpha=0.001)
    else:
        print("No Classifier Selected")
        return None

    # Both models share the same fit/predict interface.
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # average="micro" already yields a single aggregated score per metric.
    micro_precision = precision_score(y_test, y_pred, average="micro")
    micro_recall = recall_score(y_test, y_pred, average="micro")
    micro_f1 = f1_score(y_test, y_pred, average="micro")

    print("Micro-average Precision: {:.2f}".format(micro_precision))
    print("Micro-average Recall: {:.2f}".format(micro_recall))
    print("Micro-average F1-Score: {:.2f}".format(micro_f1))
    print("======================================")
    print("=======CLASSIFICATION SUMMARY=========")
    # Hand the report back instead of printing it here: the caller prints the
    # return value, which previously was always None.
    return classification_report(y_test, y_pred)

In [None]:
# Oversample the training split only (the test split is left untouched).
# The sampling strategy can be changed as required.
# NOTE(review): `n_jobs` is deprecated in recent imbalanced-learn releases —
# confirm against the version pinned in requirements.txt.
sm = SMOTE(sampling_strategy='auto', k_neighbors=20, n_jobs=4, random_state=365)

x_resampled, y_resampled = sm.fit_resample(x_train, y_train)

# Class distribution after resampling (the original referenced an undefined
# `y_resampled_enc` here, which raised a NameError).
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))

classificationReport = run_classifier(x_resampled, x_test, y_resampled, y_test, SelectedClassifier=SelectedClassifier)
# Only print when a report was actually returned, to avoid a stray "None".
if classificationReport is not None:
    print(classificationReport)

**Hyperparameter Tuning**

In [None]:
def run_hyperparameters(x_train, y_train, selected_classifier=None):
    """Run a 5-fold randomized hyperparameter search for the selected model.

    Parameters
    ----------
    x_train, y_train
        Features and encoded labels the search is fitted on.
    selected_classifier : str, optional
        ``"KNN-Baseline"`` or ``"RandomForest"``. When omitted, the
        notebook-level ``SelectedClassifier`` variable is used, which keeps
        existing two-argument calls working.

    Returns
    -------
    RandomizedSearchCV or None
        The fitted search object, or ``None`` (after printing a notice) when
        the classifier name is not supported.
    """
    if selected_classifier is None:
        selected_classifier = SelectedClassifier

    if selected_classifier == "KNN-Baseline":
        estimator = KNeighborsClassifier()
        param_distributions = {
            # scipy's randint excludes the upper bound, so this always draws 36.
            'n_neighbors': randint(36, 37),  # Number of neighbors to consider
            'p': [1, 2],  # 1 for Manhattan distance (L1), 2 for Euclidean distance (L2)
            'weights': ['uniform', 'distance'],  # Weighting strategy
        }
    elif selected_classifier == "RandomForest":
        # Seeded so that repeated runs build the same forests.
        estimator = RandomForestClassifier(random_state=365)
        param_distributions = {
            'n_estimators': randint(50, 500),
            'max_depth': randint(1, 20),
        }
    else:
        print("No Classifier Selected")
        return None

    # Seed the search as well so the sampled candidates are reproducible,
    # matching the random_state=365 used elsewhere in the notebook.
    rand_search = RandomizedSearchCV(estimator,
                                     param_distributions=param_distributions,
                                     n_iter=5,
                                     cv=5,
                                     random_state=365)

    # Fit the random search object to the data
    rand_search.fit(x_train, y_train)
    return rand_search


In [None]:
# Tune on the SMOTE-resampled training data.
rand_search = run_hyperparameters(x_train=x_resampled, y_train=y_resampled)

**Classification report for different parameters**

In [None]:
# Keep a handle on the best model found by the search. The name says "rf",
# but it holds whichever estimator type was tuned.
best_rf = rand_search.best_estimator_
print('Best parameters set found on development set: ')
print(rand_search.best_params_)
print()

# Mean cross-validation score (+/- two standard deviations) per sampled candidate.
means = rand_search.cv_results_['mean_test_score']
stds = rand_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rand_search.cv_results_['params']):
    print('%0.3f (+/-%0.03f) for %r' % (mean, std * 2, params))
print()

print('Detailed classification report:')
print()
print('The model is trained on the full development set.')
print('The scores are computed on the full evaluation set.')
print()
y_pred = rand_search.predict(x_test)

# average="micro" already returns one aggregated scalar per metric, so no
# further averaging is needed (calling .mean() on it fails for a plain float).
micro_precision = precision_score(y_test, y_pred, average="micro")
micro_recall = recall_score(y_test, y_pred, average="micro")
micro_f1 = f1_score(y_test, y_pred, average="micro")

print("Micro-average Precision: {:.2f}".format(micro_precision))
print("Micro-average Recall: {:.2f}".format(micro_recall))
print("Micro-average F1-Score: {:.2f}".format(micro_f1))
print("======================================")
print("=======CLASSIFICATION SUMMARY=========")
print(classification_report(y_test, y_pred))

**Finding important features in the dataset**

In [None]:
if SelectedClassifier == "KNN-Baseline":
    # The search already refitted the best estimator, so it is used as-is
    # rather than refitted here.
    best_knn = rand_search.best_estimator_
    # KNN has no built-in feature importances. Permutation importance measures
    # how much the test score drops when one column at a time is shuffled,
    # giving exactly one value per feature. (Averaging kneighbors() distances
    # over samples yields one value per neighbour rank, not per feature.)
    perm_result = permutation_importance(best_knn, x_test, y_test,
                                         n_repeats=10, random_state=365)
    feature_relevance = perm_result.importances_mean

    # Create a Series to visualize the feature relevance
    feature_relevance_series = pd.Series(feature_relevance, index=x_test.columns).sort_values(ascending=False)

    # Plot a bar chart for feature relevance
    plt.figure(figsize=(10, 6))
    feature_relevance_series.plot.bar()
    plt.title('Feature Relevance in KNN')
    plt.show()
elif SelectedClassifier == "RandomForest":
    # Impurity-based importances from the tuned forest, labelled with the
    # training feature names.
    feature_importances = pd.Series(rand_search.best_estimator_.feature_importances_,
                                    index=x_train.columns).sort_values(ascending=False)
    # Plot a simple bar chart
    plt.figure(figsize=(10, 6))
    feature_importances.plot.bar()
    plt.title('Feature Importances in Random Forest')
    plt.show()
else:
    print("No Classifier selected")

# Draw one distribution plot for every column of the raw dataset.
for column_name in data.columns:
    sns.displot(data=data, x=column_name, height=4)
    