In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the resume dataset from its CSV file into the DataFrame 'df',
# which the analysis steps below operate on.
# NOTE(review): the path is an absolute, machine-specific Windows location.
dataset_path = 'C:\\Users\\SACHIN.R\\OneDrive\\Desktop\\resume_data.csv'
df = pd.read_csv(dataset_path)

# A1: Evaluate intraclass spread and interclass distances
# NOTE(review): 'Age' and 'Years Of Experience' are used here as values of the
# 'Category' column; they read like column names, so confirm they are real
# class labels. If a label matches no rows, the statistics below are NaN.
class1_data = df[df['Category'] == 'Age']
class2_data = df[df['Category'] == 'Years Of Experience']

# Means, standard deviations and Euclidean norms are only defined for numeric
# features, so drop the label and keep the numeric columns once per class
# (the same numeric-only selection that A4 applies before scaling).
class1_features = class1_data.drop('Category', axis=1).select_dtypes(include=np.number)
class2_features = class2_data.drop('Category', axis=1).select_dtypes(include=np.number)

# Class centroids: per-feature mean of each class.
mean_class1 = class1_features.mean(axis=0)
mean_class2 = class2_features.mean(axis=0)

# Intraclass spread: per-feature population standard deviation
# (ddof=0, which is what np.std computes).
std_class1 = class1_features.std(axis=0, ddof=0)
std_class2 = class2_features.std(axis=0, ddof=0)

# Interclass distance: Euclidean distance between the two centroids.
distance_between_means = np.linalg.norm(mean_class1 - mean_class2)

# A2: Density pattern for a feature using histogram
feature_to_plot = 'Age'
feature_values = df[feature_to_plot]

# Ten-bin frequency histogram of the selected column.
plt.hist(feature_values, bins=10, color='blue', edgecolor='black')
plt.title('Histogram for ' + feature_to_plot)
plt.ylabel('Frequency')
plt.xlabel(feature_to_plot)
plt.show()

# Summary statistics of the same column: mean and population variance
# (ddof=0, matching what np.var computes).
mean_feature = feature_values.mean()
variance_feature = feature_values.var(ddof=0)

# A3: Minkowski distance with varying r from 1 to 10
# Compare the first two rows of the dataset. Only numeric columns are kept and
# cast to float: a row that still carries text columns cannot be subtracted
# from another, and a norm is only meaningful for numeric features.
numeric_rows = df.drop('Category', axis=1).select_dtypes(include=np.number)
feature_vector1 = numeric_rows.iloc[0].astype(float)
feature_vector2 = numeric_rows.iloc[1].astype(float)

# The difference vector does not depend on r, so compute it once.
difference = (feature_vector1 - feature_vector2).to_numpy()

r_values = range(1, 11)
# For a 1-D vector the norm of order r is (sum |x_i|**r) ** (1/r),
# i.e. the Minkowski distance of order r between the two rows.
minkowski_distances = [np.linalg.norm(difference, ord=r) for r in r_values]

plt.plot(r_values, minkowski_distances, marker='o')
plt.xlabel('r values')
plt.ylabel('Minkowski Distance')
plt.title('Minkowski Distance vs r')
plt.show()

# A4: Train-test split
# Features are every column except the label; the label is 'Category'.
X = df.drop('Category', axis=1)
y = df['Category']

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric derived from it, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Select only numeric columns for scaling
numeric_columns = X_train.select_dtypes(include=np.number).columns

# Scale the features: the scaler is fitted on the training split only and the
# same transformation is then applied to the test split, so no test-set
# statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

# A5: Train kNN classifier (k = 3)
# Train on the standardized numeric features built in A4. kNN is
# distance-based, so unscaled features with large ranges would dominate the
# neighbour search, and the raw frame may still contain non-numeric columns
# that the classifier cannot consume.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_scaled, y_train)

# A6: Test accuracy
# Evaluate on the test split transformed with the same scaler.
accuracy = neigh.score(X_test_scaled, y_test)

# A7: Predict using test set
predictions = neigh.predict(X_test_scaled)

# A8: Compare kNN (k = 3) with NN (k = 1) by varying k
# Fit one classifier per k in 1..11 and record its test accuracy.
k_values = range(1, 12)
accuracy_scores = []

for k in k_values:
    neigh_k = KNeighborsClassifier(n_neighbors=k)
    # Use the standardized numeric features from A4 for both fitting and
    # scoring, so the distance computation is not dominated by large-range
    # columns and no non-numeric column reaches the classifier.
    neigh_k.fit(X_train_scaled, y_train)
    accuracy_k = neigh_k.score(X_test_scaled, y_test)
    accuracy_scores.append(accuracy_k)

# Accuracy as a function of the neighbourhood size.
plt.plot(k_values, accuracy_scores, marker='o')
plt.xlabel('k values')
plt.ylabel('Accuracy')
plt.title('Accuracy vs k for kNN')
plt.show()

# A9: Evaluate confusion matrix and other performance metrics
# Confusion matrix of true test labels against the predicted labels.
conf_matrix = confusion_matrix(y_test, predictions)

# Precision, recall and F1, each computed with average='weighted'.
precision, recall, f1 = (
    scorer(y_test, predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report: heading and matrix on separate lines, then a blank line,
# then one line per metric.
print('Confusion Matrix:', conf_matrix, sep='\n')
print()
for label, value in (
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(label, value)
