TASK 1

In [32]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv(r'/content/datatraining.txt')
df_test = pd.read_csv(r'/content/datatest.txt')

# print(df_train.columns)
# print(df_test.columns)

features = ['Humidity','Light','HumidityRatio']

X_train = df_train[features]
y_train = df_train['Occupancy']

X_test = df_test[features]
y_test = df_test['Occupancy']

accuracies = []

for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"K={k}, Accuracy={accuracy:.4f}")

best_accuracy = max(accuracies)
best_k = accuracies.index(best_accuracy) + 1

print("\nBest Accuracy:", round(best_accuracy, 4), "at K =", best_k)


K=1, Accuracy=0.9403
K=2, Accuracy=0.9253
K=3, Accuracy=0.9572
K=4, Accuracy=0.9448
K=5, Accuracy=0.9553
K=6, Accuracy=0.9527
K=7, Accuracy=0.9632
K=8, Accuracy=0.9614
K=9, Accuracy=0.9647
K=10, Accuracy=0.9647

Best Accuracy: 0.9647 at K = 9


TASK 2


In [34]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score

def chi_squared_distance(x1, x2, epsilon=1e-10):
    return np.sum((x1 - x2) ** 2 / (x1 + x2 + epsilon))

class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict_one(self, x):
        distances = [chi_squared_distance(x_train, x) for x_train in self.X_train]
        k_idx = np.argsort(distances)[:self.k]
        k_labels = [self.y_train[i] for i in k_idx]
        most_common = Counter(k_labels).most_common(1)
        return most_common[0][0]

    def predict(self, X):
        return [self.predict_one(x) for x in X]

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = KNNClassifier(k=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)


Accuracy: 0.9667
Confusion Matrix:
[[10  0  0]
 [ 0 10  0]
 [ 0  1  9]]


TASK 3

In [22]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split

df = pd.read_csv(r'/content/cancer patient data sets.csv')

df.columns

Index(['index', 'Patient Id', 'Age', 'Gender', 'Air Pollution', 'Alcohol use',
       'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk',
       'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking',
       'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue',
       'Weight Loss', 'Shortness of Breath', 'Wheezing',
       'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold',
       'Dry Cough', 'Snoring', 'Level'],
      dtype='object')

In [23]:
print(df.describe)
print(df['Level'].value_counts())

<bound method NDFrame.describe of      index Patient Id  Age  Gender  Air Pollution  Alcohol use  Dust Allergy  \
0        0         P1   33       1              2            4             5   
1        1        P10   17       1              3            1             5   
2        2       P100   35       1              4            5             6   
3        3      P1000   37       1              7            7             7   
4        4       P101   46       1              6            8             7   
..     ...        ...  ...     ...            ...          ...           ...   
995    995       P995   44       1              6            7             7   
996    996       P996   37       2              6            8             7   
997    997       P997   25       2              4            5             6   
998    998       P998   18       2              6            8             7   
999    999       P999   47       1              6            5             6   

     

In [24]:
print("\nMissing values:\n",df.isnull().sum())
print("\nCategorical features:",df.select_dtypes(include=['object','category']).columns.tolist())
print("\nNumber of duplicates:", df.duplicated().sum())


Missing values:
 index                       0
Patient Id                  0
Age                         0
Gender                      0
Air Pollution               0
Alcohol use                 0
Dust Allergy                0
OccuPational Hazards        0
Genetic Risk                0
chronic Lung Disease        0
Balanced Diet               0
Obesity                     0
Smoking                     0
Passive Smoker              0
Chest Pain                  0
Coughing of Blood           0
Fatigue                     0
Weight Loss                 0
Shortness of Breath         0
Wheezing                    0
Swallowing Difficulty       0
Clubbing of Finger Nails    0
Frequent Cold               0
Dry Cough                   0
Snoring                     0
Level                       0
dtype: int64

Categorical features: ['Patient Id', 'Level']

Number of duplicates: 0


In [25]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

df_encoded = df.copy()

df_encoded = df_encoded.drop(columns=['index', 'Patient Id'])

for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

corr_matrix = df_encoded.corr(method='pearson')

print("\nCorrelation with target (Level):")
print(corr_matrix['Level'].sort_values(ascending=False))

selected_features = corr_matrix['Level'].drop('Level').abs()
selected_features = selected_features[selected_features > 0.2].index.tolist()
print("\nSelected features:", selected_features)

X = df_encoded[selected_features]
y = df_encoded['Level']


Correlation with target (Level):
Level                       1.000000
Wheezing                    0.167773
Clubbing of Finger Nails    0.116746
Gender                      0.086222
Age                         0.042631
Snoring                     0.014280
Swallowing Difficulty      -0.012880
Weight Loss                -0.020537
Shortness of Breath        -0.140178
Frequent Cold              -0.171678
Dry Cough                  -0.228720
Dust Allergy               -0.264926
OccuPational Hazards       -0.363748
Fatigue                    -0.403276
Genetic Risk               -0.423382
chronic Lung Disease       -0.432405
Alcohol use                -0.434071
Chest Pain                 -0.494704
Obesity                    -0.561961
Air Pollution              -0.577269
Smoking                    -0.611087
Balanced Diet              -0.618781
Coughing of Blood          -0.631118
Passive Smoker             -0.638409
Name: Level, dtype: float64

Selected features: ['Air Pollution', 'Alcohol use

In [26]:
scaler = StandardScaler()

random_state = 0
X_train_val,X_test,y_train_val,y_test = train_test_split(X,y,test_size=0.2,random_state=random_state, stratify=y)


X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.30, random_state=random_state, stratify=y_train_val
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)



Train: (560, 14) Val: (240, 14) Test: (200, 14)


In [27]:
metrics = [
    ('euclidean', {'metric':'minkowski','p':2}),
    ('manhattan', {'metric':'minkowski','p':1}),
    ('chebyshev', {'metric':'chebyshev'})
]
results = []

for name, params in metrics:
    if params['metric'] == 'minkowski':
        knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=params['p'])
    else:
        knn = KNeighborsClassifier(n_neighbors=5, metric=params['metric'])

    knn.fit(X_train_s, y_train)

    acc_train = accuracy_score(y_train, knn.predict(X_train_s))
    acc_val = accuracy_score(y_val, knn.predict(X_val_s))
    acc_test = accuracy_score(y_test, knn.predict(X_test_s))

    cm_test = confusion_matrix(y_test, knn.predict(X_test_s))

    results.append({
        'Metric': name,
        'Train Acc': acc_train,
        'Val Acc': acc_val,
        'Test Acc': acc_test,
        'Confusion Matrix': cm_test
    })

In [28]:
res_df = pd.DataFrame(results).drop(columns=['Confusion Matrix'])
print("\nAccuracy Comparison:")
print(res_df)

for r in results:
    print(f"\nConfusion Matrix ({r['Metric']}):\n", r['Confusion Matrix'])

best = max(results, key=lambda x: x['Val Acc'])
print(f"\nBest metric by validation accuracy: {best['Metric']}")

if best['Metric'] == 'euclidean':
    best_knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
elif best['Metric'] == 'manhattan':
    best_knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1)
else:
    best_knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')

best_knn.fit(X_train_s, y_train)
y_test_pred = best_knn.predict(X_test_s)

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))


Accuracy Comparison:
      Metric  Train Acc  Val Acc  Test Acc
0  euclidean        1.0      1.0       1.0
1  manhattan        1.0      1.0       1.0
2  chebyshev        1.0      1.0       1.0

Confusion Matrix (euclidean):
 [[73  0  0]
 [ 0 61  0]
 [ 0  0 66]]

Confusion Matrix (manhattan):
 [[73  0  0]
 [ 0 61  0]
 [ 0  0 66]]

Confusion Matrix (chebyshev):
 [[73  0  0]
 [ 0 61  0]
 [ 0  0 66]]

Best metric by validation accuracy: euclidean

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        73
           1       1.00      1.00      1.00        61
           2       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

