<a id='train-test-scratch'></a>
<hr/>

## Assignments

### 1. train test split from scratch

Create a function `my_train_test_split()` that takes the inputs X and y and the fraction of data to use for training, and outputs a list or tuple containing the splits.

In [None]:
# Clue 1: Splitting the data sequentially for a given fraction.
# Each row of `data` is one sample; the last column is the label and the
# remaining columns are the features.

data = np.array([[1, 2, 0], [3, 4, 1], [5, 6, 1], [7, 8, 0], [9, 10, 1], [11, 12, 0]])
print('data:')
print(data)

# Train part of the data: the first 80% of the rows (int() truncates, so
# 0.8 * 6 rows -> 4 rows).
split = int(0.8*data.shape[0])
X_train = data[:split, :-1]
y_train = data[:split, -1]

print('\nX_train:')
print(X_train)
print('\ny_train:')
print(y_train)

# The test part of the data: every row from `split` onwards.
X_test = data[split:, :-1]
y_test = data[split:, -1]

# The heading must name the array that is printed below it (X_test).
print('\nX_test:')
print(X_test)

print('\ny_test:')
print(y_test)

In [None]:
# Clue 2: splitting data randomly by shuffling row indices instead of rows.
data = np.array([[1, 2, 0], [3, 4, 1], [5, 6, 1], [7, 8, 0]])
num_samples = data.shape[0]
# Drawing num_samples values out of range(num_samples) without replacement
# yields every row index exactly once, in random order.
ind = np.random.choice(num_samples, num_samples, replace = False)
print(ind)
print(type(ind))
# int() truncates, so 0.8 * 4 rows -> 3 training rows.
split = int(0.8*num_samples)
print(split)
# Indices selected for the training part. As a bare expression this has no
# effect in a plain script; presumably it is meant to be shown as the
# notebook cell's output.
ind[:split]

In [None]:
import itertools
import random

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

In [None]:
## TODO: Your function definition goes here
def my_train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split samples and labels into train and test parts.

    Parameters
    ----------
    X : array-like
        Samples; the first axis indexes the samples.
    y : array-like
        Labels, one per sample (same length as ``X``).
    test_size : float, default 0.2
        Fraction of the samples (between 0 and 1) placed in the test part.
    random_state : int or None, default None
        Seed for a reproducible shuffle. When given, NumPy's global random
        generator is seeded with it; ``None`` leaves the generator untouched.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)`` -- note the order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    # Seed NumPy's generator -- the one that actually drives the shuffle.
    # `is not None` keeps 0 usable as a seed.
    if random_state is not None:
        np.random.seed(random_state)

    # Shuffle row indices rather than (sample, label) pairs: fancy indexing
    # keeps X and y aligned and also copes with an empty train or test part.
    order = np.random.permutation(n_samples)

    split = int((1 - test_size) * n_samples)
    train_idx, test_idx = order[:split], order[split:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [None]:
## TODO: Your function invocation goes here
# Load the iris dataset: `data` holds the feature matrix, `target` the labels.
iris = load_iris()
X = iris.data
y = iris.target


# my_train_test_split returns (X_train, X_test, y_train, y_test); the names
# must be unpacked in that order or features and labels get swapped.
X_train, X_test, y_train, y_test = my_train_test_split(X, y, 0.2, random_state=42)
print(X_train)
print(y_train)
print(X_test)
print(y_test)

<a id='knn-scratch'></a>
<hr/>

### 2. kNN from scratch

In [None]:
# KNN class that allows setting the number of neighbours and weight=uniform or distance
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    weights : {"uniform", "distance"}, default "uniform"
        ``"uniform"`` gives every neighbour one vote; ``"distance"`` weighs
        each vote by the inverse of the neighbour's distance.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer or ``weights`` is not recognised.
    """

    def __init__(self, k=3, weights="uniform"):
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if weights not in ("uniform", "distance"):
            raise ValueError(
                f"weights must be 'uniform' or 'distance', got {weights!r}"
            )
        self.k = k
        self.weights = weights

    def fit(self, X_train, y_train):
        """Store the training samples and labels used by ``predict``.

        Raises ``ValueError`` for an empty training set or when the number of
        samples and labels differ.
        """
        features = np.asarray(X_train, dtype=float)
        labels = np.asarray(y_train)
        if features.size == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(features) != len(labels):
            raise ValueError(
                "X_train and y_train must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        # One flat row per sample, so a single vectorised norm per query
        # covers scalar, vector and multi-dimensional samples alike.
        self.X_train = features.reshape(len(features), -1)
        self.y_train = labels

    def predict(self, X_test):
        """Return an array with the predicted label of every sample in ``X_test``."""
        predictions = []
        for sample in np.asarray(X_test, dtype=float):
            distances = np.linalg.norm(self.X_train - sample.ravel(), axis=1)
            # A stable sort keeps equidistant neighbours in training order,
            # which makes the neighbour choice deterministic.
            k_indices = np.argsort(distances, kind="stable")[:self.k]

            if self.weights == "distance":
                nearest = distances[k_indices]
                exact = nearest == 0
                # An exact match would get an infinite weight; let the exact
                # matches alone decide instead.
                vote_weights = exact.astype(float) if exact.any() else 1.0 / nearest
            else:
                vote_weights = np.ones(len(k_indices))

            # The dict remembers insertion order (closest neighbour first) and
            # max() returns the first maximum, so a tied vote goes to the
            # label of the closest neighbour instead of an arbitrary one.
            votes = {}
            for index, weight in zip(k_indices, vote_weights):
                label = self.y_train[index]
                votes[label] = votes.get(label, 0.0) + weight
            predictions.append(max(votes, key=votes.get))
        return np.array(predictions)

In [None]:
X_train, X_test, y_train, y_test = my_train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the from-scratch classifier on the training part and label the
# held-out samples.
knn = KNN(k=3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

print("Predictions:", predictions)
# scikit-learn's signature is accuracy_score(y_true, y_pred): the ground
# truth goes first, the predictions second.
acc = accuracy_score(y_test, predictions)
print(f"Accuracy = {acc}")

<a id='gridsearch-scratch'></a>
<hr/>

### 3. GridSearch from scratch

In [None]:
# Clue: Look at itertools.product() functionality in python.
# It will allow you to create Cartesian products needed for multiple hyperparam tuning
# Use it in a loop to write your custom Grid Search

In [None]:
def my_grid_search(X_train, y_train, X_test, param_grid, k_values, y_test=None):
    """Exhaustively search KNN hyper-parameters and keep the most accurate set.

    Parameters
    ----------
    X_train, y_train : array-like
        Data every candidate model is fitted on.
    X_test : array-like
        Samples every candidate model is scored on.
    param_grid : dict or None
        Maps a hyper-parameter name to the values to try. ``'n_neighbors'``
        is the number of neighbours (used only when ``k_values`` is None);
        every other key is forwarded to ``KNN`` as a keyword argument.
    k_values : iterable of int or None
        Neighbour counts to try. When None, ``param_grid['n_neighbors']`` is
        used instead.
    y_test : array-like, optional
        True labels of ``X_test``. When omitted, the module-level ``y_test``
        is used so that existing five-argument calls keep working.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy)``: the best combination as a dict that
        always contains ``'n_neighbors'``, and its accuracy on the test data.
        The first combination wins a tie.

    Raises
    ------
    ValueError
        If no labels or no neighbour counts are available, or the number of
        predictions differs from the number of labels.
    """
    if y_test is None:
        # Legacy behaviour: the labels used to be read from a global.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise ValueError(
                "y_test must be passed when no module-level y_test exists"
            ) from None
    y_test = np.asarray(y_test)

    grid = dict(param_grid or {})
    grid_k_values = grid.pop("n_neighbors", None)
    candidate_ks = k_values if k_values is not None else grid_k_values
    if candidate_ks is None:
        raise ValueError("no n_neighbors candidates given")
    extra_names = list(grid)

    best_params = None
    best_accuracy = -1

    # Cartesian product of every hyper-parameter's candidates; k varies
    # slowest, so a k-only search visits the values in the given order.
    combinations = itertools.product(
        candidate_ks, *(grid[name] for name in extra_names)
    )
    for k, *extra_values in combinations:
        extra = dict(zip(extra_names, extra_values))
        knn = KNN(k=k, **extra)
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_test)
        if len(predictions) != len(y_test):
            raise ValueError(
                f"got {len(predictions)} predictions for {len(y_test)} labels"
            )
        accuracy = (predictions == y_test).mean()

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'n_neighbors': k, **extra}

    return best_params, best_accuracy

In [None]:
# Re-create the train/test parts: 20% of the samples are held out, with a
# fixed seed.
X_train, X_test, y_train, y_test = my_train_test_split(X, y, test_size=0.2, random_state=42)


# Candidate neighbour counts. k_values repeats the list stored under
# param_grid['n_neighbors'].
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
k_values = [1, 3, 5, 7, 9]


# NOTE(review): the candidates are compared on X_test, so the accuracy
# printed below comes from the same data that selected the winner and is
# likely optimistic.
best_params, best_accuracy = my_grid_search(X_train, y_train, X_test, param_grid, k_values)

print("Best K value found:", best_params['n_neighbors'])
print("Accuracy of the best model:", best_accuracy)

<a id='integrate'></a>
<hr/>

### 4. Integrate your custom code

1. Create a dataframe of iris dataset
2. Use your custom train test split function to split into train and test
3. Use your custom GridSearch with your custom KNN class to identify the best k and the best weight for the iris dataset

In [None]:
def my_train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split samples and labels into train and test parts.

    Parameters
    ----------
    X : array-like
        Samples; the first axis indexes the samples.
    y : array-like
        Labels, one per sample (same length as ``X``).
    test_size : float, default 0.2
        Fraction of the samples (between 0 and 1) placed in the test part.
    random_state : int or None, default None
        Seed for a reproducible shuffle. When given, NumPy's global random
        generator is seeded with it; ``None`` leaves the generator untouched.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)`` -- note the order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    # `is not None` rather than truthiness: 0 is a valid seed too.
    if random_state is not None:
        np.random.seed(random_state)

    # Shuffle row indices rather than (sample, label) pairs: fancy indexing
    # keeps X and y aligned and also copes with an empty train or test part.
    order = np.random.permutation(n_samples)

    split_index = int(n_samples * (1 - test_size))
    train_idx, test_idx = order[:split_index], order[split_index:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

class KNN:
    """k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    weights : {"uniform", "distance"}, default "uniform"
        ``"uniform"`` gives every neighbour one vote; ``"distance"`` weighs
        each vote by the inverse of the neighbour's distance.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer or ``weights`` is not recognised.
    """

    def __init__(self, k=3, weights="uniform"):
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if weights not in ("uniform", "distance"):
            raise ValueError(
                f"weights must be 'uniform' or 'distance', got {weights!r}"
            )
        self.k = k
        self.weights = weights

    def fit(self, X_train, y_train):
        """Store the training samples and labels used by ``predict``.

        Raises ``ValueError`` for an empty training set or when the number of
        samples and labels differ.
        """
        features = np.asarray(X_train, dtype=float)
        labels = np.asarray(y_train)
        if features.size == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(features) != len(labels):
            raise ValueError(
                "X_train and y_train must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        # One flat row per sample, so a single vectorised norm per query
        # covers scalar, vector and multi-dimensional samples alike.
        self.X_train = features.reshape(len(features), -1)
        self.y_train = labels

    def predict(self, X_test):
        """Return an array with the predicted label of every sample in ``X_test``."""
        predictions = []
        for sample in np.asarray(X_test, dtype=float):
            distances = np.linalg.norm(self.X_train - sample.ravel(), axis=1)
            # A stable sort keeps equidistant neighbours in training order,
            # which makes the neighbour choice deterministic.
            k_indices = np.argsort(distances, kind="stable")[:self.k]

            if self.weights == "distance":
                nearest = distances[k_indices]
                exact = nearest == 0
                # An exact match would get an infinite weight; let the exact
                # matches alone decide instead.
                vote_weights = exact.astype(float) if exact.any() else 1.0 / nearest
            else:
                vote_weights = np.ones(len(k_indices))

            # The dict remembers insertion order (closest neighbour first) and
            # max() returns the first maximum, so a tied vote goes to the
            # label of the closest neighbour instead of an arbitrary one.
            votes = {}
            for index, weight in zip(k_indices, vote_weights):
                label = self.y_train[index]
                votes[label] = votes.get(label, 0.0) + weight
            predictions.append(max(votes, key=votes.get))
        return np.array(predictions)


def my_grid_search(X_train, y_train, X_test, param_grid, k_values, y_test=None):
    """Exhaustively search KNN hyper-parameters and keep the most accurate set.

    Defined at module level because it is called as a plain function.

    Parameters
    ----------
    X_train, y_train : array-like
        Data every candidate model is fitted on.
    X_test : array-like
        Samples every candidate model is scored on.
    param_grid : dict or None
        Maps a hyper-parameter name to the values to try. ``'n_neighbors'``
        is the number of neighbours (used only when ``k_values`` is None);
        every other key, e.g. ``'weights'``, is forwarded to ``KNN`` as a
        keyword argument.
    k_values : iterable of int or None
        Neighbour counts to try. When None, ``param_grid['n_neighbors']`` is
        used instead.
    y_test : array-like, optional
        True labels of ``X_test``. When omitted, the module-level ``y_test``
        is used so that existing five-argument calls keep working.

    Returns
    -------
    tuple
        ``(best_params, best_accuracy)``: the best combination as a dict that
        always contains ``'n_neighbors'``, and its accuracy on the test data.
        The first combination wins a tie.

    Raises
    ------
    ValueError
        If no labels or no neighbour counts are available, or the number of
        predictions differs from the number of labels.
    """
    if y_test is None:
        # Legacy behaviour: the labels used to be read from a global.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise ValueError(
                "y_test must be passed when no module-level y_test exists"
            ) from None
    y_test = np.asarray(y_test)

    grid = dict(param_grid or {})
    grid_k_values = grid.pop("n_neighbors", None)
    candidate_ks = k_values if k_values is not None else grid_k_values
    if candidate_ks is None:
        raise ValueError("no n_neighbors candidates given")
    extra_names = list(grid)

    best_params = None
    best_accuracy = -1

    # Cartesian product of every hyper-parameter's candidates; k varies
    # slowest, so a k-only search visits the values in the given order.
    combinations = itertools.product(
        candidate_ks, *(grid[name] for name in extra_names)
    )
    for k, *extra_values in combinations:
        extra = dict(zip(extra_names, extra_values))
        knn = KNN(k=k, **extra)
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_test)
        if len(predictions) != len(y_test):
            raise ValueError(
                f"got {len(predictions)} predictions for {len(y_test)} labels"
            )
        accuracy = (predictions == y_test).mean()

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'n_neighbors': k, **extra}

    return best_params, best_accuracy


# Keep the old attribute path working: the search used to be (accidentally)
# nested inside the class body.
KNN.my_grid_search = staticmethod(my_grid_search)
# Load the iris dataset: `data` holds the feature matrix, `target` the labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = my_train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate neighbour counts. k_values repeats the list stored under
# param_grid['n_neighbors'].
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
k_values = [1, 3, 5, 7, 9]

# NOTE(review): the candidates are compared on X_test, so the accuracy
# printed below comes from the same data that selected the winner and is
# likely optimistic.
best_params, best_accuracy = my_grid_search(X_train, y_train, X_test, param_grid, k_values)

print("Best K value found:", best_params['n_neighbors'])
print("Accuracy of the best model:", best_accuracy)
