<a href="https://colab.research.google.com/github/ShawonTech/Machine-Learning-Lab./blob/main/shawonjoss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [2]:
class CustomKNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int, default=3
        Number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Both inputs are converted to NumPy arrays so that ``predict`` always
        indexes labels by position. Without this, a pandas Series whose index
        is not 0..n-1 would be indexed by label instead.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` do not have the same length.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length, "
                f"got {len(X_train)} and {len(y_train)}"
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of ``X_test`` by majority vote.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X_test``.

        Raises
        ------
        ValueError
            If ``self.k`` is smaller than 1.
        """
        # k == 0 would leave no voters; a negative k would silently slice off
        # the farthest samples instead of selecting the nearest ones.
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # Iterate over rows even when a list or DataFrame is supplied.
        X_test = np.asarray(X_test)
        predictions = []
        for x in X_test:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            # A stable sort keeps equidistant samples in training order, so
            # results do not depend on the sort implementation.
            k_indices = np.argsort(distances, kind="stable")[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            # On a tied vote Counter keeps the label it saw first, i.e. the
            # label of the closest neighbour among the tied classes.
            most_common = Counter(k_nearest_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return np.array(predictions)

In [4]:
# Read the news CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is an absolute Colab/Google Drive location, so this
# cell presumably needs Drive mounted at /content/drive first — confirm, or
# make the location configurable.
# NOTE(review): `dataset` is not referenced by any later cell visible here.
dataset = pd.read_csv('/content/drive/MyDrive/news_dataset.csv')
display(dataset.head())

Unnamed: 0,text,category
0,Team wins championship,sports
1,Election results announced,politics
2,Player scores hat-trick,sports
3,New policy debate,politics
4,Match postponed due to rain,sports


In [5]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays first so the comparison is
    always element-wise. Plain Python lists would otherwise compare as a
    single bool, and pandas Series would be aligned on their index.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    return np.sum(y_true == y_pred) / len(y_true)

def confusion_matrix(y_true, y_pred):
    """Build a confusion matrix over every label seen in either input.

    Returns
    -------
    cm : numpy.ndarray of int, shape (n_labels, n_labels)
        ``cm[i, j]`` counts samples whose true label is ``unique_labels[i]``
        and whose predicted label is ``unique_labels[j]``.
    unique_labels : numpy.ndarray
        Sorted distinct labels; they index both axes of ``cm``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Coerce to flat arrays: with plain lists `y_true == label` is a single
    # bool and every cell of the matrix would come out as zero.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # `codes` maps each element of the concatenated array to the position of
    # its label in `unique_labels`; the first half belongs to y_true.
    unique_labels, codes = np.unique(
        np.concatenate((y_true, y_pred)), return_inverse=True
    )
    codes = codes.ravel()
    n_samples = y_true.shape[0]

    cm = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)
    # Unbuffered add, so repeated (true, pred) pairs are each counted.
    np.add.at(cm, (codes[:n_samples], codes[n_samples:]), 1)
    return cm, unique_labels

def precision_recall_f1(y_true, y_pred):
    """Return macro-averaged (precision, recall, F1) for the predictions.

    Each metric is computed per label from the confusion matrix and then
    averaged with equal weight per label. A metric whose denominator is zero
    counts as 0 for that label.
    """
    matrix, classes = confusion_matrix(y_true, y_pred)

    per_class_precision = []
    per_class_recall = []
    per_class_f1 = []
    for idx, _ in enumerate(classes):
        true_pos = matrix[idx, idx]
        false_pos = np.sum(matrix[:, idx]) - true_pos  # rest of the predicted column
        false_neg = np.sum(matrix[idx, :]) - true_pos  # rest of the actual row

        if (true_pos + false_pos) > 0:
            class_precision = true_pos / (true_pos + false_pos)
        else:
            class_precision = 0

        if (true_pos + false_neg) > 0:
            class_recall = true_pos / (true_pos + false_neg)
        else:
            class_recall = 0

        if (class_precision + class_recall) > 0:
            class_f1 = 2 * class_precision * class_recall / (class_precision + class_recall)
        else:
            class_f1 = 0

        per_class_precision.append(class_precision)
        per_class_recall.append(class_recall)
        per_class_f1.append(class_f1)

    return (
        np.mean(per_class_precision),
        np.mean(per_class_recall),
        np.mean(per_class_f1),
    )

In [6]:
# Hand-made three-class example used to sanity-check the metric helpers.
y_true = np.array([0, 1, 2, 2, 0, 1, 2])
y_pred = np.array([0, 2, 1, 2, 0, 0, 2])

# 4 of the 7 predictions match the true labels, so accuracy is 4/7.
acc = accuracy(y_true, y_pred)
# Rows of `cm` are true labels, columns are predicted labels.
cm, labels = confusion_matrix(y_true, y_pred)
# These three values are unweighted means over the labels.
precision, recall, f1 = precision_recall_f1(y_true, y_pred)

print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)
print("Labels:", labels)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")

Accuracy: 0.5714285714285714
Confusion Matrix:
 [[2 0 0]
 [1 0 1]
 [0 1 2]]
Labels: [0 1 2]
Precision: 0.44, Recall: 0.56, F1-score: 0.49


In [7]:
def find_best_k_split(X, y, k_values, split_ratios, random_state=42):
    """Grid-search ``k`` and the test-split ratio for ``CustomKNN``.

    Every (k, split) pair is scored by accuracy on the held-out part of a
    ``train_test_split`` made with that ratio. The first pair that reaches
    the highest score wins.

    Parameters
    ----------
    X, y : array-like
        Features and labels to split.
    k_values : iterable of int
        Candidate neighbour counts.
    split_ratios : iterable of float
        Candidate ``test_size`` values passed to ``train_test_split``.
    random_state : int, default=42
        Seed forwarded to ``train_test_split`` so that every candidate with
        the same ratio is scored on the same split.

    Returns
    -------
    (best_k, best_split, best_score)
        ``(None, None, 0)`` if no pair was evaluated.

    Notes
    -----
    The score is measured on the same split that is used for selection, so
    ``best_score`` is an optimistic estimate of generalisation accuracy.
    """
    best_score = 0
    best_k = None
    best_split = None
    for k in k_values:
        for split in split_ratios:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=split, random_state=random_state
            )
            model = CustomKNN(k=k)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy(y_test, y_pred)
            # Always accept the first evaluated pair. Otherwise a run where
            # every score is 0 would return k=None and break the caller.
            if best_k is None or score > best_score:
                best_score = score
                best_k = k
                best_split = split
    return best_k, best_split, best_score

In [8]:
# Load the Iris features and integer class targets bundled with scikit-learn.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Search grid: k from 1 to 15 and three candidate test-set fractions.
k_values = range(1, 16)
split_ratios = [0.2, 0.25, 0.3]

best_k_iris, best_split_iris, best_score_iris = find_best_k_split(X_iris, y_iris, k_values, split_ratios)
print(f"Iris Best k: {best_k_iris}, Best split: {best_split_iris}, Accuracy: {best_score_iris:.2f}")

# Rebuild the winning split (same test_size and random_state as the search)
# and refit, so the detailed metrics below describe that configuration.
# NOTE(review): this is the same split that selected k, so these metrics are
# not an independent estimate of generalisation performance.
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=best_split_iris, random_state=42)
custom_knn = CustomKNN(k=best_k_iris)
custom_knn.fit(X_train, y_train)
y_pred = custom_knn.predict(X_test)

# These assignments overwrite acc, cm, labels, precision, recall and f1 from
# the earlier metrics sanity-check cell.
acc = accuracy(y_test, y_pred)
cm, labels = confusion_matrix(y_test, y_pred)
precision, recall, f1 = precision_recall_f1(y_test, y_pred)

print("\nCustom KNN Iris Metrics:")
print(f"Accuracy: {acc:.2f}")
print(f"Confusion Matrix:\n{cm}")
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")

Iris Best k: 1, Best split: 0.2, Accuracy: 1.00

Custom KNN Iris Metrics:
Accuracy: 1.00
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Precision: 1.00, Recall: 1.00, F1-Score: 1.00


In [9]:
# Build a small in-memory news dataset: 10 headlines repeated 10 times
# (100 rows), with labels alternating between 'sports' and 'politics'.
# NOTE(review): every headline appears 10 times, so a random train/test split
# will most likely put identical rows on both sides — scores on this data say
# little about generalisation.
data = {
    'text': [
        'Team wins championship', 'Election results announced', 'Player scores hat-trick', 'New policy debate',
        'Match postponed due to rain', 'Government passes new bill', 'Coach resigns after loss', 'Parliament in session',
        'Fans celebrate victory', 'Budget discussion heats up'
    ]*10,
    'category': ['sports', 'politics', 'sports', 'politics', 'sports', 'politics', 'sports', 'politics', 'sports', 'politics'] * 10
}
news_df = pd.DataFrame(data)

# NOTE(review): imports in the middle of the notebook. LabelEncoder is already
# imported in the first code cell; TfidfVectorizer is only imported here.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the string categories as integers in a new column.
label_encoder = LabelEncoder()
news_df['category_encoded'] = label_encoder.fit_transform(news_df['category'])

# TF-IDF features, capped at 1000 terms and converted to a dense array so
# they can be used with NumPy arithmetic.
# NOTE(review): the vectorizer is fitted on all rows before any train/test
# split is made.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_news = tfidf_vectorizer.fit_transform(news_df['text']).toarray()
y_news = news_df['category_encoded'].values

In [10]:
# Same search grid as the Iris experiment: k from 1 to 15, three test fractions.
k_values_news = range(1, 16)
split_ratios_news = [0.2, 0.25, 0.3]

# NOTE(review): find_best_k_split scores candidates with CustomKNN, while the
# model evaluated below is scikit-learn's KNeighborsClassifier.
best_k_news, best_split_news, best_score_news = find_best_k_split(X_news, y_news, k_values_news, split_ratios_news)
print(f"News Best k: {best_k_news}, Best split: {best_split_news}, Accuracy: {best_score_news:.2f}")

# NOTE(review): both names are already imported in the first code cell.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Recreate the selected split with the same random_state as the search.
X_train_news, X_test_news, y_train_news, y_test_news = train_test_split(X_news, y_news, test_size=best_split_news, random_state=42)

sklearn_knn_news = KNeighborsClassifier(n_neighbors=best_k_news)
sklearn_knn_news.fit(X_train_news, y_train_news)
y_pred_sklearn_news = sklearn_knn_news.predict(X_test_news)

# The notebook's own metric helpers are used to score the sklearn predictions.
acc_sk_news = accuracy(y_test_news, y_pred_sklearn_news)
precision_sk_news, recall_sk_news, f1_sk_news = precision_recall_f1(y_test_news, y_pred_sklearn_news)

print("\nScikit-learn KNN News Metrics:")
print(f"Accuracy: {acc_sk_news:.2f}")
print(f"Precision: {precision_sk_news:.2f}, Recall: {recall_sk_news:.2f}, F1-Score: {f1_sk_news:.2f}")

News Best k: 1, Best split: 0.2, Accuracy: 1.00

Scikit-learn KNN News Metrics:
Accuracy: 1.00
Precision: 1.00, Recall: 1.00, F1-Score: 1.00
