In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load data from Excel file
file_path = r"C:\Users\vignesh\Desktop\sem1\math\Project_5\us-violence-brief-1.xls"
data = pd.read_excel(file_path)

# Preprocessing data
# Convert dates to numerical format
data['date'] = pd.to_datetime(data[['year', 'month', 'day']])
data.drop(['year', 'month', 'day'], axis=1, inplace=True)

# Scale latitude and longitude if necessary
scaler = StandardScaler()
data[['latitude', 'longitude']] = scaler.fit_transform(data[['latitude', 'longitude']])

# Convert datetime to timestamp (int64)
data['timestamp'] = data['date'].astype('int64') // 10**9  # Converting nanoseconds to seconds

# Split data into training and testing sets
train_data = data[data['date'].dt.year < 2023]
test_data = data[data['date'].dt.year == 2023]

# Define distance function (Euclidean distance)
def euclidean_distance(x1, x2):
    numerical_x1 = np.array([x1['latitude'], x1['longitude'], x1['timestamp']])
    numerical_x2 = np.array([x2['latitude'], x2['longitude'], x2['timestamp']])
    return np.sqrt(np.sum((numerical_x1 - numerical_x2) ** 2))

# Implement KNN algorithm
def knn(train_data, test_data, k):
    predictions = []
    for i in range(len(test_data)):
        distances = []
        for j in range(len(train_data)):
            dist = euclidean_distance(test_data.iloc[i], train_data.iloc[j])
            if dist >= 0:  # Ensure non-negative distances
                distances.append((dist, train_data.iloc[j, 0]))
        distances.sort(key=lambda x: x[0])
        neighbors = distances[:k]
        counts = np.zeros(2)  # Initialize counts array with zeros
        for neighbor in neighbors:
            if neighbor[0] >= 0:
                counts[int(neighbor[1])] += 1
        prediction = np.argmax(counts)
        predictions.append(prediction)
    return predictions

# Choose odd values of k for KNN
k_values = [1, 3, 5, 7, 9]

# Evaluate KNN for each k value
for k in k_values:
    predictions = knn(train_data, test_data, k)
    accuracy = accuracy_score(test_data['fatalities'], predictions)
    print(f"Accuracy for k={k}: {accuracy}")

    # Calculate the fraction of correctly classified 2023 locations
    total_2023_data_points = len(test_data)
    correctly_classified = sum(predictions == test_data['fatalities'])
    fraction_correctly_classified = correctly_classified / total_2023_data_points
    print(f"Fraction of correctly classified 2023 locations for k={k}: {fraction_correctly_classified}")

Accuracy for k=1: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=1: 0.8341013824884793
Accuracy for k=3: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=3: 0.8341013824884793
Accuracy for k=5: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=5: 0.8341013824884793
Accuracy for k=7: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=7: 0.8341013824884793
Accuracy for k=9: 0.8341013824884793
Fraction of correctly classified 2023 locations for k=9: 0.8341013824884793
