# Hand-Made k-Nearest Neighbours


This notebook implements a k-Nearest Neighbors (k-NN) classifier from scratch to label movies as "good" or "bad" based on their IMDB rating (a rating of 7.0 or higher counts as "good"), using release year, runtime, Meta score, and number of votes as features. After preprocessing the data (dropping rows with missing values, converting column types, and splitting into training and validation sets), the model computes the Euclidean distance between movies and assigns each validation movie the majority label among its k closest training neighbors. The final output, `Validation Accuracy: 1.00`, means every movie in the validation set was classified correctly. Treat this figure with caution: a perfect score is only informative if both classes actually occur in the data — if every movie in the dataset is rated 7.0 or higher, all labels are "good" and any classifier would score 1.00.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the movie dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('imbd.csv')

In [3]:
# Preview the first rows, list the column names, and summarise dtypes / non-null counts.
print(df.head())
print(df.columns)
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
df.info()

                                         Poster_Link  \
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   

               Series_Title Released_Year Certificate  Runtime  \
0  The Shawshank Redemption          1994           A  142 min   
1             The Godfather          1972           A  175 min   
2           The Dark Knight          2008          UA  152 min   
3    The Godfather: Part II          1974           A  202 min   
4              12 Angry Men          1957           U   96 min   

                  Genre  IMDB_Rating  \
0                 Drama          9.3   
1          Crime, Drama          9.2   
2  Action, Crime, Drama          9.0   
3          Crime, Drama          9.0   
4          Crime, Drama          9.0   

                         

In [97]:
# Tally the missing entries per column to see which columns need cleaning.
missing_values = df.isna().sum()
print(missing_values)

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64


In [98]:
# Check how the release year is stored before treating it as a numeric feature.
print(df['Released_Year'].dtype)
print(df['Released_Year'].head())

object
0    1994
1    1972
2    2008
3    1974
4    1957
Name: Released_Year, dtype: object


In [99]:
# Inspect the raw values: the column has dtype object and contains a non-numeric 'PG' entry.
print(df['Released_Year'].unique())
pg_count = (df['Released_Year'] == 'PG').sum()
print(pg_count)

# Drop the malformed row(s). The explicit .copy() makes the filtered frame an
# independent object, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning (assigning into a boolean-filtered slice).
df = df[df['Released_Year'] != 'PG'].copy()
df['Released_Year'] = df['Released_Year'].astype(int)


['1994' '1972' '2008' '1974' '1957' '2003' '1993' '2010' '1999' '2001'
 '1966' '2002' '1990' '1980' '1975' '2020' '2019' '2014' '1998' '1997'
 '1995' '1991' '1977' '1962' '1954' '1946' '2011' '2006' '2000' '1988'
 '1985' '1968' '1960' '1942' '1936' '1931' '2018' '2017' '2016' '2012'
 '2009' '2007' '1984' '1981' '1979' '1971' '1963' '1964' '1950' '1940'
 '2013' '2005' '2004' '1992' '1987' '1986' '1983' '1976' '1973' '1965'
 '1959' '1958' '1952' '1948' '1944' '1941' '1927' '1921' '2015' '1996'
 '1989' '1978' '1961' '1955' '1953' '1925' '1924' '1982' '1967' '1951'
 '1949' '1939' '1937' '1934' '1928' '1926' '1920' '1970' '1969' '1956'
 '1947' '1945' '1930' '1938' '1935' '1933' '1932' '1922' '1943' 'PG']
1


In [100]:
# Keep only the rows that have a value in all three listed columns.
# NOTE(review): Meta_score and No_of_Votes are used as model features later in the
# notebook, but Gross is not among the selected features — requiring it here discards
# rows the model could otherwise use. Confirm this is intended.
df = df.dropna(subset=['Meta_score', 'No_of_Votes', 'Gross'])

In [101]:
# Binary target: 1 ("good") when the IMDB rating is at least 7.0, otherwise 0 ("bad").
# A vectorised comparison gives the same result as a row-by-row apply(lambda ...).
df['label'] = (df['IMDB_Rating'] >= 7.0).astype(int)

# Strip the " min" suffix and convert the runtime to float. Casting to str first
# keeps this cell re-runnable: once the column is already float, the .str accessor
# would otherwise raise an AttributeError on a second execution.
df['Runtime'] = df['Runtime'].astype(str).str.replace(" min", "", regex=False).astype(float)

features = df[['Released_Year', 'Runtime', 'Meta_score', 'No_of_Votes']]

labels = df['label']

# Sanity check: with a single class present, every prediction is trivially correct
# and the validation accuracy says nothing about the classifier.
if labels.nunique() < 2:
    print("Warning: all movies share the same label; validation accuracy will be trivially perfect.")


In [102]:
# Hold out 20% of the rows for validation (test_size=0.2); the fixed random_state
# makes the shuffle, and therefore the split, reproducible across runs.
training_set, validation_set, training_labels, validation_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [103]:
# Min-max scale every feature before measuring distances. Without this, the
# Euclidean distance is dominated by the feature with the largest numeric range
# (No_of_Votes) and the remaining features barely influence which neighbours are chosen.
# The statistics come from the training split only, so nothing leaks from validation;
# validation values may therefore fall slightly outside [0, 1].
feature_min = training_set.min()
feature_span = (training_set.max() - feature_min).replace(0, 1)  # constant column: avoid dividing by zero

# Convert to plain dicts keyed by the DataFrame index: {row_index: {feature_name: value}}
# for the feature sets and {row_index: label} for the labels.
training_set = ((training_set - feature_min) / feature_span).to_dict(orient='index')
validation_set = ((validation_set - feature_min) / feature_span).to_dict(orient='index')
training_labels = training_labels.to_dict()
validation_labels = validation_labels.to_dict()

In [104]:
def distance(movie1, movie2):
    """Return the Euclidean distance between two feature mappings.

    Only the keys of ``movie1`` are iterated; each one must also be present in
    ``movie2`` (a missing key raises ``KeyError``). Extra keys in ``movie2``
    are ignored.
    """
    squared_gaps = [(movie1[feature] - movie2[feature]) ** 2 for feature in movie1]
    return sum(squared_gaps) ** 0.5

def classify(unknown, dataset, labels, k):
    """Predict a binary label for ``unknown`` by majority vote of its nearest neighbours.

    Parameters:
        unknown: feature mapping of the point to classify.
        dataset: mapping of identifier -> feature mapping for the known points.
        labels: mapping of identifier -> label (1 counts as "good", anything else as "bad").
        k: number of nearest neighbours to consult; expected to be a positive integer.

    Returns:
        1 if strictly more than half of the consulted neighbours are labelled 1,
        otherwise 0 (ties therefore resolve to 0).
    """
    distances = [(distance(movie, unknown), title) for title, movie in dataset.items()]
    distances.sort(key=lambda pair: pair[0])
    neighbors = distances[:k]
    num_good = sum(1 for _, title in neighbors if labels[title] == 1)
    # Count against the neighbours actually found: when k exceeds the dataset size,
    # "k - num_good" would invent phantom "bad" votes and bias the result towards 0.
    num_bad = len(neighbors) - num_good
    return 1 if num_good > num_bad else 0

def find_validation_accuracy(training_set, training_labels, validation_set, validation_labels, k):
    """Return the fraction of validation points whose predicted label matches the actual one.

    Each entry of ``validation_set`` is classified against ``training_set`` /
    ``training_labels`` using ``k`` neighbours, and the prediction is compared
    with the entry of ``validation_labels`` stored under the same key.
    An empty ``validation_set`` raises ``ZeroDivisionError``.
    """
    hits = sum(
        1
        for key in validation_set
        if classify(validation_set[key], training_set, training_labels, k) == validation_labels[key]
    )
    return hits / len(validation_set)

# Evaluate the classifier on the held-out validation split.
k = 3  # neighbours consulted for each prediction
accuracy = find_validation_accuracy(
    training_set=training_set,
    training_labels=training_labels,
    validation_set=validation_set,
    validation_labels=validation_labels,
    k=k,
)
print(f"Validation Accuracy: {accuracy:.2f}")


Validation Accuracy: 1.00
