In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [None]:
# Load adult.csv into a DataFrame and preview its first 15 rows
csv_path = "/content/sample_data/adult.csv"
data = pd.read_csv(csv_path)
data.head(15)

In [None]:
# Display basic info about the dataset: DataFrame.info() prints each column's
# name, non-null count and dtype, plus the frame's memory usage
data.info()

In [None]:
# Count the '?' placeholders that stand in for missing values in each text column.
# Text columns are selected by both the 'object' and the 'string' dtype so the
# check does not silently skip them when pandas stores text in its dedicated
# string dtype (the default from pandas 3 onwards) instead of 'object'.
text_cols = data.select_dtypes(include=['object', 'string']).columns
for col in text_cols:
    print(f"Column '{col}': {data[col].isin(['?']).sum()} '?' values")

In [None]:
# Replace the '?' placeholder with an explicit 'Others' category in the three
# columns handled here: workclass, occupation and native-country
placeholder_cols = ['workclass', 'occupation', 'native-country']
data[placeholder_cols] = data[placeholder_cols].replace('?', 'Others')

# Discard the rows whose workclass is 'Without-pay' or 'Never-worked'
excluded_workclasses = ['Without-pay', 'Never-worked']
data = data[~data['workclass'].isin(excluded_workclasses)]

# Print the remaining categories of each cleaned column to confirm the result
for col in placeholder_cols:
    print(f"Unique {col} values after cleaning:", data[col].unique())

In [None]:
# Remove the 'fnlwgt' column; it is treated here as not relevant for prediction
data = data.drop(columns=['fnlwgt'])

In [None]:
# One-hot encode the categorical features.
# Text columns are picked up by both the 'object' and the 'string' dtype so the
# selection keeps working whichever of the two pandas uses to store text.
# 'income' is the target variable, so it is left out of the one-hot encoding.
categorical_cols = [
    col
    for col in data.select_dtypes(include=['object', 'string']).columns
    if col != 'income'
]

# drop_first=True omits the first dummy column of every encoded feature
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Encode the target variable 'income' as integer class labels
le = LabelEncoder()
data_encoded['income'] = le.fit_transform(data_encoded['income'])

data_encoded.head()

In [None]:
# Separate the predictors (X) from the target column (y)
target_col = 'income'
X = data_encoded.drop(columns=[target_col])
y = data_encoded[target_col]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Initialize and train the KNN model.
# KNN classifies by distance between rows, so any feature on a large numeric
# scale would dominate the 0/1 dummy columns. Standardizing inside a pipeline
# puts all features on a comparable scale; the scaler is fitted on the training
# data only and is re-applied automatically whenever knn.predict() is called.
# NOTE: accuracy figures recorded before this change need to be re-run.
knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5),  # You can experiment with n_neighbors
)
knn.fit(xtrain, ytrain)

In [None]:
# Make predictions on the test set
predict = knn.predict(xtest)

# Evaluate the model accuracy (fraction of test rows classified correctly).
# accuracy_score is already imported in the notebook's import cell, so it is
# not imported again here.
accuracy_score(ytest, predict)

0.8251268710323984

In [None]:
# Disabled experiment, kept for reference: a grid search over KNN
# hyper-parameters (n_neighbors, weights, algorithm) fitted on the training
# split, followed by printing the best parameters, the best search score and
# the accuracy on the test split. Uncommenting it rebinds `knn` to a new,
# unfitted KNeighborsClassifier.
# from sklearn.model_selection import GridSearchCV
# knn=KNeighborsClassifier(n_jobs=-1)
# param_grid={
#     'n_neighbors':[3,5,7],
#     'weights':['uniform', 'distance'],
#     'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
# }
# gsv=GridSearchCV(knn,param_grid,n_jobs=-1,refit=True)
# gsv.fit(xtrain,ytrain)
# prediction=gsv.predict(xtest)
# print(gsv.best_params_)
# print(gsv.best_score_)
# print(accuracy_score(ytest,prediction))