In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
import pickle

# Load the dataset
df = pd.read_csv('student_data.csv')

# Separate the features and target variable
X = df[['GRE Score', 'TOEFL Score', 'SOP', 'LOR', 'CGPA', 'Research']]
y = df['Chance of Admit']

# Scale the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define the k-NN Regression model
knn = KNeighborsRegressor()

# Define the hyperparameters to tune
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute']
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(knn, param_grid=param_grid, cv=10)
grid_search.fit(X, y)
knn = grid_search.best_estimator_

# Evaluate the performance of the model using cross-validation
scores = cross_val_score(knn, X, y, cv=10)
print('Cross-validation scores:', scores)
print('Mean score:', np.mean(scores))

# Generate predictions for all institutions
df['Predicted Chance of Admit'] = knn.predict(X)

# Group by institution and sort by predicted chance of admit
df_grouped = df.groupby('Institution').mean().sort_values(by='Predicted Chance of Admit', ascending=False)

# Print the top 5 institutions and their respective chances of admission
top_institutions = df_grouped.index.values[:5]
for institution in top_institutions:
    chance_of_admit = df_grouped.loc[institution]['Predicted Chance of Admit']
    print(f"Institution: {institution}, Chance of Admit: {chance_of_admit}")

# Calculate the R2 score
r2 = knn.score(X, y)
print('R2 score:', r2)

Cross-validation scores: [1.         1.         1.         1.         0.93785989 0.90895003
 1.         1.         1.         1.        ]
Mean score: 0.9846809920050206
Institution: University Of Arizona, Chance of Admit: 0.8457142857142858
Institution: Tufts University, Chance of Admit: 0.8328571428571429
Institution: Princeton University, Chance of Admit: 0.8242857142857142
Institution: Northeastern University, Chance of Admit: 0.8071428571428572
Institution: UIUC, Chance of Admit: 0.8
R2 score: 1.0


In [5]:
# Save the model to disk
filename = 'kNNr_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(knn, file)

In [10]:
# Load the trained KNN model from disk
filename = 'kNNr_model.pkl'
with open(filename, 'rb') as file:
    knn = pickle.load(file)

# Load the dataset used for training the model
df_train = pd.read_csv('3.csv')

# Separate the features and target variable
X_train = df_train[['GRE Score', 'TOEFL Score', 'SOP', 'LOR', 'CGPA', 'Research']]
y_train = df_train['Chance of Admit']

# Fit a StandardScaler instance to the training data and use it to scale the user input
scaler = StandardScaler()
scaler.fit(X_train)

# Prompt the user to enter values for the input features
user_input = []
print("Enter the values of the following features:")
for feature in X_train.columns:
    value = input(f"{feature}: ")
    user_input.append(float(value))
user_input = np.array(user_input).reshape(1, -1)
user_input = scaler.transform(user_input)

# Use the KNN model to make predictions on the user input
predicted_chance_of_admit = knn.predict(user_input)[0]
print(f"Predicted chance of admission: {predicted_chance_of_admit:.2f}")

# Group by institution and sort by predicted chance of admit
df_grouped = df_train.groupby('Institution').mean().sort_values(by='Chance of Admit', ascending=False)

# Print the top 5 institutions that have predicted chances of admission less than the user input
top_institutions = df_grouped[df_grouped['Chance of Admit'] < predicted_chance_of_admit].index.values[:5]
if len(top_institutions) == 0:
    print("No institutions found with predicted chance of admission less than the user input.")
else:
    print(f"\nTop {len(top_institutions)} institutions that have predicted chances of admission less than {predicted_chance_of_admit:.2f}:")
    for institution in top_institutions:
        chance_of_admit = df_grouped.loc[institution]['Chance of Admit']
        print(f"Institution: {institution}, Chance of Admit: {chance_of_admit:.2f}")


Enter the values of the following features:
Predicted chance of admission: 0.94

Top 5 institutions that have predicted chances of admission less than 0.94:
Institution: University Of Arizona, Chance of Admit: 0.85
Institution: Tufts University, Chance of Admit: 0.83
Institution: Princeton University, Chance of Admit: 0.82
Institution: Northeastern University, Chance of Admit: 0.81
Institution: UIUC, Chance of Admit: 0.80


