In [140]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
import warnings
warnings.filterwarnings("ignore")

In [141]:
df = pd.read_csv('Copy of MHTCET_Colleges_Finall_sheet(2).csv')

In [142]:
df.head()

Unnamed: 0,College_Code,College_Name,Branch_Code,Branch_Name,Cutoff_Merit_No,Cutoff_Percentile,Location
0,1002,"Government College of Engineering, Amravati",100219110,Civil Engineering,39713,88.248475,Amravati
1,1002,"Government College of Engineering, Amravati",100219110,Civil Engineering,41066,87.915931,Amravati
2,1002,"Government College of Engineering, Amravati",100219110,Civil Engineering,49772,85.113912,Amravati
3,1002,"Government College of Engineering, Amravati",100219110,Civil Engineering,44576,86.682749,Amravati
4,1002,"Government College of Engineering, Amravati",100219110,Civil Engineering,70376,78.434257,Amravati


In [143]:

FILE_PATH = "Copy of MHTCET_Colleges_Finall_sheet(2).csv"
TOP_N_RECOMMENDATIONS = 10
ASPIRATIONAL_RANGE = 15.0 
df = pd.read_csv('Copy of MHTCET_Colleges_Finall_sheet(2).csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df = df.drop(columns=['college_code', 'branch_code', 'cutoff_merit_no'], errors='ignore')
df.dropna(subset=['college_name', 'branch_name', 'cutoff_percentile'], inplace=True)
print(f"Data remaining after dropping NaNs: {len(df)} rows.")


Data remaining after dropping NaNs: 117736 rows.


In [144]:
print("\n--- 2. Creating College-Branch Utility Score and Min/Max Cutoffs ---")
utility_df = df.groupby(['college_name', 'branch_name']).agg(
    median_cutoff_percentile=('cutoff_percentile', 'median'),
    min_cutoff_percentile=('cutoff_percentile', 'min'),
    max_cutoff_percentile=('cutoff_percentile', 'max'),
    num_entries=('cutoff_percentile', 'count')
).reset_index()

utility_df.rename(columns={
    'median_cutoff_percentile': 'Utility_Score',
    'min_cutoff_percentile': 'Min_Cutoff',
    'max_cutoff_percentile': 'Max_Cutoff'
}, inplace=True)

utility_df = utility_df.sort_values(by='Utility_Score', ascending=False)
print(f"Utility Table created with {len(utility_df)} unique College-Branch combinations.")
print("\nTop 5 Combinations by Utility Score (Median Cutoff):")
print(utility_df[['college_name', 'branch_name', 'Utility_Score', 'Min_Cutoff']].head())



--- 2. Creating College-Branch Utility Score and Min/Max Cutoffs ---
Utility Table created with 2143 unique College-Branch combinations.

Top 5 Combinations by Utility Score (Median Cutoff):
                                           college_name  \
250                       COEP Technological University   
1710  Shri Vile Parle Kelvani Mandal's Dwarkadas J. ...   
1712  Shri Vile Parle Kelvani Mandal's Dwarkadas J. ...   
1715  Shri Vile Parle Kelvani Mandal's Dwarkadas J. ...   
1716  Shri Vile Parle Kelvani Mandal's Dwarkadas J. ...   

                                        branch_name  Utility_Score  Min_Cutoff  
250    Artificial Intelligence and Machine Learning      99.373041   93.667832  
1710  Artificial Intelligence (AI) and Data Science      98.828671   94.294461  
1712                           Computer Engineering      98.828671   94.294461  
1715         Electronics and Telecommunication Engg      98.828671   94.294461  
1716                         Information Technol

In [145]:
def recommend_colleges(student_percentile, branch_preference=None, location_preference=None, top_n=TOP_N_RECOMMENDATIONS):
    print(f"\n--- Generating Recommendations for Student Percentile: {student_percentile:.2f} ---")

    candidates = utility_df.copy()

    college_location_map = df[['college_name', 'location']].drop_duplicates().set_index('college_name').to_dict().get('location', {})
    candidates.loc[:, 'Location'] = candidates['college_name'].map(college_location_map)

    if branch_preference:
        branch_filter = candidates['branch_name'].str.contains(branch_preference, case=False, na=False)
        candidates = candidates[branch_filter]
        print(f"Applying branch filter for: '{branch_preference}'. Candidates remaining: {len(candidates)}")

        if candidates.empty:
            print("Warning: No branches matched the preference. Resetting candidates for broader search.")
            candidates = utility_df.copy()
            candidates.loc[:, 'Location'] = candidates['college_name'].map(college_location_map) 

    if location_preference:
        location_filter = candidates['Location'].str.contains(location_preference, case=False, na=False)
        candidates = candidates[location_filter]
        print(f"Applying STRICT location filter for: '{location_preference}'. Candidates remaining: {len(candidates)}")
        
        if candidates.empty:
            print(f"Warning: No colleges found in '{location_preference}' for the selected branch (if applicable).")
        
    reachable_filter = candidates['Min_Cutoff'] <= student_percentile

    aspirational_limit_filter = candidates['Utility_Score'] <= (student_percentile + ASPIRATIONAL_RANGE)
    
    recommended_candidates = candidates[reachable_filter & aspirational_limit_filter].copy()

    print(f"Candidates after filtering by Min Cutoff and Aspirational Range: {len(recommended_candidates)}")


    recommended_candidates = recommended_candidates.sort_values(by='Utility_Score', ascending=False)
    final_recommendations = recommended_candidates.head(top_n)

    if final_recommendations.empty:
        print("\nNo direct matches found within reachability and aspirational limits. Falling back to the closest options (maintaining location/branch filters).")

        fallback_candidates = candidates.copy()
        
   
        fallback_candidates.loc[:, 'Score_Difference'] = np.abs(fallback_candidates['Utility_Score'] - student_percentile)
        
 
        final_recommendations = fallback_candidates.sort_values(by='Score_Difference').head(top_n)

  
    output_cols = ['college_name', 'branch_name', 'Utility_Score']
    

    if 'Location' in final_recommendations.columns and (location_preference or final_recommendations['Location'].notna().any()):
        output_cols.append('Location')

    final_recommendations = final_recommendations.drop(
        columns=['Score_Difference'], errors='ignore'
    )
  
    return final_recommendations[output_cols].reset_index(drop=True)


print("\n--- 4. Evaluating Recommendation System Accuracy (Hit Rate) ---")


--- 4. Evaluating Recommendation System Accuracy (Hit Rate) ---


In [146]:
TEST_SAMPLE_SIZE = 100
np.random.seed(42) # for reproducibility
test_students = df.sample(TEST_SAMPLE_SIZE)

correct_recommendations = 0
total_tests = 0

for index, student_data in test_students.iterrows():
    student_percentile = student_data['cutoff_percentile']
    actual_college = student_data['college_name']
    actual_branch = student_data['branch_name']
    
    recommendations = recommend_colleges(
        student_percentile=student_percentile, 
        branch_preference=actual_branch, 
        top_n=TOP_N_RECOMMENDATIONS
    )
    
    is_recommended = ((recommendations['college_name'] == actual_college) & 
                      (recommendations['branch_name'] == actual_branch)).any()
    
    if is_recommended:
        correct_recommendations += 1
    
    total_tests += 1

hit_rate = (correct_recommendations / total_tests) * 100

print(f"\n--- Accuracy Results ---")
print(f"Total Simulated Students Tested: {total_tests}")
print(f"Actual College/Branch Found in Top {TOP_N_RECOMMENDATIONS} Recommendations: {correct_recommendations}")
print(f"Recommendation Hit Rate (Accuracy): {hit_rate:.2f}%")
print(f"Interpretation: By using the historical MIN cutoff as the primary 'reachable' filter, the hit rate should now be consistently above 50% as the secured percentile is almost always >= the minimum recorded cutoff.")



--- Generating Recommendations for Student Percentile: 74.06 ---
Applying branch filter for: 'Artificial Intelligence and Machine Learning'. Candidates remaining: 101
Candidates after filtering by Min Cutoff and Aspirational Range: 89

--- Generating Recommendations for Student Percentile: 90.88 ---
Applying branch filter for: 'Computer Engineering'. Candidates remaining: 270
Candidates after filtering by Min Cutoff and Aspirational Range: 268

--- Generating Recommendations for Student Percentile: 75.24 ---
Applying branch filter for: 'Civil Engineering'. Candidates remaining: 255
Candidates after filtering by Min Cutoff and Aspirational Range: 239

--- Generating Recommendations for Student Percentile: 95.21 ---
Applying branch filter for: 'Computer Engineering'. Candidates remaining: 270
Candidates after filtering by Min Cutoff and Aspirational Range: 270

--- Generating Recommendations for Student Percentile: 78.43 ---
Applying branch filter for: 'Computer Science and Engineering'

In [152]:
print("\n--- 5. Demonstration of Live Recommendation (Location Filtering is now STRICT) ---")

# Example 1: A high-achieving student (95th percentile) looking for Computer Science in Pune
# EXPECTATION: Only Pune colleges should be shown.
recommendation_95_pune = recommend_colleges(
    student_percentile=56.0, 
    branch_preference='Electonics',
    location_preference='Jalgaon'
)
print(f"\nRecommendations for 95.0 Percentile (Branch: Computer Science, Preference: Pune - STRICT FILTER):")
print(recommendation_95_pune.to_markdown(index=False))

# # Example 2: A mid-range student (70th percentile) looking for Electronics branch anywhere (no location filter)
# recommendation_70 = recommend_colleges(
#     student_percentile=70.0,
#     branch_preference='Electronics'
# )
# print(f"\nRecommendations for 70.0 Percentile (Branch: Electronics, No Location Preference):")
# print(recommendation_70[['college_name', 'branch_name', 'Utility_Score', 'Location']].to_markdown(index=False))

# # Example 3: Low-range student (35th percentile) looking for Civil branch in Amravati
# recommendation_35_amravati = recommend_colleges(
#     student_percentile=35.0,
#     branch_preference='Civil',
#     location_preference='Amravati'
# )
# print(f"\nRecommendations for 35.0 Percentile (Branch: Civil, Preference: Amravati - STRICT FILTER):")
# print(recommendation_35_amravati.to_markdown(index=False))



--- 5. Demonstration of Live Recommendation (Location Filtering is now STRICT) ---

--- Generating Recommendations for Student Percentile: 56.00 ---
Applying branch filter for: 'Electonics'. Candidates remaining: 0
Applying STRICT location filter for: 'Jalgaon'. Candidates remaining: 48
Candidates after filtering by Min Cutoff and Aspirational Range: 42

Recommendations for 95.0 Percentile (Branch: Computer Science, Preference: Pune - STRICT FILTER):
| college_name                                                                        | branch_name                                                                    |   Utility_Score | Location   |
|:------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------|----------------:|:-----------|
| G H Raisoni College of  Engineering and Management, Jalgaon                         | Computer Science and Engineering (Artificial Intelligenc