In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
import joblib

class CollegePredictionSystem:
    """Train and query a gradient-boosting classifier that ranks colleges.

    Typical flow: ``load_data()`` -> ``train_model()`` -> ``predict_colleges()``.
    ``train_model()`` reshapes ``self.data`` into one row per
    (college, branch, location, category) score and label-encodes the
    categorical columns in place, so ``self.data`` holds integer codes
    afterwards.
    """

    # Category label -> name of the CSV column that holds that category's score.
    _SCORE_COLUMNS = {
        'OPEN': 'OPEN_Score',
        'OBC': 'OBC_Score',
        'SC': 'SC_Score',
        'ST': 'ST_Score',
        'SBC': 'SBC_Score',
        'DT/VJ': 'DT/VJ_Score',
    }

    def __init__(self):
        # Working DataFrame; None until load_data() succeeds.
        self.data = None
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        # Column name -> fitted LabelEncoder, filled by preprocess_data().
        self.encoders = {}

    def load_data(self, csv_file) -> bool:
        """Read the CSV into ``self.data`` and validate its columns.

        Args:
            csv_file: Path (or file-like object) accepted by ``pd.read_csv``.

        Returns:
            True when the file was read and contains every required column;
            False otherwise (a message is printed and ``self.data`` is left
            unchanged).
        """
        try:
            frame = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column so that a missing column is
            # reported by name instead of surfacing as a bare KeyError.
            required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
            missing_columns = [col for col in required_columns if col not in frame.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            # Clean College_Name column
            frame['College_Name'] = frame['College_Name'].str.strip()

            # Only publish the frame once it is known to be usable.
            self.data = frame
            print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
            return True
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

    def preprocess_data(self) -> None:
        """Label-encode the categorical columns of ``self.data`` in place.

        A fresh ``LabelEncoder`` is fitted per column (values are cast to
        ``str`` first) and stored in ``self.encoders`` under the column name.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self) -> None:
        """Reshape ``self.data`` from wide per-category scores to long format.

        Each ``<CATEGORY>_Score`` column present in the data becomes a set of
        rows with columns ``College_Name``, ``Branch_Name``, ``Location``,
        ``MHT_CET_Score`` and ``Category``; rows with a missing score are
        dropped. Score columns that do not exist in the data are skipped.

        Raises:
            ValueError: If no data has been loaded, or if none of the known
                score columns is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        dfs = []
        for category, col in self._SCORE_COLUMNS.items():
            # Only OPEN_Score is guaranteed by load_data(); tolerate the rest
            # being absent rather than failing with a KeyError.
            if col not in self.data.columns:
                continue
            temp_df = self.data[['College_Name', 'Branch_Name', 'Location', col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError(
                f"❌ None of the score columns {list(self._SCORE_COLUMNS.values())} "
                "are present in the data."
            )

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self) -> None:
        """Reshape and encode the data, fit the model, and persist artefacts.

        Prints the hold-out accuracy (20% test split), then writes the model
        to ``college_predictor_model_v2.pkl`` and each encoder to
        ``<column>_encoder.pkl`` in the current working directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        accuracy = self.model.score(X_test, y_test)

        print(f"✅ Model trained successfully with accuracy: {accuracy:.2f}")

        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

    def predict_colleges(self, category, score, branch, location):
        """Rank colleges offering ``branch`` for the given applicant.

        Args:
            category: Category label as seen during training (e.g. ``'OPEN'``).
            score: Applicant's score, used as the ``MHT_CET_Score`` feature.
            branch: Branch name as it appears in the training data.
            location: Location name, or ``"All"`` to skip the location filter
                (the model then receives ``-1`` as the location feature).

        Returns:
            Up to 15 ``(college_name, chance)`` tuples sorted by descending
            chance, where chance is the model probability renormalised over
            the matching colleges and rescaled into the 80-99 band. An empty
            list is returned when nothing matches or when any error occurs
            (the error is printed).
        """
        try:
            any_location = location == "All"
            encoded_input = {
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                'Location': -1 if any_location else self.encoders['Location'].transform([location])[0],
            }

            input_df = pd.DataFrame([encoded_input])
            probabilities = self.model.predict_proba(input_df)[0]

            # predict_proba columns follow model.classes_, which only holds the
            # colleges present in the training split; indexing by position
            # (0..n-1) would mislabel probabilities whenever a class is absent.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Decode only the distinct (college, branch, location) combinations
            # needed for the merge instead of the whole training frame.
            unique_colleges = (
                self.data[['College_Name', 'Branch_Name', 'Location']]
                .drop_duplicates()
                .copy()
            )
            for col in ('College_Name', 'Branch_Name', 'Location'):
                unique_colleges[col] = self.encoders[col].inverse_transform(unique_colleges[col])

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            selection = full_results['Branch_Name'] == branch
            if not any_location:
                selection &= full_results['Location'] == location
            # Copy so the assignments below modify an independent frame.
            filtered = full_results[selection].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            # Renormalise over the matching colleges; skip when the total is
            # zero to avoid producing NaN chances.
            total = filtered['Probability'].sum()
            if total > 0:
                filtered['Probability'] = filtered['Probability'] / total
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            # Map the normalised share [0, 1] onto the displayed 80-99 range.
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

# Example usage
if __name__ == "__main__":
    # Demo run: load the summary CSV, train the model, then print the ranked
    # colleges for one sample applicant.
    predictor = CollegePredictionSystem()
    data_ready = predictor.load_data('College_Category_Score_Summary_Filled.csv')
    if data_ready:
        predictor.train_model()
        predictions = predictor.predict_colleges(
            'OPEN', 85.5, 'Computer Engineering', 'Pune'
        )
        for idx, entry in enumerate(predictions, start=1):
            college, prob = entry
            print(f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%")


✅ Successfully loaded 1420 records from College_Category_Score_Summary_Filled.csv
✅ Data preprocessing completed!
✅ Model trained successfully with accuracy: 0.26
✅ Model saved as 'college_predictor_model_v2.pkl'
✅ Predictions generated successfully!
1. 🏫 College: K. J.'s Educational Institut Trinity College of Engineering and Research, Pisoli, Haveli, Chance: 81.35%
2. 🏫 College: Zeal Education Society's Zeal College of Engineering & Reserch, Narhe, Pune, Chance: 81.23%
3. 🏫 College: Jayawant Shikshan Prasarak Mandal, Bhivarabai Sawant Institute of Technology & Research, Wagholi, Chance: 81.08%
4. 🏫 College: Government College of Engineering & Research, Avasari Khurd, Chance: 80.81%
5. 🏫 College: Sinhgad Academy of Engineering, Kondhwa (BK) Kondhwa-Saswad Road, Pune, Chance: 80.78%
6. 🏫 College: KJEI's Trinity Academy of Engineering, Yewalewadi, Pune, Chance: 80.75%
7. 🏫 College: Sinhgad Technical Education Society, Sinhgad Institute of Technology and Science, Narhe, Chance: 80.72%
8.