In [1]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

class CollegePredictionSystem:
    """Recommend colleges for an MHT-CET score with a gradient-boosting model.

    Intended workflow: ``load_data`` -> ``train_model`` -> ``predict_colleges``.
    ``train_model`` reshapes ``self.data`` into long format (one row per
    college/branch/category score) and label-encodes it in place.
    """

    # Admission category -> CSV column holding that category's score.
    SCORE_COLUMNS = {
        'OPEN': 'OPEN_Score',
        'OBC': 'OBC_Score',
        'SC': 'SC_Score',
        'ST': 'ST_Score',
        'SBC': 'SBC_Score',
        'DT/VJ': 'DT/VJ_Score'
    }

    def __init__(self):
        self.data = None  # DataFrame set by load_data(), rewritten by train_model()
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        self.encoders = {}  # column name -> fitted LabelEncoder

    def load_data(self, csv_file):
        """Read the CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True on success. False if the file cannot be read or a required
            column is missing; the reason is printed and ``self.data`` is
            left untouched.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column so a missing column is
            # reported as such instead of as a KeyError "loading" failure.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            # Clean College_Name column
            data['College_Name'] = data['College_Name'].str.strip()
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

        # Only publish the frame once it is known to be usable.
        self.data = data
        print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
        return True

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.data`` in place.

        One ``LabelEncoder`` per column is stored in ``self.encoders``.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from one score column per category to long format.

        The result has the columns College_Name, Branch_Name, Location,
        MHT_CET_Score and Category; rows without a score are dropped.
        Score columns absent from the data are skipped.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        id_columns = ['College_Name', 'Branch_Name', 'Location']
        dfs = []
        for category, col in self.SCORE_COLUMNS.items():
            # load_data() only guarantees OPEN_Score, so tolerate the others
            # being absent instead of failing with a KeyError.
            if col not in self.data.columns:
                continue
            temp_df = self.data[id_columns + [col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self):
        """Fit the classifier, print evaluation metrics and save the artifacts.

        Reshapes and encodes ``self.data``, trains on an 80/20 split, prints
        accuracy and the classification report, shows the confusion matrix and
        writes the model and every encoder to ``.pkl`` files in the working
        directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)

        y_pred = self.model.predict(X_test)

        # 🔹 Evaluation Metrics
        accuracy = accuracy_score(y_test, y_pred)
        print(f"✅ Model trained successfully with accuracy: {accuracy:.2f}")

        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        print("\n🧾 Confusion Matrix:")
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=False, cmap='Blues', fmt='d')
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # 🔸 Save model & encoders
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

    def predict_colleges(self, category, score, branch, location):
        """Rank the colleges offering ``branch`` for one candidate.

        Args:
            category: Admission category label seen in training (e.g. 'OPEN').
            score: The candidate's MHT-CET score.
            branch: Branch name exactly as it appears in the training data.
            location: Location name, or "All" to consider every location.

        Returns:
            Up to 15 ``(college_name, chance)`` tuples sorted by descending
            chance. An empty list if nothing matches or an error occurs; the
            error is printed, not raised.
        """
        try:
            location_encoder = self.encoders['Location']
            if location == "All":
                # A placeholder code such as -1 was never seen in training,
                # so score every known location and average the result.
                location_codes = np.arange(len(location_encoder.classes_))
            else:
                location_codes = location_encoder.transform([location])

            # Column order must match the feature order used in train_model().
            input_df = pd.DataFrame({
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                'Location': location_codes,
            })
            probabilities = self.model.predict_proba(input_df).mean(axis=0)

            # predict_proba columns follow model.classes_, which can be a
            # subset of the encoder's labels when a college never reached the
            # training split; decode from classes_ to keep names aligned.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Distinct (college, branch, location) offerings, decoded to text.
            key_columns = ['College_Name', 'Branch_Name', 'Location']
            encoded_offerings = self.data[key_columns].drop_duplicates()
            unique_colleges = pd.DataFrame({
                col: self.encoders[col].inverse_transform(encoded_offerings[col])
                for col in key_columns
            })

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            matches = full_results['Branch_Name'] == branch
            if location != "All":
                matches &= full_results['Location'] == location
            # Copy so the assignments below do not write into a slice view.
            filtered = full_results[matches].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            total = filtered['Probability'].sum()
            if total > 0:
                filtered['Probability'] = filtered['Probability'] / total
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            # NOTE: display heuristic that maps the relative probability onto
            # an 80-99 scale; it is not a calibrated admission chance.
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

# Example usage
if __name__ == "__main__":
    # Demo run: load the summary CSV, train, then list suggestions for one
    # sample candidate. Nothing is trained if the CSV cannot be loaded.
    predictor = CollegePredictionSystem()
    data_loaded = predictor.load_data('College_Category_Score_Summary.csv')
    if data_loaded:
        predictor.train_model()
        predictions = predictor.predict_colleges(
            'OPEN', 85.5, 'Computer Engineering', 'Pune'
        )
        for idx, (college, prob) in enumerate(predictions, start=1):
            line = f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%"
            print(line)


❌ Error loading CSV file: [Errno 2] No such file or directory: 'College_Category_Score_Summary.csv'


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

class CollegePredictionSystem:
    """Recommend colleges for an MHT-CET score with a gradient-boosting model.

    Intended workflow: ``load_data`` -> ``train_model`` -> ``predict_colleges``.
    ``train_model`` reshapes ``self.data`` into long format (one row per
    college/branch/category score) and label-encodes it in place.
    """

    # Admission category -> CSV column holding that category's score.
    SCORE_COLUMNS = {
        'OPEN': 'OPEN_Score',
        'OBC': 'OBC_Score',
        'SC': 'SC_Score',
        'ST': 'ST_Score',
        'SBC': 'SBC_Score',
        'DT/VJ': 'DT/VJ_Score'
    }

    def __init__(self):
        self.data = None  # DataFrame set by load_data(), rewritten by train_model()
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        self.encoders = {}  # column name -> fitted LabelEncoder

    def load_data(self, csv_file):
        """Read the CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True on success. False if the file cannot be read or a required
            column is missing; the reason is printed and ``self.data`` is
            left untouched.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column so a missing column is
            # reported as such instead of as a KeyError "loading" failure.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            data['College_Name'] = data['College_Name'].str.strip()
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

        # Only publish the frame once it is known to be usable.
        self.data = data
        print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
        return True

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.data`` in place.

        One ``LabelEncoder`` per column is stored in ``self.encoders``.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from one score column per category to long format.

        The result has the columns College_Name, Branch_Name, Location,
        MHT_CET_Score and Category; rows without a score are dropped.
        Score columns absent from the data are skipped.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        id_columns = ['College_Name', 'Branch_Name', 'Location']
        dfs = []
        for category, col in self.SCORE_COLUMNS.items():
            # load_data() only guarantees OPEN_Score, so tolerate the others
            # being absent instead of failing with a KeyError.
            if col not in self.data.columns:
                continue
            temp_df = self.data[id_columns + [col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self):
        """Fit the classifier, print evaluation metrics and save the artifacts.

        Reshapes and encodes ``self.data``, trains on an 80/20 split, prints
        accuracy, the classification report and per-college accuracy tables,
        shows a confusion matrix for the 10 most frequent test classes and
        writes the model and every encoder to ``.pkl`` files in the working
        directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        # Evaluation
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")

        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        # Confusion matrix over every label seen in the test set or predicted.
        unique_labels = np.unique(np.concatenate((y_test, y_pred)))
        cm = confusion_matrix(y_test, y_pred, labels=unique_labels)

        # Top 10 most frequent classes in y_test
        test_labels, counts = np.unique(y_test, return_counts=True)
        top_10_classes = test_labels[np.argsort(counts)][-10:]

        # unique_labels is sorted and contains every test label, so
        # searchsorted yields the exact row/column of each top class.
        top_indices = np.searchsorted(unique_labels, top_10_classes)
        filtered_cm = cm[np.ix_(top_indices, top_indices)]

        plt.figure(figsize=(10, 8))
        sns.heatmap(filtered_cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=top_10_classes, yticklabels=top_10_classes)
        plt.title("Top 10 College Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # Per-class accuracy report. Labels that were only ever predicted have
        # no true samples (row sum 0); skip them so they neither divide by
        # zero nor show up as NaN rows in the "Bottom 10" table.
        support = cm.sum(axis=1)
        has_support = support > 0
        per_class_accuracy = cm.diagonal()[has_support] / support[has_support]
        college_names = self.encoders['College_Name'].inverse_transform(unique_labels[has_support])
        accuracy_df = pd.DataFrame({
            'College_Name': college_names,
            'Accuracy (%)': (per_class_accuracy * 100).round(2)
        }).sort_values(by='Accuracy (%)', ascending=False)

        print("\n📈 Top 10 Classes by Accuracy:")
        print(accuracy_df.head(10).to_string(index=False))

        print("\n📉 Bottom 10 Classes by Accuracy:")
        print(accuracy_df.tail(10).to_string(index=False))

        # Save model and encoders
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

    def predict_colleges(self, category, score, branch, location):
        """Rank the colleges offering ``branch`` for one candidate.

        Args:
            category: Admission category label seen in training (e.g. 'OPEN').
            score: The candidate's MHT-CET score.
            branch: Branch name exactly as it appears in the training data.
            location: Location name, or "All" to consider every location.

        Returns:
            Up to 15 ``(college_name, chance)`` tuples sorted by descending
            chance. An empty list if nothing matches or an error occurs; the
            error is printed, not raised.
        """
        try:
            location_encoder = self.encoders['Location']
            if location == "All":
                # A placeholder code such as -1 was never seen in training,
                # so score every known location and average the result.
                location_codes = np.arange(len(location_encoder.classes_))
            else:
                location_codes = location_encoder.transform([location])

            # Column order must match the feature order used in train_model().
            input_df = pd.DataFrame({
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                'Location': location_codes,
            })
            probabilities = self.model.predict_proba(input_df).mean(axis=0)

            # predict_proba columns follow model.classes_, which can be a
            # subset of the encoder's labels when a college never reached the
            # training split; decode from classes_ to keep names aligned.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Distinct (college, branch, location) offerings, decoded to text.
            key_columns = ['College_Name', 'Branch_Name', 'Location']
            encoded_offerings = self.data[key_columns].drop_duplicates()
            unique_colleges = pd.DataFrame({
                col: self.encoders[col].inverse_transform(encoded_offerings[col])
                for col in key_columns
            })

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            matches = full_results['Branch_Name'] == branch
            if location != "All":
                matches &= full_results['Location'] == location
            # Copy so the assignments below do not write into a slice view.
            filtered = full_results[matches].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            total = filtered['Probability'].sum()
            if total > 0:
                filtered['Probability'] = filtered['Probability'] / total
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            # NOTE: display heuristic that maps the relative probability onto
            # an 80-99 scale; it is not a calibrated admission chance.
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

# Example usage
if __name__ == "__main__":
    # Demo run: load the summary CSV, train, then list suggestions for one
    # sample candidate. Nothing is trained if the CSV cannot be loaded.
    predictor = CollegePredictionSystem()
    data_loaded = predictor.load_data('College_Category_Score_Summary.csv')
    if data_loaded:
        predictor.train_model()
        predictions = predictor.predict_colleges(
            'OPEN', 85.5, 'Computer Engineering', 'Pune'
        )
        for idx, (college, prob) in enumerate(predictions, start=1):
            line = f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%"
            print(line)




❌ Error loading CSV file: [Errno 2] No such file or directory: 'College_Category_Score_Summary.csv'


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    top_k_accuracy_score, log_loss, roc_auc_score, roc_curve, auc
)
import joblib
from collections import Counter

class CollegePredictionSystem:
    """Recommend colleges for an MHT-CET score with a gradient-boosting model.

    Intended workflow: ``load_data`` -> ``train_model`` -> ``predict_colleges``.
    ``train_model`` reshapes ``self.data`` into long format (one row per
    college/branch/category score) and label-encodes it in place.
    """

    # Admission category -> CSV column holding that category's score.
    SCORE_COLUMNS = {
        'OPEN': 'OPEN_Score',
        'OBC': 'OBC_Score',
        'SC': 'SC_Score',
        'ST': 'ST_Score',
        'SBC': 'SBC_Score',
        'DT/VJ': 'DT/VJ_Score'
    }

    def __init__(self):
        self.data = None  # DataFrame set by load_data(), rewritten by train_model()
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        self.encoders = {}  # column name -> fitted LabelEncoder

    def load_data(self, csv_file):
        """Read the CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True on success. False if the file cannot be read or a required
            column is missing; the reason is printed and ``self.data`` is
            left untouched.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column so a missing column is
            # reported as such instead of as a KeyError "loading" failure.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            data['College_Name'] = data['College_Name'].str.strip()
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

        # Only publish the frame once it is known to be usable.
        self.data = data
        print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
        return True

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.data`` in place.

        One ``LabelEncoder`` per column is stored in ``self.encoders``.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from one score column per category to long format.

        The result has the columns College_Name, Branch_Name, Location,
        MHT_CET_Score and Category; rows without a score are dropped.
        Score columns absent from the data are skipped.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        id_columns = ['College_Name', 'Branch_Name', 'Location']
        dfs = []
        for category, col in self.SCORE_COLUMNS.items():
            # load_data() only guarantees OPEN_Score, so tolerate the others
            # being absent instead of failing with a KeyError.
            if col not in self.data.columns:
                continue
            temp_df = self.data[id_columns + [col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self):
        """Fit the classifier, print evaluation metrics and save the artifacts.

        Reshapes and encodes ``self.data``, trains on an 80/20 split and
        reports accuracy, the classification report, top-k accuracy, log loss
        and macro one-vs-rest ROC-AUC. Shows a confusion matrix and ROC
        curves for the 10 most frequent test classes, then writes the model
        and every encoder to ``.pkl`` files in the working directory.

        Raises:
            ValueError: If the binarized labels and the probability matrix
                disagree on the number of classes.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        y_proba = self.model.predict_proba(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")

        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        # y_proba columns follow model.classes_. Test rows whose college never
        # reached the training split have no probability column, so the
        # probability-based metrics are computed on the remaining rows and the
        # metric functions are told the full class list explicitly.
        all_classes = self.model.classes_
        known = np.isin(y_test, all_classes)
        y_eval = y_test[known]
        proba_eval = y_proba[known]
        skipped = int((~known).sum())
        if skipped:
            print(f"\nNote: {skipped} test rows have a college unseen in training; "
                  "probability metrics skip them.")

        print("\n🎯 Top-K Accuracy:")
        print("Top-1 Accuracy:", top_k_accuracy_score(y_eval, proba_eval, k=1, labels=all_classes))
        print("Top-5 Accuracy:", top_k_accuracy_score(y_eval, proba_eval, k=5, labels=all_classes))

        print("\n📉 Log Loss:")
        print("Log Loss:", log_loss(y_eval, proba_eval, labels=all_classes))

        y_test_bin = label_binarize(y_eval, classes=all_classes)
        if y_test_bin.shape[1] != proba_eval.shape[1]:
            raise ValueError("Mismatch in number of classes between y_true and y_score. Check label encoding consistency.")

        # AUC is undefined for a class without both positive and negative
        # test rows, so the macro average only covers the scorable classes.
        positives = y_test_bin.sum(axis=0)
        scorable = np.flatnonzero((positives > 0) & (positives < len(y_eval)))
        class_aucs = [roc_auc_score(y_test_bin[:, i], proba_eval[:, i]) for i in scorable]
        macro_auc = float(np.mean(class_aucs)) if class_aucs else float('nan')

        print("\n📈 ROC-AUC (OvR):")
        print("Macro AUC:", macro_auc)

        unique_labels = np.unique(np.concatenate((y_test, y_pred)))
        cm = confusion_matrix(y_test, y_pred, labels=unique_labels)
        test_labels, counts = np.unique(y_test, return_counts=True)
        top_10_classes = test_labels[np.argsort(counts)][-10:]
        # unique_labels is sorted and contains every test label, so
        # searchsorted yields the exact row/column of each top class.
        top_indices = np.searchsorted(unique_labels, top_10_classes)
        filtered_cm = cm[np.ix_(top_indices, top_indices)]

        plt.figure(figsize=(10, 8))
        sns.heatmap(filtered_cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=top_10_classes, yticklabels=top_10_classes)
        plt.title("Top 10 College Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        column_of = {label: idx for idx, label in enumerate(all_classes)}
        scorable_columns = set(scorable.tolist())
        plt.figure(figsize=(10, 7))
        for class_id in top_10_classes:
            class_idx = column_of.get(class_id)
            if class_idx is None or class_idx not in scorable_columns:
                continue
            fpr, tpr, _ = roc_curve(y_test_bin[:, class_idx], proba_eval[:, class_idx])
            roc_auc_i = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f"Class {class_id} (AUC={roc_auc_i:.2f})")

        plt.plot([0, 1], [0, 1], "k--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curves for Top 10 Classes")
        plt.legend(loc="lower right")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

    def predict_colleges(self, category, score, branch, location):
        """Rank the colleges offering ``branch`` for one candidate.

        Args:
            category: Admission category label seen in training (e.g. 'OPEN').
            score: The candidate's MHT-CET score.
            branch: Branch name exactly as it appears in the training data.
            location: Location name, or "All" to consider every location.

        Returns:
            Up to 15 ``(college_name, chance)`` tuples sorted by descending
            chance. An empty list if nothing matches or an error occurs; the
            error is printed, not raised.
        """
        try:
            location_encoder = self.encoders['Location']
            if location == "All":
                # A placeholder code such as -1 was never seen in training,
                # so score every known location and average the result.
                location_codes = np.arange(len(location_encoder.classes_))
            else:
                location_codes = location_encoder.transform([location])

            # Column order must match the feature order used in train_model().
            input_df = pd.DataFrame({
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                'Location': location_codes,
            })
            probabilities = self.model.predict_proba(input_df).mean(axis=0)

            # predict_proba columns follow model.classes_, which can be a
            # subset of the encoder's labels when a college never reached the
            # training split; decode from classes_ to keep names aligned.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Distinct (college, branch, location) offerings, decoded to text.
            key_columns = ['College_Name', 'Branch_Name', 'Location']
            encoded_offerings = self.data[key_columns].drop_duplicates()
            unique_colleges = pd.DataFrame({
                col: self.encoders[col].inverse_transform(encoded_offerings[col])
                for col in key_columns
            })

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            matches = full_results['Branch_Name'] == branch
            if location != "All":
                matches &= full_results['Location'] == location
            # Copy so the assignments below do not write into a slice view.
            filtered = full_results[matches].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            total = filtered['Probability'].sum()
            if total > 0:
                filtered['Probability'] = filtered['Probability'] / total
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            # NOTE: display heuristic that maps the relative probability onto
            # an 80-99 scale; it is not a calibrated admission chance.
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

if __name__ == "__main__":
    # Demo run: load the summary CSV, train, then list suggestions for one
    # sample candidate. Nothing is trained if the CSV cannot be loaded.
    predictor = CollegePredictionSystem()
    data_loaded = predictor.load_data('College_Category_Score_Summary.csv')
    if data_loaded:
        predictor.train_model()
        predictions = predictor.predict_colleges(
            'OPEN', 85.5, 'Computer Engineering', 'Pune'
        )
        for idx, (college, prob) in enumerate(predictions, start=1):
            line = f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%"
            print(line)

❌ Error loading CSV file: [Errno 2] No such file or directory: 'College_Category_Score_Summary.csv'


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    top_k_accuracy_score, roc_auc_score, roc_curve, auc, log_loss
)
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from collections import Counter

class CollegePredictionSystem:
    """Train and evaluate a gradient-boosting college classifier.

    Intended workflow: ``load_data`` -> ``train_model``. ``train_model``
    reshapes ``self.data`` into long format (one row per
    college/branch/category score) and label-encodes it in place.
    """

    # Admission category -> CSV column holding that category's score.
    SCORE_COLUMNS = {
        'OPEN': 'OPEN_Score',
        'OBC': 'OBC_Score',
        'SC': 'SC_Score',
        'ST': 'ST_Score',
        'SBC': 'SBC_Score',
        'DT/VJ': 'DT/VJ_Score'
    }

    def __init__(self):
        self.data = None  # DataFrame set by load_data(), rewritten by train_model()
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        self.encoders = {}  # column name -> fitted LabelEncoder

    def load_data(self, csv_file):
        """Read the CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True on success. False if the file cannot be read or a required
            column is missing; the reason is printed and ``self.data`` is
            left untouched.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column so a missing column is
            # reported as such instead of as a KeyError "loading" failure.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            data['College_Name'] = data['College_Name'].str.strip()
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

        # Only publish the frame once it is known to be usable.
        self.data = data
        print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
        return True

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.data`` in place.

        One ``LabelEncoder`` per column is stored in ``self.encoders``.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from one score column per category to long format.

        The result has the columns College_Name, Branch_Name, Location,
        MHT_CET_Score and Category; rows without a score are dropped.
        Score columns absent from the data are skipped.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        id_columns = ['College_Name', 'Branch_Name', 'Location']
        dfs = []
        for category, col in self.SCORE_COLUMNS.items():
            # load_data() only guarantees OPEN_Score, so tolerate the others
            # being absent instead of failing with a KeyError.
            if col not in self.data.columns:
                continue
            temp_df = self.data[id_columns + [col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self):
        """Fit the classifier, print evaluation metrics and save the artifacts.

        Reshapes and encodes ``self.data``, trains on an 80/20 split and
        reports accuracy, the classification report, top-k accuracy, log loss
        and macro one-vs-rest ROC-AUC. Shows a confusion matrix and ROC
        curves for the 10 most frequent test classes, then writes the model
        and every encoder to ``.pkl`` files in the working directory.

        Raises:
            ValueError: If the binarized labels and the probability matrix
                disagree on the number of classes.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        y_proba = self.model.predict_proba(X_test)

        # Evaluation
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")

        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred, digits=3))

        # y_proba columns follow model.classes_. Test rows whose college never
        # reached the training split have no probability column, so the
        # probability-based metrics are computed on the remaining rows and the
        # metric functions are told the full class list explicitly.
        model_classes = self.model.classes_
        known = np.isin(y_test, model_classes)
        y_eval = y_test[known]
        proba_eval = y_proba[known]
        skipped = int((~known).sum())
        if skipped:
            print(f"\nNote: {skipped} test rows have a college unseen in training; "
                  "probability metrics skip them.")

        print("\n=== Top-K Accuracy ===")
        print("Top-1 Accuracy:", top_k_accuracy_score(y_eval, proba_eval, k=1, labels=model_classes))
        print("Top-5 Accuracy:", top_k_accuracy_score(y_eval, proba_eval, k=5, labels=model_classes))

        print("\n=== Log Loss ===")
        print("Log Loss:", log_loss(y_eval, proba_eval, labels=model_classes))

        # Confusion matrix over every label seen in the test set or predicted.
        unique_labels = np.unique(np.concatenate((y_test, y_pred)))
        cm = confusion_matrix(y_test, y_pred, labels=unique_labels)

        # Top 10 most frequent classes in y_test
        test_labels, counts = np.unique(y_test, return_counts=True)
        top_10_classes = test_labels[np.argsort(counts)][-10:]

        # unique_labels is sorted and contains every test label, so
        # searchsorted yields the exact row/column of each top class.
        top_indices = np.searchsorted(unique_labels, top_10_classes)
        filtered_cm = cm[np.ix_(top_indices, top_indices)]

        plt.figure(figsize=(10, 8))
        sns.heatmap(filtered_cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=top_10_classes, yticklabels=top_10_classes)
        plt.title("Top 10 College Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # ROC-AUC (One-vs-Rest). Binarize against model.classes_ so column i
        # of y_bin and column i of the probabilities describe the same class.
        y_bin = label_binarize(y_eval, classes=model_classes)
        if y_bin.shape[1] != proba_eval.shape[1]:
            raise ValueError("Mismatch in number of classes between y_true and y_score. Check label encoding consistency.")

        # AUC is undefined for a class without both positive and negative
        # test rows, so the macro average only covers the scorable classes.
        positives = y_bin.sum(axis=0)
        scorable = np.flatnonzero((positives > 0) & (positives < len(y_eval)))
        class_aucs = [roc_auc_score(y_bin[:, i], proba_eval[:, i]) for i in scorable]
        roc_auc = float(np.mean(class_aucs)) if class_aucs else float('nan')
        print("\n=== ROC-AUC (OvR): ===")
        print(f"Macro ROC-AUC (OvR): {roc_auc:.3f}")

        # Plot ROC curves for the 10 most frequent test classes.
        column_of = {label: idx for idx, label in enumerate(model_classes)}
        scorable_columns = set(scorable.tolist())
        plt.figure(figsize=(10, 7))
        for label in top_10_classes:
            idx = column_of.get(label)
            if idx is None or idx not in scorable_columns:
                continue
            fpr, tpr, _ = roc_curve(y_bin[:, idx], proba_eval[:, idx])
            auc_score = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f"Class {label} (AUC = {auc_score:.2f})")

        plt.plot([0, 1], [0, 1], "k--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curves for Top 10 Classes")
        plt.legend(loc="lower right")
        plt.grid()
        plt.show()

        # Save model and encoders
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

# Example usage
if __name__ == "__main__":
    # Demo run: train only when the summary CSV loads successfully.
    predictor = CollegePredictionSystem()
    data_loaded = predictor.load_data('College_Category_Score_Summary.csv')
    if data_loaded:
        predictor.train_model()


❌ Error loading CSV file: [Errno 2] No such file or directory: 'College_Category_Score_Summary.csv'


In [6]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Plot one-vs-rest ROC curves for the N most frequent classes in the test set.
#
# NOTE(review): this cell expects `model`, `y_test` and `y_proba` to exist as
# notebook globals. CollegePredictionSystem.train_model() keeps them as
# locals, so they must be produced separately before running this cell.

# Binarize the labels with consistent class order
y_test_bin = label_binarize(y_test, classes=model.classes_)

# Plot ROC for top N classes (by support)
N = 10
class_labels, class_counts = np.unique(y_test, return_counts=True)
top_classes = class_labels[np.argsort(class_counts)][-N:]  # N most common class labels

# y_proba columns follow model.classes_; a class label is not necessarily
# its own column index, so translate labels to columns explicitly.
column_of = {label: col for col, label in enumerate(model.classes_)}

plt.figure(figsize=(10, 7))
for class_label in top_classes:
    class_idx = column_of.get(class_label)
    if class_idx is None:
        # Class occurs in the test set but was never seen by the model.
        continue
    fpr, tpr, _ = roc_curve(y_test_bin[:, class_idx], y_proba[:, class_idx])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC={roc_auc:.2f})")

plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curves for Top {N} Classes")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()


NameError: name 'y_test' is not defined

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    top_k_accuracy_score, log_loss, roc_auc_score, roc_curve, auc
)
import joblib
from collections import Counter

class CollegePredictionSystem:
    """Rank colleges for a candidate with a gradient-boosting classifier.

    Workflow: ``load_data()`` reads the cutoff CSV, ``train_model()`` reshapes,
    encodes, fits, evaluates and saves the model, and ``predict_colleges()``
    returns the most likely colleges for one candidate profile.
    """

    def __init__(self):
        # Raw CSV rows after load_data(); replaced by the long-form,
        # label-encoded frame once train_model() has run.
        self.data = None
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        # Column name -> fitted LabelEncoder, filled in by preprocess_data().
        self.encoders = {}

    def load_data(self, csv_file):
        """Read the cutoff CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True when the file was read and contains every required column,
            False otherwise (the reason is printed). ``self.data`` is only
            assigned on success.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column, so a missing 'College_Name'
            # is reported as a missing column rather than as a KeyError.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            data['College_Name'] = data['College_Name'].str.strip()
            self.data = data

            print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
            return True
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.data`` in place.

        A fresh ``LabelEncoder`` is fitted per column and kept in
        ``self.encoders`` so predictions can encode/decode the same values.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from wide to long form, one row per category score.

        Every ``<CATEGORY>_Score`` column present in the data becomes rows with
        the columns ``College_Name``, ``Branch_Name``, ``Location``,
        ``MHT_CET_Score`` and ``Category``; rows without a score are dropped.
        ``self.data`` is replaced by the reshaped frame.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        score_columns = {
            'OPEN': 'OPEN_Score',
            'OBC': 'OBC_Score',
            'SC': 'SC_Score',
            'ST': 'ST_Score',
            'SBC': 'SBC_Score',
            'DT/VJ': 'DT/VJ_Score'
        }

        dfs = []
        for category, col in score_columns.items():
            # load_data() only requires OPEN_Score; the other category
            # columns are optional and are skipped when absent.
            if col not in self.data.columns:
                continue
            temp_df = self.data[['College_Name', 'Branch_Name', 'Location', col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self):
        """Fit the classifier, print evaluation metrics, plot ROC curves and save.

        Steps: reshape and encode ``self.data``, hold out 20% for testing, fit
        the model, print accuracy / classification report / top-k accuracy /
        log loss / macro ROC-AUC, plot one-vs-rest ROC curves for the most
        frequent test classes, then dump the model and every encoder with
        joblib into the working directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        y_proba = self.model.predict_proba(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")

        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        # Column j of y_proba belongs to all_classes[j]; pass the labels
        # explicitly because y_test may not contain every trained class.
        all_classes = self.model.classes_

        print("\n🎯 Top-K Accuracy:")
        print("Top-1 Accuracy:", top_k_accuracy_score(y_test, y_proba, labels=all_classes, k=1))
        print("Top-5 Accuracy:", top_k_accuracy_score(y_test, y_proba, labels=all_classes, k=5))

        print("\n📉 Log Loss:")
        print("Log Loss:", log_loss(y_test, y_proba, labels=all_classes))

        # One-hot encode y_test for ROC-AUC, using the model's column order.
        y_test_bin = label_binarize(y_test, classes=all_classes)

        print("\n📈 ROC-AUC (OvR):")
        try:
            macro_auc = roc_auc_score(y_test_bin, y_proba, average="macro", multi_class="ovr")
            print("Macro AUC:", macro_auc)
        except ValueError as e:
            # Raised e.g. when a class has no positive sample in the test split.
            print(f"⚠️ ROC AUC Error: {e}")

        # Plot ROC for the top-N most frequent classes of the test split.
        top_n = 10
        column_of = {label: col for col, label in enumerate(all_classes)}
        top_labels = [label for label, _ in Counter(y_test).most_common(top_n)]
        # Test-only labels have no probability column and are left out.
        top_indices = [column_of[label] for label in top_labels if label in column_of]

        college_encoder = self.encoders['College_Name']
        plt.figure(figsize=(10, 7))
        for idx in top_indices:
            fpr, tpr, _ = roc_curve(y_test_bin[:, idx], y_proba[:, idx])
            roc_auc = auc(fpr, tpr)
            college = college_encoder.inverse_transform([all_classes[idx]])[0]
            plt.plot(fpr, tpr, label=f"Class {college} (AUC={roc_auc:.2f})")

        plt.plot([0, 1], [0, 1], "k--")  # chance-level diagonal
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curves for Top {top_n} Most Frequent Colleges")
        plt.legend(loc="lower right")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

    def predict_colleges(self, category, score, branch, location):
        """Return up to 15 colleges ranked for one candidate profile.

        Args:
            category: Admission category label as used in training (e.g. 'OPEN').
            score: Candidate's MHT-CET score.
            branch: Branch name exactly as it appears in the data.
            location: Location name, or "All" to skip location filtering.

        Returns:
            A list of ``(college_name, chance)`` tuples sorted by descending
            chance. ``chance`` is the model probability renormalised over the
            matching colleges and rescaled as ``80 + 19 * p``; it is a display
            score, not a calibrated probability. An empty list is returned
            when nothing matches or any error occurs (the error is printed).
        """
        try:
            encoded_input = {
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                # "All" is passed to the model as -1, a code no encoder emits.
                'Location': self.encoders['Location'].transform([location])[0] if location != "All" else -1
            }

            input_df = pd.DataFrame([encoded_input])
            probabilities = self.model.predict_proba(input_df)[0]

            # Probability column j belongs to model.classes_[j]. Colleges that
            # never reached the training split have no column, so decoding
            # 0..n-1 would attach probabilities to the wrong colleges.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Decode the training frame to recover each college's branches
            # and locations as the original strings.
            decoded_data = self.data.copy()
            decoded_data['College_Name'] = self.encoders['College_Name'].inverse_transform(decoded_data['College_Name'])
            decoded_data['Branch_Name'] = self.encoders['Branch_Name'].inverse_transform(decoded_data['Branch_Name'])
            decoded_data['Location'] = self.encoders['Location'].inverse_transform(decoded_data['Location'])
            unique_colleges = decoded_data[['College_Name', 'Branch_Name', 'Location']].drop_duplicates()

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            mask = full_results['Branch_Name'] == branch
            if location != "All":
                mask &= full_results['Location'] == location
            # Copy so the assignments below modify an independent frame
            # instead of a view of full_results.
            filtered = full_results[mask].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            filtered['Probability'] = filtered['Probability'] / filtered['Probability'].sum()
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

# Entry point: load the CSV, train, then print ranked colleges for one
# sample candidate profile.
if __name__ == "__main__":
    predictor = CollegePredictionSystem()
    # load_data() returns False after printing the reason, so nothing below
    # runs when the CSV is missing or lacks required columns.
    if predictor.load_data('College_Category_Score_Summary.csv'):
        predictor.train_model()
        # predict_colleges() returns (college, chance) tuples, or [] on error.
        predictions = predictor.predict_colleges('OPEN', 85.5, 'Computer Engineering', 'Pune')
        for idx, (college, prob) in enumerate(predictions, 1):
            print(f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%")


In [None]:
pip install imblearn

In [None]:
%pip install imbalanced-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from imblearn.under_sampling import RandomUnderSampler  # <-- added import


class CollegePredictionSystem:
    """Rank colleges for a candidate; trains on a class-balanced sample.

    Workflow: ``load_data()`` reads the cutoff CSV, ``train_model()`` reshapes,
    encodes, undersamples the training split, fits, evaluates and saves the
    model, and ``predict_colleges()`` returns the most likely colleges for one
    candidate profile.
    """

    def __init__(self):
        # Raw CSV rows after load_data(); replaced by the long-form,
        # label-encoded frame once train_model() has run.
        self.data = None
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        # Column name -> fitted LabelEncoder, filled in by preprocess_data().
        self.encoders = {}

    def load_data(self, csv_file):
        """Read the cutoff CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True when the file was read and contains every required column,
            False otherwise (the reason is printed). ``self.data`` is only
            assigned on success.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column, so a missing 'College_Name'
            # is reported as a missing column rather than as a KeyError.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            # Clean College_Name column
            data['College_Name'] = data['College_Name'].str.strip()
            self.data = data

            print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
            return True
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.data`` in place.

        A fresh ``LabelEncoder`` is fitted per column and kept in
        ``self.encoders`` so predictions can encode/decode the same values.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        print("✅ Data preprocessing completed!")

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from wide to long form, one row per category score.

        Every ``<CATEGORY>_Score`` column present in the data becomes rows with
        the columns ``College_Name``, ``Branch_Name``, ``Location``,
        ``MHT_CET_Score`` and ``Category``; rows without a score are dropped.
        ``self.data`` is replaced by the reshaped frame.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        score_columns = {
            'OPEN': 'OPEN_Score',
            'OBC': 'OBC_Score',
            'SC': 'SC_Score',
            'ST': 'ST_Score',
            'SBC': 'SBC_Score',
            'DT/VJ': 'DT/VJ_Score'
        }

        dfs = []
        for category, col in score_columns.items():
            # load_data() only requires OPEN_Score; the other category
            # columns are optional and are skipped when absent.
            if col not in self.data.columns:
                continue
            temp_df = self.data[['College_Name', 'Branch_Name', 'Location', col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def train_model(self):
        """Fit the classifier on an undersampled training split and evaluate it.

        Steps: reshape and encode ``self.data``, hold out 20% for testing,
        randomly undersample the training split, fit the model, print accuracy
        and the classification report, show the confusion-matrix heatmap, then
        dump the model and every encoder with joblib into the working
        directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Undersample only the training split; the test split keeps its
        # original class distribution so the metrics stay representative.
        rus = RandomUnderSampler(random_state=42)
        X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

        print("Before undersampling:\n", y_train.value_counts())
        print("After undersampling:\n", y_train_res.value_counts())

        # Train model on resampled data
        self.model.fit(X_train_res, y_train_res)

        y_pred = self.model.predict(X_test)

        # Evaluation Metrics
        accuracy = accuracy_score(y_test, y_pred)
        print(f"✅ Model trained successfully with accuracy: {accuracy:.2f}")

        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        print("\n🧾 Confusion Matrix:")
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(12, 10))
        # Cell annotations are off: with many colleges they would be unreadable.
        sns.heatmap(cm, annot=False, cmap='Blues', fmt='d')
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # Save model & encoders
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')

    def predict_colleges(self, category, score, branch, location):
        """Return up to 15 colleges ranked for one candidate profile.

        Args:
            category: Admission category label as used in training (e.g. 'OPEN').
            score: Candidate's MHT-CET score.
            branch: Branch name exactly as it appears in the data.
            location: Location name, or "All" to skip location filtering.

        Returns:
            A list of ``(college_name, chance)`` tuples sorted by descending
            chance. ``chance`` is the model probability renormalised over the
            matching colleges and rescaled as ``80 + 19 * p``; it is a display
            score, not a calibrated probability. An empty list is returned
            when nothing matches or any error occurs (the error is printed).
        """
        try:
            encoded_input = {
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                # "All" is passed to the model as -1, a code no encoder emits.
                'Location': self.encoders['Location'].transform([location])[0] if location != "All" else -1
            }

            input_df = pd.DataFrame([encoded_input])
            probabilities = self.model.predict_proba(input_df)[0]

            # Probability column j belongs to model.classes_[j]. Colleges that
            # never reached the training split have no column, so decoding
            # 0..n-1 would attach probabilities to the wrong colleges.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Decode the training frame to recover each college's branches
            # and locations as the original strings.
            decoded_data = self.data.copy()
            decoded_data['College_Name'] = self.encoders['College_Name'].inverse_transform(decoded_data['College_Name'])
            decoded_data['Branch_Name'] = self.encoders['Branch_Name'].inverse_transform(decoded_data['Branch_Name'])
            decoded_data['Location'] = self.encoders['Location'].inverse_transform(decoded_data['Location'])
            unique_colleges = decoded_data[['College_Name', 'Branch_Name', 'Location']].drop_duplicates()

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            # The merged columns hold decoded strings, so they are compared
            # directly with the caller's branch/location strings.
            mask = full_results['Branch_Name'] == branch
            if location != "All":
                mask &= full_results['Location'] == location
            # Copy so the assignments below modify an independent frame
            # instead of a view of full_results.
            filtered = full_results[mask].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            filtered['Probability'] = filtered['Probability'] / filtered['Probability'].sum()
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []


# Example usage
# Entry point: load the CSV, train, then print ranked colleges for one
# sample candidate profile.
if __name__ == "__main__":
    predictor = CollegePredictionSystem()
    # load_data() returns False after printing the reason, so nothing below
    # runs when the CSV is missing or lacks required columns.
    if predictor.load_data('College_Category_Score_Summary.csv'):
        predictor.train_model()
        # predict_colleges() returns (college, chance) tuples, or [] on error.
        predictions = predictor.predict_colleges('OPEN', 85.5, 'Computer Engineering', 'Pune')
        for idx, (college, prob) in enumerate(predictions, 1):
            print(f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

class CollegePredictionSystem:
    """Rank colleges for a candidate; scores are min-max scaled before training.

    Workflow: ``load_data()`` reads the cutoff CSV, ``train_model()`` reshapes,
    encodes, scales, fits, evaluates and saves the model, and
    ``predict_colleges()`` returns the most likely colleges for one candidate
    profile.
    """

    def __init__(self):
        # Raw CSV rows after load_data(); replaced by the long-form, encoded
        # and scaled frame once train_model() has run.
        self.data = None
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        # Column name -> fitted LabelEncoder, filled in by preprocess_data().
        self.encoders = {}
        # Fitted on the 'MHT_CET_Score' column by preprocess_data().
        self.scaler = MinMaxScaler()

    def load_data(self, csv_file):
        """Read the cutoff CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True when the file was read and contains every required column,
            False otherwise (the reason is printed). ``self.data`` is only
            assigned on success.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')

            # Validate before touching any column, so a missing 'College_Name'
            # is reported as a missing column rather than as a KeyError.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False

            data['College_Name'] = data['College_Name'].str.strip()
            self.data = data

            print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
            return True
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from wide to long form, one row per category score.

        Every ``<CATEGORY>_Score`` column present in the data becomes rows with
        the columns ``College_Name``, ``Branch_Name``, ``Location``,
        ``MHT_CET_Score`` and ``Category``; rows without a score are dropped.
        ``self.data`` is replaced by the reshaped frame.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        score_columns = {
            'OPEN': 'OPEN_Score',
            'OBC': 'OBC_Score',
            'SC': 'SC_Score',
            'ST': 'ST_Score',
            'SBC': 'SBC_Score',
            'DT/VJ': 'DT/VJ_Score'
        }

        dfs = []
        for category, col in score_columns.items():
            if col not in self.data.columns:
                # Skip if category score column does not exist
                continue
            temp_df = self.data[['College_Name', 'Branch_Name', 'Location', col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)

        if not dfs:
            # Also reached when the data was already reshaped by an earlier call.
            raise ValueError("❌ No category score columns found in the loaded data.")

        self.data = pd.concat(dfs, ignore_index=True)

    def preprocess_data(self):
        """Label-encode the categorical columns and scale the score column.

        Encoders are stored in ``self.encoders`` and the score scaler in
        ``self.scaler`` so predictions can apply the same transformations.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))

        # Scale MHT_CET_Score to 0-1 range
        self.data['MHT_CET_Score'] = self.scaler.fit_transform(self.data[['MHT_CET_Score']])

        print("✅ Data preprocessing completed!")

    def train_model(self):
        """Fit the classifier, report per-class results and save all artifacts.

        Steps: reshape, encode and scale ``self.data``, hold out 20% for
        testing, fit the model, print accuracy, show a confusion matrix for the
        ten most frequent test classes, print the best and worst classes by
        per-class accuracy, then dump the model, encoders and scaler with
        joblib into the working directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'

        X = self.data[features]
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        # Evaluation (the full classification report is not printed here).
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")

        # Confusion matrix over every label seen in either truth or predictions.
        unique_labels = np.unique(np.concatenate((y_test, y_pred)))
        cm = confusion_matrix(y_test, y_pred, labels=unique_labels)

        # Top 10 most frequent classes in y_test
        test_labels, counts = np.unique(y_test, return_counts=True)
        top_10_classes = test_labels[np.argsort(counts)][-10:]

        # Mapping top classes to confusion matrix indices
        class_index_map = {label: i for i, label in enumerate(unique_labels)}
        top_indices = [class_index_map[label] for label in top_10_classes if label in class_index_map]
        filtered_cm = cm[np.ix_(top_indices, top_indices)]

        plt.figure(figsize=(10, 8))
        sns.heatmap(filtered_cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=top_10_classes, yticklabels=top_10_classes)
        plt.title("Top 10 College Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # Per-class accuracy report. Labels that only occur in y_pred have an
        # all-zero row; divide only where there is support and drop the rest,
        # so they do not surface as NaN rows in the report.
        row_totals = cm.sum(axis=1)
        per_class_accuracy = np.divide(
            cm.diagonal(), row_totals,
            out=np.full(len(unique_labels), np.nan), where=row_totals > 0,
        )
        college_names = self.encoders['College_Name'].inverse_transform(unique_labels)
        accuracy_df = pd.DataFrame({
            'College_Name': college_names,
            'Accuracy (%)': (per_class_accuracy * 100).round(2)
        }).dropna(subset=['Accuracy (%)']).sort_values(by='Accuracy (%)', ascending=False)

        print("\n📈 Top 10 Classes by Accuracy:")
        print(accuracy_df.head(10).to_string(index=False))

        print("\n📉 Bottom 10 Classes by Accuracy:")
        print(accuracy_df.tail(10).to_string(index=False))

        # Save model, encoders, and scaler
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")

        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')
        joblib.dump(self.scaler, 'score_scaler.pkl')
        print("✅ Encoders and scaler saved.")

    def predict_colleges(self, category, score, branch, location):
        """Return up to 15 colleges ranked for one candidate profile.

        Args:
            category: Admission category label as used in training (e.g. 'OPEN').
            score: Candidate's raw (unscaled) MHT-CET score.
            branch: Branch name exactly as it appears in the data.
            location: Location name, or "All" to skip location filtering.

        Returns:
            A list of ``(college_name, chance)`` tuples sorted by descending
            chance. ``chance`` is the model probability renormalised over the
            matching colleges and rescaled as ``80 + 19 * p``; it is a display
            score, not a calibrated probability. An empty list is returned
            when nothing matches or any error occurs (the error is printed).
        """
        try:
            # Scale the score with the fitted scaler. A one-column frame with
            # the training column name keeps the feature name consistent.
            score_frame = pd.DataFrame({'MHT_CET_Score': [score]})
            scaled_score = self.scaler.transform(score_frame)[0][0]

            encoded_input = {
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': scaled_score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                # "All" is passed to the model as -1, a code no encoder emits.
                'Location': self.encoders['Location'].transform([location])[0] if location != "All" else -1
            }

            input_df = pd.DataFrame([encoded_input])
            probabilities = self.model.predict_proba(input_df)[0]

            # Probability column j belongs to model.classes_[j]. Colleges that
            # never reached the training split have no column, so decoding
            # 0..n-1 would attach probabilities to the wrong colleges.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Decode other columns to join details
            decoded_data = self.data.copy()
            decoded_data['College_Name'] = self.encoders['College_Name'].inverse_transform(decoded_data['College_Name'])
            decoded_data['Branch_Name'] = self.encoders['Branch_Name'].inverse_transform(decoded_data['Branch_Name'])
            decoded_data['Location'] = self.encoders['Location'].inverse_transform(decoded_data['Location'])
            unique_colleges = decoded_data[['College_Name', 'Branch_Name', 'Location']].drop_duplicates()

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            # Filter by branch and location (only if location != 'All')
            mask = full_results['Branch_Name'] == branch
            if location != "All":
                mask &= full_results['Location'] == location
            # Copy so the assignments below modify an independent frame
            # instead of a view of full_results.
            filtered = full_results[mask].copy()

            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            filtered['Probability'] = filtered['Probability'] / filtered['Probability'].sum()
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)  # Scale to 80-99 range

            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result

        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

# Example usage
# Entry point: load the CSV, train, then print ranked colleges for one
# sample candidate profile.
if __name__ == "__main__":
    predictor = CollegePredictionSystem()
    # load_data() returns False after printing the reason, so nothing below
    # runs when the CSV is missing or lacks required columns.
    if predictor.load_data('College_Category_Score_Summary.csv'):
        predictor.train_model()
        # The raw score is passed in; predict_colleges() scales it itself.
        predictions = predictor.predict_colleges('OPEN', 85.5, 'Computer Engineering', 'Pune')
        for idx, (college, prob) in enumerate(predictions, 1):
            print(f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%")





In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
import joblib
import numpy as np

# Load trained model (optional if you're using it in the same script)
# NOTE(review): the training cells in this notebook save
# 'college_predictor_model_v2.pkl'; confirm this file is the intended model.
gbc_model = joblib.load('college_admission_model.pkl')

# Predict probabilities; column j corresponds to gbc_model.classes_[j]
y_score = gbc_model.predict_proba(X_test)

# For multiclass, binarize the output labels. Fit the binarizer on the model's
# classes, not on y_test: y_test may lack classes the model knows, which would
# leave the label columns out of step with the y_score columns.
lb = LabelBinarizer()
lb.fit(gbc_model.classes_)
y_test_binarized = lb.transform(y_test)
if y_test_binarized.shape[1] == 1:
    # Two-class case: LabelBinarizer emits a single column for the second
    # class; expand to two columns to match y_score.
    y_test_binarized = np.hstack([1 - y_test_binarized, y_test_binarized])

# Calculate ROC and AUC for each class that has positives in y_test
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = y_test_binarized.shape[1]

for i in range(n_classes):
    if not y_test_binarized[:, i].any():
        # No positive sample for this class: its ROC curve is undefined.
        continue
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class decisions pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_binarized.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot the ROC curve (micro-average)
plt.figure(figsize=(8, 6))
plt.plot(fpr["micro"], tpr["micro"],
         label='GradientBoostingClassifier (AUC = {:.2f})'.format(roc_auc["micro"]),
         color='blue', linewidth=2)

# Plot the random guessing line
plt.plot([0, 1], [0, 1], linestyle='--', color='black')

# Plot settings
plt.title('ROC Curve - Gradient Boosting Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

class CollegePredictionSystem:
    """Rank colleges for a candidate; scores are min-max scaled before training.

    Workflow: ``load_data()`` reads the cutoff CSV, ``train_model()`` reshapes,
    encodes, scales, fits, evaluates and saves the model, and
    ``predict_colleges()`` returns the most likely colleges for one candidate
    profile.
    """

    def __init__(self):
        # Raw CSV rows after load_data(); replaced by the long-form, encoded
        # and scaled frame once train_model() has run.
        self.data = None
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        # Column name -> fitted LabelEncoder, filled in by preprocess_data().
        self.encoders = {}
        # Fitted on the 'MHT_CET_Score' column by preprocess_data().
        self.scaler = MinMaxScaler()

    def load_data(self, csv_file):
        """Read the cutoff CSV into ``self.data``.

        Args:
            csv_file: Path of a windows-1252 encoded CSV file.

        Returns:
            True when the file was read and contains every required column,
            False otherwise (the reason is printed). ``self.data`` is only
            assigned on success.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')
            # Validate before touching any column, so a missing 'College_Name'
            # is reported as a missing column rather than as a KeyError.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False
            data['College_Name'] = data['College_Name'].str.strip()
            self.data = data
            print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
            return True
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            return False

    def transform_scores_by_category(self):
        """Reshape ``self.data`` from wide to long form, one row per category score.

        Every ``<CATEGORY>_Score`` column present in the data becomes rows with
        the columns ``College_Name``, ``Branch_Name``, ``Location``,
        ``MHT_CET_Score`` and ``Category``; rows without a score are dropped.
        ``self.data`` is replaced by the reshaped frame.

        Raises:
            ValueError: If no data is loaded or no score column is present.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")
        score_columns = {
            'OPEN': 'OPEN_Score',
            'OBC': 'OBC_Score',
            'SC': 'SC_Score',
            'ST': 'ST_Score',
            'SBC': 'SBC_Score',
            'DT/VJ': 'DT/VJ_Score'
        }
        dfs = []
        for category, col in score_columns.items():
            # Optional category columns are skipped when absent.
            if col not in self.data.columns:
                continue
            temp_df = self.data[['College_Name', 'Branch_Name', 'Location', col]].copy()
            temp_df = temp_df.rename(columns={col: 'MHT_CET_Score'})
            temp_df['Category'] = category
            temp_df = temp_df.dropna(subset=['MHT_CET_Score'])
            dfs.append(temp_df)
        if not dfs:
            # Also reached when the data was already reshaped by an earlier call.
            raise ValueError("❌ No category score columns found in the loaded data.")
        self.data = pd.concat(dfs, ignore_index=True)

    def preprocess_data(self):
        """Label-encode the categorical columns and scale the score column.

        Encoders are stored in ``self.encoders`` and the score scaler in
        ``self.scaler`` so predictions can apply the same transformations.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")
        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))
        # Scale the score to the 0-1 range.
        self.data['MHT_CET_Score'] = self.scaler.fit_transform(self.data[['MHT_CET_Score']])
        print("✅ Data preprocessing completed!")

    def train_model(self):
        """Fit the classifier, report per-class results and save all artifacts.

        Steps: reshape, encode and scale ``self.data``, hold out 20% for
        testing, fit the model, print accuracy and the classification report,
        show a confusion matrix for the ten most frequent test classes, print
        the best and worst classes by per-class accuracy, then dump the model,
        encoders and scaler with joblib into the working directory.
        """
        self.transform_scores_by_category()
        self.preprocess_data()
        features = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']
        target = 'College_Name'
        X = self.data[features]
        y = self.data[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")
        print("\n📊 Classification Report:")
        print(classification_report(y_test, y_pred))

        # Confusion matrix over every label seen in either truth or predictions.
        unique_labels = np.unique(np.concatenate((y_test, y_pred)))
        cm = confusion_matrix(y_test, y_pred, labels=unique_labels)

        # Restrict the heatmap to the ten most frequent classes of y_test.
        test_labels, counts = np.unique(y_test, return_counts=True)
        top_10_classes = test_labels[np.argsort(counts)][-10:]
        class_index_map = {label: i for i, label in enumerate(unique_labels)}
        top_indices = [class_index_map[label] for label in top_10_classes if label in class_index_map]
        filtered_cm = cm[np.ix_(top_indices, top_indices)]
        plt.figure(figsize=(10, 8))
        sns.heatmap(filtered_cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=top_10_classes, yticklabels=top_10_classes)
        plt.title("Top 10 College Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

        # Per-class accuracy report. Labels that only occur in y_pred have an
        # all-zero row; divide only where there is support and drop the rest,
        # so they do not surface as NaN rows in the report.
        row_totals = cm.sum(axis=1)
        per_class_accuracy = np.divide(
            cm.diagonal(), row_totals,
            out=np.full(len(unique_labels), np.nan), where=row_totals > 0,
        )
        college_names = self.encoders['College_Name'].inverse_transform(unique_labels)
        accuracy_df = pd.DataFrame({
            'College_Name': college_names,
            'Accuracy (%)': (per_class_accuracy * 100).round(2)
        }).dropna(subset=['Accuracy (%)']).sort_values(by='Accuracy (%)', ascending=False)
        print("\n📈 Top 10 Classes by Accuracy:")
        print(accuracy_df.head(10).to_string(index=False))
        print("\n📉 Bottom 10 Classes by Accuracy:")
        print(accuracy_df.tail(10).to_string(index=False))

        # Persist the model together with everything needed to encode inputs.
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")
        for col in self.encoders:
            joblib.dump(self.encoders[col], f'{col}_encoder.pkl')
        joblib.dump(self.scaler, 'score_scaler.pkl')
        print("✅ Encoders and scaler saved.")

    def predict_colleges(self, category, score, branch, location):
        """Return up to 15 colleges ranked for one candidate profile.

        Args:
            category: Admission category label as used in training (e.g. 'OPEN').
            score: Candidate's raw (unscaled) MHT-CET score.
            branch: Branch name exactly as it appears in the data.
            location: Location name, or "All" to skip location filtering.

        Returns:
            A list of ``(college_name, chance)`` tuples sorted by descending
            chance. ``chance`` is the model probability renormalised over the
            matching colleges and rescaled as ``80 + 19 * p``; it is a display
            score, not a calibrated probability. An empty list is returned
            when nothing matches or any error occurs (the error is printed).
        """
        try:
            # Scale the score with the fitted scaler. A one-column frame with
            # the training column name keeps the feature name consistent.
            score_frame = pd.DataFrame({'MHT_CET_Score': [score]})
            scaled_score = self.scaler.transform(score_frame)[0][0]
            encoded_input = {
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': scaled_score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                # "All" is passed to the model as -1, a code no encoder emits.
                'Location': self.encoders['Location'].transform([location])[0] if location != "All" else -1
            }
            input_df = pd.DataFrame([encoded_input])
            probabilities = self.model.predict_proba(input_df)[0]

            # Probability column j belongs to model.classes_[j]. Colleges that
            # never reached the training split have no column, so decoding
            # 0..n-1 would attach probabilities to the wrong colleges.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Decode the training frame to recover each college's branches
            # and locations as the original strings.
            decoded_data = self.data.copy()
            decoded_data['College_Name'] = self.encoders['College_Name'].inverse_transform(decoded_data['College_Name'])
            decoded_data['Branch_Name'] = self.encoders['Branch_Name'].inverse_transform(decoded_data['Branch_Name'])
            decoded_data['Location'] = self.encoders['Location'].inverse_transform(decoded_data['Location'])
            unique_colleges = decoded_data[['College_Name', 'Branch_Name', 'Location']].drop_duplicates()
            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')

            mask = full_results['Branch_Name'] == branch
            if location != "All":
                mask &= full_results['Location'] == location
            # Copy so the assignments below modify an independent frame
            # instead of a view of full_results.
            filtered = full_results[mask].copy()
            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            filtered['Probability'] = filtered['Probability'] / filtered['Probability'].sum()
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)
            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result
        except Exception as e:
            print(f"❌ Error in prediction: {e}")
            return []

# Entry point: load the CSV, train, then print ranked colleges for one
# sample candidate profile.
if __name__ == "__main__":
    predictor = CollegePredictionSystem()
    # load_data() returns False after printing the reason, so nothing below
    # runs when the CSV is missing or lacks required columns.
    if predictor.load_data('College_Category_Score_Summary.csv'):
        predictor.train_model()
        # The raw score is passed in; predict_colleges() scales it itself.
        predictions = predictor.predict_colleges('OPEN', 85.5, 'Computer Engineering', 'Pune')
        for idx, (college, prob) in enumerate(predictions, 1):
            print(f"{idx}. 🏫 College: {college}, Chance: {prob:.2f}%")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

class CollegePredictionSystem:
    """Suggest colleges from category-wise cut-off scores with gradient boosting.

    Intended call order: ``load_data()`` -> ``train_model()`` ->
    ``predict_colleges()``. ``train_model()`` reshapes and encodes
    ``self.data`` in place, so it is meant to be run once per loaded file.
    """

    # Admission category -> CSV column holding that category's score.
    _SCORE_COLUMNS = {
        'OPEN': 'OPEN_Score',
        'OBC': 'OBC_Score',
        'SC': 'SC_Score',
        'ST': 'ST_Score',
        'SBC': 'SBC_Score',
        'DT/VJ': 'DT/VJ_Score'
    }
    # Columns identifying one college/branch/location offering.
    _ID_COLUMNS = ['College_Name', 'Branch_Name', 'Location']
    # Model input columns, in the order the classifier is trained on.
    _FEATURES = ['Category', 'MHT_CET_Score', 'Branch_Name', 'Location']

    def __init__(self):
        """Create an untrained system with empty data, encoders and scaler."""
        # DataFrame set by load_data(); reshaped/encoded by train_model().
        self.data = None
        self.model = GradientBoostingClassifier(n_estimators=50, random_state=42)
        # Column name -> fitted LabelEncoder, filled by preprocess_data().
        self.encoders = {}
        # Scales MHT_CET_Score; fitted by preprocess_data().
        self.scaler = MinMaxScaler()

    def load_data(self, csv_file):
        """Read the CSV and validate that the required columns are present.

        Args:
            csv_file: Path (or anything ``pandas.read_csv`` accepts) of the
                summary CSV.

        Returns:
            True when the file was read and validated; False otherwise. On
            failure a message is printed and ``self.data`` is left unchanged.
        """
        required_columns = ['College_Name', 'Branch_Name', 'Location', 'OPEN_Score']
        try:
            data = pd.read_csv(csv_file, encoding="windows-1252", on_bad_lines='skip')
            # Validate before touching any column so a missing column is
            # reported by name instead of surfacing as a KeyError.
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"❌ Error: Missing required columns: {missing_columns}")
                return False
            data['College_Name'] = data['College_Name'].str.strip()
        except Exception as e:
            # Best-effort loader: callers rely on a boolean, not an exception.
            print(f"❌ Error loading CSV file: {e}")
            return False

        self.data = data
        print(f"✅ Successfully loaded {len(self.data)} records from {csv_file}")
        return True

    def transform_scores_by_category(self):
        """Reshape per-category score columns into one row per category.

        Replaces ``self.data`` with a frame holding the columns
        ``College_Name``, ``Branch_Name``, ``Location``, ``MHT_CET_Score``
        and ``Category``. Score columns absent from the data are skipped and
        rows with a missing score are dropped.

        Raises:
            ValueError: If no data is loaded or none of the known score
                columns exists in the data.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")

        frames = []
        for category, col in self._SCORE_COLUMNS.items():
            if col not in self.data.columns:
                continue
            frame = self.data[self._ID_COLUMNS + [col]].rename(columns={col: 'MHT_CET_Score'})
            frame['Category'] = category
            frames.append(frame.dropna(subset=['MHT_CET_Score']))

        if not frames:
            raise ValueError("❌ No category score columns found in the loaded data.")
        self.data = pd.concat(frames, ignore_index=True)

    def preprocess_data(self):
        """Label-encode the categorical columns and scale the score in place.

        Fits one ``LabelEncoder`` per categorical column (stored in
        ``self.encoders``) and fits ``self.scaler`` on ``MHT_CET_Score``.

        Raises:
            ValueError: If no data is loaded.
        """
        if self.data is None:
            raise ValueError("❌ No data loaded. Please load data using load_data() first.")
        categorical_columns = ['Category', 'Branch_Name', 'College_Name', 'Location']
        for col in categorical_columns:
            self.encoders[col] = LabelEncoder()
            self.data[col] = self.encoders[col].fit_transform(self.data[col].astype(str))
        self.data['MHT_CET_Score'] = self.scaler.fit_transform(self.data[['MHT_CET_Score']])
        print("✅ Data preprocessing completed!")

    def train_model(self):
        """Prepare the data, fit the classifier, report metrics and save it.

        Uses an 80/20 train/test split with a fixed seed, prints the overall
        accuracy and a per-college accuracy summary, shows a confusion-matrix
        heatmap and ROC curves, then writes the model, encoders and scaler to
        disk with joblib.
        """
        self.transform_scores_by_category()
        self.preprocess_data()

        X = self.data[self._FEATURES]
        y = self.data['College_Name']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n✅ Model trained with accuracy: {accuracy:.2f}")
        # Header for the per-class accuracy summary printed further down.
        print("\n📊 Classification Report:")

        # np.unique returns the labels sorted, which the helpers rely on.
        labels = np.unique(np.concatenate((y_test, y_pred)))
        cm = confusion_matrix(y_test, y_pred, labels=labels)

        self._plot_top_confusion_matrix(cm, labels, y_test)
        self._print_per_class_accuracy(cm, labels)
        self._plot_roc_curves(X_test, y_test)
        self._save_artifacts()

    def _plot_top_confusion_matrix(self, cm, labels, y_test):
        """Show the confusion matrix restricted to the 5 most frequent test classes."""
        test_labels, counts = np.unique(y_test, return_counts=True)
        top_5_classes = test_labels[np.argsort(counts)][-5:]
        # `labels` is sorted and contains every test label, so searchsorted
        # yields each class's row/column position in `cm`.
        top_indices = np.searchsorted(labels, top_5_classes)
        filtered_cm = cm[np.ix_(top_indices, top_indices)]

        plt.figure(figsize=(10, 8))
        sns.heatmap(filtered_cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=top_5_classes, yticklabels=top_5_classes)
        plt.title("Top 5 College Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

    def _print_per_class_accuracy(self, cm, labels):
        """Print the 5 best and 5 worst colleges by per-class accuracy."""
        support = cm.sum(axis=1)
        # Labels that occur only in the predictions have no true samples;
        # their accuracy is undefined (0/0) and would surface as NaN rows in
        # the "bottom 5" listing, so they are left out.
        has_support = support > 0
        per_class_accuracy = cm.diagonal()[has_support] / support[has_support]
        college_names = self.encoders['College_Name'].inverse_transform(labels[has_support])

        accuracy_df = pd.DataFrame({
            'College_Name': college_names,
            'Accuracy (%)': (per_class_accuracy * 100).round(2)
        }).sort_values(by='Accuracy (%)', ascending=False)
        print("\n📈 Top 5 Classes by Accuracy:")
        print(accuracy_df.head(5).to_string(index=False))
        print("\n📉 Bottom 5 Classes by Accuracy:")
        print(accuracy_df.tail(5).to_string(index=False))

    def _plot_roc_curves(self, X_test, y_test, max_curves=5):
        """Plot one-vs-rest ROC curves for up to `max_curves` classes."""
        classes = self.model.classes_
        y_score = self.model.predict_proba(X_test)
        # One indicator column per entry of model.classes_, so column i of
        # y_true lines up with column i of predict_proba. Binarizing on the
        # test labels instead would misalign the columns whenever the test
        # set does not contain exactly the trained classes.
        y_true = (np.asarray(y_test)[:, None] == classes[None, :]).astype(int)

        # A ROC curve needs at least one positive and one negative sample.
        positives = y_true.sum(axis=0)
        plottable = np.flatnonzero((positives > 0) & (positives < len(y_true)))

        plt.figure(figsize=(10, 6))
        for i in plottable[:max_curves]:
            fpr, tpr, _ = roc_curve(y_true[:, i], y_score[:, i])
            plt.plot(fpr, tpr, label=f'Class {classes[i]} (AUC = {auc(fpr, tpr):.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.title('ROC Curve (Top 5 Classes)')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.grid(True)
        plt.show()

    def _save_artifacts(self):
        """Persist the model, the encoders and the scaler with joblib."""
        joblib.dump(self.model, 'college_predictor_model_v2.pkl')
        print("✅ Model saved as 'college_predictor_model_v2.pkl'")
        for col, encoder in self.encoders.items():
            joblib.dump(encoder, f'{col}_encoder.pkl')
        joblib.dump(self.scaler, 'score_scaler.pkl')
        print("✅ Encoders and scaler saved.")

    def predict_colleges(self, category, score, branch, location):
        """Rank colleges offering `branch` (optionally in `location`).

        Args:
            category: Admission category label, e.g. ``'OPEN'``.
            score: Raw (unscaled) score of the candidate.
            branch: Branch name exactly as it appears in the data.
            location: Location name, or ``"All"`` to skip location filtering.

        Returns:
            Up to 15 ``(college_name, value)`` tuples, best first. ``value``
            is the model probability renormalised over the matching colleges
            and mapped into the 80-99 range for display; it is not a
            calibrated admission probability. An empty list is returned when
            nothing matches or when any error occurs (the error is printed).
        """
        try:
            # Use a named column so the input matches what the scaler was fitted on.
            scaled_score = self.scaler.transform(
                pd.DataFrame({'MHT_CET_Score': [score]})
            )[0][0]
            encoded_input = {
                'Category': self.encoders['Category'].transform([category])[0],
                'MHT_CET_Score': scaled_score,
                'Branch_Name': self.encoders['Branch_Name'].transform([branch])[0],
                # -1 is a placeholder code for "no specific location".
                'Location': self.encoders['Location'].transform([location])[0] if location != "All" else -1
            }
            input_df = pd.DataFrame([encoded_input])
            probabilities = self.model.predict_proba(input_df)[0]

            # predict_proba columns follow model.classes_, which may be a
            # subset of the encoder's labels (colleges that landed only in
            # the held-out split), so decode those labels rather than 0..n-1.
            all_colleges = self.encoders['College_Name'].inverse_transform(self.model.classes_)
            full_results = pd.DataFrame({
                'College_Name': all_colleges,
                'Probability': probabilities
            })

            # Decode only the distinct (college, branch, location) triples.
            unique_colleges = self.data[self._ID_COLUMNS].drop_duplicates().copy()
            for col in self._ID_COLUMNS:
                unique_colleges[col] = self.encoders[col].inverse_transform(unique_colleges[col])

            full_results = full_results.merge(unique_colleges, on='College_Name', how='left')
            mask = full_results['Branch_Name'] == branch
            if location != "All":
                mask &= full_results['Location'] == location
            # Explicit copy: the frame is modified below.
            filtered = full_results[mask].copy()
            if filtered.empty:
                print("⚠️ No matching colleges found for the given branch and location.")
                return []

            total = filtered['Probability'].sum()
            if total > 0:  # avoid 0/0 -> NaN when every match has zero mass
                filtered['Probability'] = filtered['Probability'] / total
            filtered = filtered.sort_values(by='Probability', ascending=False).head(15)
            filtered['Probability'] = 80 + (filtered['Probability'] * 19)
            result = list(zip(filtered['College_Name'], filtered['Probability'].round(2)))
            print("✅ Predictions generated successfully!")
            return result
        except Exception as e:
            # Best-effort API: unknown labels or an unfitted model yield [].
            print(f"❌ Error in prediction: {e}")
            return []

# Script entry point: load the updated summary CSV, train the model, and
# print the ranked suggestions for one sample candidate. Nothing runs if
# loading fails.
if __name__ == "__main__":
    predictor = CollegePredictionSystem()
    data_loaded = predictor.load_data('College_Category_Score_Summary_new.csv')
    if data_loaded:
        predictor.train_model()
        predictions = predictor.predict_colleges(
            category='OPEN',
            score=85.5,
            branch='Computer Engineering',
            location='Pune',
        )
        # Ranks are 1-based for display.
        for rank, (name, chance) in enumerate(predictions, start=1):
            print(f"{rank}. 🏫 College: {name}, Chance: {chance:.2f}%")
