<a href="https://colab.research.google.com/github/PrachiTawar/agri-disease-detector/blob/main/student_career_prec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Student Career Success Prediction System
# Complete pipeline with synthetic data, Random Forest model, and Gradio interface

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import gradio as gr
import warnings
warnings.filterwarnings('ignore')

In [3]:
# ========================
# 1. SYNTHETIC DATA GENERATION
# ========================

def generate_synthetic_data(n_samples=1000):
    """Generate synthetic student career data"""
    np.random.seed(42)

    data = {
        'age': np.random.randint(18, 35, n_samples),
        'highest_education': np.random.choice([
            'High School', 'Diploma', 'Bachelor', 'Master', 'PhD'
        ], n_samples, p=[0.15, 0.20, 0.40, 0.20, 0.05]),
        'cgpa': np.round(np.random.uniform(2.0, 4.0, n_samples), 2),
        'gender': np.random.choice(['Male', 'Female', 'Other'], n_samples, p=[0.48, 0.48, 0.04]),
        'skills': np.random.randint(1, 11, n_samples),  # 1-10 scale
        'risk_taking_ability': np.random.randint(1, 11, n_samples),  # 1-10 scale
        'networking_skills': np.random.randint(1, 11, n_samples),  # 1-10 scale
        'career_goal': np.random.choice([
            'Software Engineer', 'Data Scientist', 'Business Analyst',
            'Manager', 'Entrepreneur', 'Researcher', 'Consultant'
        ], n_samples)
    }

    df = pd.DataFrame(data)

    # Create target variable based on logical rules
    def determine_success(row):
        score = 0

        # Education weight
        edu_weight = {'High School': 1, 'Diploma': 2, 'Bachelor': 3, 'Master': 4, 'PhD': 5}
        score += edu_weight[row['highest_education']] * 10

        # CGPA weight
        score += row['cgpa'] * 10

        # Skills weight
        score += row['skills'] * 3

        # Risk taking weight
        score += row['risk_taking_ability'] * 2

        # Networking weight
        score += row['networking_skills'] * 2.5

        # Age factor (optimal age 22-28)
        if 22 <= row['age'] <= 28:
            score += 10
        elif row['age'] < 22:
            score += 5

        # Career goal specific adjustments
        if row['career_goal'] in ['Software Engineer', 'Data Scientist']:
            if row['skills'] >= 7:
                score += 5
        elif row['career_goal'] == 'Entrepreneur':
            if row['risk_taking_ability'] >= 7:
                score += 5
        elif row['career_goal'] in ['Manager', 'Consultant']:
            if row['networking_skills'] >= 7:
                score += 5

        # Add some randomness
        score += np.random.uniform(-10, 10)

        # Determine success (threshold around 110)
        return 1 if score >= 110 else 0

    df['career_success'] = df.apply(determine_success, axis=1)

    return df


In [4]:
# ========================
# 2. DATA PREPROCESSING
# ========================

def preprocess_data(df):
    """Preprocess the data for model training"""
    df_processed = df.copy()

    # Encode categorical variables
    le_education = LabelEncoder()
    le_gender = LabelEncoder()
    le_career = LabelEncoder()

    df_processed['highest_education_encoded'] = le_education.fit_transform(df_processed['highest_education'])
    df_processed['gender_encoded'] = le_gender.fit_transform(df_processed['gender'])
    df_processed['career_goal_encoded'] = le_career.fit_transform(df_processed['career_goal'])

    # Select features for training
    feature_columns = [
        'age', 'highest_education_encoded', 'cgpa', 'gender_encoded',
        'skills', 'risk_taking_ability', 'networking_skills', 'career_goal_encoded'
    ]

    X = df_processed[feature_columns]
    y = df_processed['career_success']

    return X, y, le_education, le_gender, le_career


In [5]:
# ========================
# 3. MODEL TRAINING
# ========================

def train_model(X, y):
    """Train Random Forest model"""
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train Random Forest
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("=" * 50)
    print("MODEL TRAINING COMPLETE")
    print("=" * 50)
    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Will Not Reach', 'Will Reach']))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nFeature Importances:")
    feature_names = [
        'Age', 'Education', 'CGPA', 'Gender',
        'Skills', 'Risk Taking', 'Networking', 'Career Goal'
    ]
    for name, importance in zip(feature_names, model.feature_importances_):
        print(f"{name}: {importance:.4f}")

    return model, X_train, X_test, y_train, y_test


In [6]:

# ========================
# 4. PREDICTION FUNCTION
# ========================

def predict_career_success(age, education, cgpa, gender, skills,
                          risk_taking, networking, career_goal,
                          model, le_education, le_gender, le_career):
    """Predict career success for a student"""

    # Encode inputs
    education_encoded = le_education.transform([education])[0]
    gender_encoded = le_gender.transform([gender])[0]
    career_encoded = le_career.transform([career_goal])[0]

    # Create feature array
    features = np.array([[
        age, education_encoded, cgpa, gender_encoded,
        skills, risk_taking, networking, career_encoded
    ]])

    # Predict
    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0]

    # Format output
    result = "✅ YES - Likely to reach career goal" if prediction == 1 else "❌ NO - May face challenges"
    confidence = f"{max(probability) * 100:.2f}%"

    # Additional insights
    insights = []
    if cgpa < 2.5:
        insights.append("⚠️ Low CGPA may be a limiting factor")
    if skills < 5:
        insights.append("⚠️ Consider improving technical skills")
    if networking < 5:
        insights.append("⚠️ Networking skills need development")
    if risk_taking < 4 and career_goal == "Entrepreneur":
        insights.append("⚠️ Entrepreneurship requires higher risk tolerance")
    if not insights:
        insights.append("✨ Strong profile across all dimensions!")

    insight_text = "\n".join(insights)

    return result, confidence, insight_text

In [7]:
# ========================
# 5. MAIN EXECUTION
# ========================

print("Generating synthetic data...")
df = generate_synthetic_data(1000)
print(f"Generated {len(df)} samples")
print("\nData Distribution:")
print(df['career_success'].value_counts())
print("\nSample Data:")
print(df.head())

print("\n\nPreprocessing data...")
X, y, le_education, le_gender, le_career = preprocess_data(df)

print("\n\nTraining Random Forest model...")
model, X_train, X_test, y_train, y_test = train_model(X, y)


Generating synthetic data...
Generated 1000 samples

Data Distribution:
career_success
0    575
1    425
Name: count, dtype: int64

Sample Data:
   age highest_education  cgpa  gender  skills  risk_taking_ability  \
0   24               PhD  3.17  Female       6                    7   
1   32       High School  3.17  Female       2                    3   
2   28          Bachelor  3.40    Male      10                    6   
3   25           Diploma  3.86  Female       8                    9   
4   24          Bachelor  3.57    Male       7                    7   

   networking_skills   career_goal  career_success  
0                 10    Researcher               1  
1                  3       Manager               0  
2                  7  Entrepreneur               1  
3                  4    Researcher               1  
4                  3       Manager               0  


Preprocessing data...


Training Random Forest model...
MODEL TRAINING COMPLETE

Accuracy: 0.8800

Classific

In [9]:
# ========================
# 6. GRADIO INTERFACE
# ========================

def gradio_predict(age, education, cgpa, gender, skills, risk_taking, networking, career_goal):
    """Wrapper function for Gradio"""
    return predict_career_success(
        age, education, cgpa, gender, skills, risk_taking,
        networking, career_goal, model, le_education, le_gender, le_career
    )

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎓 Student Career Success Predictor
    ### Predict whether a student will reach their career goals using AI
    """)

    with gr.Row():
        with gr.Column():
            age_input = gr.Slider(18, 35, value=22, label="Age", step=1)
            education_input = gr.Dropdown(
                choices=['High School', 'Diploma', 'Bachelor', 'Master', 'PhD'],
                value='Bachelor',
                label="Highest Education"
            )
            cgpa_input = gr.Slider(2.0, 4.0, value=3.0, label="CGPA", step=0.1)
            gender_input = gr.Radio(
                choices=['Male', 'Female', 'Other'],
                value='Male',
                label="Gender"
            )

        with gr.Column():
            skills_input = gr.Slider(1, 10, value=7, label="Technical Skills (1-10)", step=1)
            risk_input = gr.Slider(1, 10, value=5, label="Risk Taking Ability (1-10)", step=1)
            network_input = gr.Slider(1, 10, value=6, label="Networking Skills (1-10)", step=1)
            career_input = gr.Dropdown(
                choices=['Software Engineer', 'Data Scientist', 'Business Analyst',
                        'Manager', 'Entrepreneur', 'Researcher', 'Consultant'],
                value='Software Engineer',
                label="Career Goal"
            )

    predict_btn = gr.Button("🔮 Predict Career Success", variant="primary")

    with gr.Row():
        with gr.Column():
            result_output = gr.Textbox(label="Prediction Result", lines=2)
        with gr.Column():
            confidence_output = gr.Textbox(label="Confidence")

    insights_output = gr.Textbox(label="Insights & Recommendations", lines=4)

    predict_btn.click(
        fn=gradio_predict,
        inputs=[age_input, education_input, cgpa_input, gender_input,
                skills_input, risk_input, network_input, career_input],
        outputs=[result_output, confidence_output, insights_output]
    )

    gr.Markdown("""
    ---
    **Note:** This model is trained on synthetic data and is for educational purposes only.
    Predictions should be used as guidance, not absolute truth.
    """)

# Launch the interface
print("\n\n" + "=" * 50)
print("LAUNCHING GRADIO INTERFACE")
print("=" * 50)
demo.launch(debug=True)



LAUNCHING GRADIO INTERFACE
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().


KeyboardInterrupt: 

In [13]:
joblib.dump({...}, 'career_prediction_model.pkl')


NameError: name 'joblib' is not defined