<a href="https://colab.research.google.com/github/Sathvik2954/Infosys_Edu2Job_Predicting_Job_Roles_from_Educational_Background/blob/main/Milestone_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
# Train a set of baseline classifiers on the processed student data and
# compare their hold-out accuracy. Everything stays at module level so the
# intermediate objects (df, X_train, preprocessor, results, ...) remain
# available to later notebook cells.
file_name = 'processed_student_job_data.csv'
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"Error: Make sure '{file_name}' is uploaded to your Google Colab session.")
else:
    print("Data loaded successfully.")

    # Separate the label column from the predictors.
    target_column = 'Job/Higher Studies?'
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # 80/20 split; stratify=y keeps the class proportions equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # NOTE: the trailing space in the first name is part of the CSV header.
    categorical_features = [
        'interested career area ',
        'Type of company want to settle in?'
    ]
    boolean_features = [
        'Interested subjects_Computer Architecture', 'Interested subjects_IOT',
        'Interested subjects_Management', 'Interested subjects_Software Engineering',
        'Interested subjects_cloud computing', 'Interested subjects_data engineering',
        'Interested subjects_hacking', 'Interested subjects_networks',
        'Interested subjects_parallel computing', 'Interested subjects_programming'
    ]
    # Every remaining column is handed to the scaler.
    # NOTE(review): this assumes all other columns are numeric -- confirm
    # against the CSV schema if new columns are added.
    numeric_features = [
        col for col in X.columns
        if col not in categorical_features
        and col not in boolean_features
    ]

    print(f"Identified {len(numeric_features)} numeric features.")
    print(f"Identified {len(categorical_features) + len(boolean_features)} categorical/boolean features.")

    # Scale numeric columns; one-hot encode categorical and boolean columns.
    # handle_unknown='ignore' keeps categories unseen during fit from raising
    # at predict time.
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features + boolean_features)
        ])

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "KNN": KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "SVM": SVC(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        # `use_label_encoder` is intentionally not passed: current XGBoost
        # ignores it and emits a "Parameters: { "use_label_encoder" } are not
        # used." warning on every fit.
        "XGBoosting": XGBClassifier(eval_metric='logloss', random_state=42)
    }
    results = {}  # model name -> test-set accuracy

    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")
        # The preprocessor is refit inside each pipeline's fit() call, so it
        # only ever sees the training split.
        clf = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name] = accuracy
        print(f"Accuracy: {accuracy:.4f} ({(accuracy * 100):.2f}%)")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        print("-" * 30)

    # Leaderboard, best accuracy first.
    print("\n--- Model Accuracy Summary ---")
    sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
    for model_name, accuracy in sorted_results:
        print(f"{model_name}: {accuracy:.4f} ({(accuracy * 100):.2f}%)")

Data loaded successfully.
Identified 33 numeric features.
Identified 12 categorical/boolean features.

--- Training Logistic Regression ---
Accuracy: 0.5078 (50.78%)
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.55      0.53      2014
           1       0.50      0.46      0.48      1986

    accuracy                           0.51      4000
   macro avg       0.51      0.51      0.51      4000
weighted avg       0.51      0.51      0.51      4000

------------------------------

--- Training KNN ---
Accuracy: 0.5028 (50.28%)
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.50      0.50      2014
           1       0.50      0.50      0.50      1986

    accuracy                           0.50      4000
   macro avg       0.50      0.50      0.50      4000
weighted avg       0.50      0.50      0.50      4000

------------------------------

--- Training Decision T

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.4960 (49.60%)
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.50      0.50      2014
           1       0.49      0.49      0.49      1986

    accuracy                           0.50      4000
   macro avg       0.50      0.50      0.50      4000
weighted avg       0.50      0.50      0.50      4000

------------------------------

--- Model Accuracy Summary ---
SVM: 0.5132 (51.32%)
Logistic Regression: 0.5078 (50.78%)
AdaBoost: 0.5075 (50.75%)
KNN: 0.5028 (50.28%)
Gradient Boosting: 0.5015 (50.15%)
Random Forest: 0.4995 (49.95%)
Decision Tree: 0.4973 (49.73%)
XGBoosting: 0.4960 (49.60%)
