<a href="https://colab.research.google.com/github/StrikingJaysingpure/Path-Finders-SIH/blob/AI-Model/SIH_without_skills.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
%pip install catboost



In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Importing the Dataset

In [None]:
# Load the dataset, then separate the two prediction targets from the features.
df = pd.read_csv("withoutskills_updated.csv")

target_columns = ["Company Name (PMIS Partner)", "Position/Role Offered"]
y1, y2 = (df[name] for name in target_columns)  # y1: company, y2: role
X = df.drop(columns=target_columns)


# Encoding Categorical Data


In [None]:
# Integer-encode each target with its own encoder so predicted codes can be
# mapped back to the original labels with inverse_transform.
label_enc1 = LabelEncoder()  # company labels (y1)
label_enc2 = LabelEncoder()  # role labels (y2)
y_encoded1 = label_enc1.fit(y1).transform(y1)
y_encoded2 = label_enc2.fit(y2).transform(y2)

# Series versions carry X's index so targets can be looked up by row label.
y_series1 = pd.Series(y_encoded1, index=X.index, name="company")
y_series2 = pd.Series(y_encoded2, index=X.index, name="role")



# Taking Care of Missing Data

In [None]:
# Replace missing values in object-dtype (text) columns with the literal
# "Unknown"; columns of any other dtype are left untouched.
object_columns = X.columns[X.dtypes == "object"]
for name in object_columns:
    X[name] = X[name].fillna("Unknown")

# Categorical Features

In [None]:
# Names of the object-dtype columns; handed to CatBoost as categorical features.
cat_features = X.columns[X.dtypes == "object"].tolist()


# Dividing the Dataset into Training Set and Test Set

In [None]:
# Company target: 80/20 split, stratified on the encoded company labels.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y_encoded1, test_size=0.2, random_state=42, stratify=y_encoded1
)
# Role target: 80/20 split, stratified on the encoded role labels.
# (Previously this call reused y_encoded1, so y_train2 / y_test2 ended up
# holding company labels instead of role labels.)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y_encoded2, test_size=0.2, random_state=42, stratify=y_encoded2
)

In [None]:
# 80/20 split stratified on the encoded role labels, with a fixed seed.
# NOTE(review): the following cell reassigns X_train, X_test, y_train2 and
# y_test2, so these values only survive if that cell is skipped.
role_split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded2)
X_train, X_test, y_train2, y_test2 = train_test_split(
    X, y_encoded2, **role_split_options
)

In [None]:
# One split shared by both targets: stratify on the company labels, then pick
# the role labels for exactly the same rows via the index.
X_train, X_test, y_train1, y_test1 = train_test_split(
    X,
    y_series1,
    stratify=y_series1,
    test_size=0.2,
    random_state=42,
)

# Row labels of each partition, reused to align the role target.
train_rows = y_train1.index
test_rows = y_test1.index
y_train2 = y_series2.loc[train_rows]
y_test2 = y_series2.loc[test_rows]

# CatBoost Model

In [None]:

# Both classifiers use the same hyper-parameters, so they are declared once.
catboost_params = dict(
    iterations=80,             # maximum number of boosting rounds
    depth=5,                   # tree depth
    learning_rate=0.2,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    verbose=0,                 # no per-iteration logging
    random_seed=42,
    early_stopping_rounds=10,
)

model1 = CatBoostClassifier(**catboost_params)  # fitted on the company target
model2 = CatBoostClassifier(**catboost_params)  # fitted on the role target

# Training Model

In [None]:
# Early stopping needs an eval_set, but using the test set for it leaks test
# information into training (the stopping point is chosen on the very data the
# accuracy is later reported on). Carve a validation split out of the training
# data instead so X_test stays untouched until evaluation.
company_counts = pd.Series(y_train1).value_counts()
X_fit1, X_val1, y_fit1, y_val1 = train_test_split(
    X_train,
    y_train1,
    test_size=0.2,
    random_state=42,
    # Stratification requires at least two samples of every class.
    stratify=y_train1 if company_counts.min() >= 2 else None,
)
model1.fit(X_fit1, y_fit1, cat_features=cat_features, eval_set=(X_val1, y_val1))

<catboost.core.CatBoostClassifier at 0x7a3348ffd610>

In [None]:
# Early stopping needs an eval_set, but using the test set for it leaks test
# information into training (the stopping point is chosen on the very data the
# accuracy is later reported on). Carve a validation split out of the training
# data instead so X_test stays untouched until evaluation.
role_counts = pd.Series(y_train2).value_counts()
X_fit2, X_val2, y_fit2, y_val2 = train_test_split(
    X_train,
    y_train2,
    test_size=0.2,
    random_state=42,
    # Stratification requires at least two samples of every class.
    stratify=y_train2 if role_counts.min() >= 2 else None,
)
model2.fit(X_fit2, y_fit2, cat_features=cat_features, eval_set=(X_val2, y_val2))


<catboost.core.CatBoostClassifier at 0x7a3348ffd2e0>

# Predicting the Top-3 Internships

In [None]:
def _top_k_encoded(model, features, k=3):
    """Return (probabilities, top-k encoded labels) for every row of `features`.

    The top-k labels are ordered from most to least probable. Column positions
    from predict_proba are translated through `model.classes_`, so the result
    holds the encoded class labels the model was trained on even if the model
    did not see every encoded class during fitting.
    """
    probabilities = model.predict_proba(features)
    # argsort is ascending: keep the last k columns, then reverse them.
    top_columns = np.argsort(probabilities, axis=1)[:, -k:][:, ::-1]
    class_labels = np.asarray(model.classes_).astype(int)
    return probabilities, class_labels[top_columns]


def _decode(encoder, encoded):
    """Map a 2-D array of encoded labels back to names as a list of lists."""
    names = encoder.inverse_transform(encoded.ravel())
    return names.reshape(encoded.shape).tolist()


# model1 predicts companies, model2 predicts roles.
probs1, top3_preds1 = _top_k_encoded(model1, X_test)
top3_roles1 = _decode(label_enc1, top3_preds1)
probs2, top3_preds2 = _top_k_encoded(model2, X_test)
top3_roles2 = _decode(label_enc2, top3_preds2)


# Show 5 sample recommendations

In [None]:
# Print a few test candidates with their suggestions, once each.
# top3_roles1 holds company names (decoded with label_enc1) and top3_roles2
# holds role names (decoded with label_enc2), so each list gets its own caption.
n_samples = min(5, len(X_test))  # do not index past a short test set
for i in range(n_samples):
    print(f"\nCandidate {i+1} Profile:")
    print(X_test.iloc[i].to_dict())
    print("Top-3 Internship Company Suggestions:", top3_roles1[i])
    print("Top-3 Internship Role Suggestions:", top3_roles2[i])


Candidate 1 Profile:
{'Age': 22, 'Level of Education': 'Diploma', 'Stream (if 12th pass)': 'Science', 'Degree (if graduate/diploma)': 'Diploma', 'Stream of Degree': 'Computer Science', 'Location of Applicant': 'Chandigarh, Chandigarh'}
['Mercedes-Benz India', 'Mahindra & Mahindra', 'Max Life Insurance']
['Mechanical Engineering Intern', 'Embedded Systems Intern', 'Automotive Engineering Intern']

Candidate 2 Profile:
{'Age': 24, 'Level of Education': 'Graduate', 'Stream (if 12th pass)': 'Science', 'Degree (if graduate/diploma)': 'Diploma', 'Stream of Degree': 'Finance', 'Location of Applicant': 'Thiruvananthapuram, Kerala'}
['HDFC Bank', 'Alembic Pharmaceuticals', 'Reliance Industries Limited']
['Finance Operations Intern', 'Regulatory Affairs Intern', 'Marketing Intern']

Candidate 3 Profile:
{'Age': 23, 'Level of Education': '12th Pass', 'Stream (if 12th pass)': 'Arts', 'Degree (if graduate/diploma)': 'Unknown', 'Stream of Degree': 'Unknown', 'Location of Applicant': 'Jaipur, Rajast

# Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Class predictions from both models; only model1 is scored in this cell.
y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)

# ---- Top-1 accuracy (model1) ----
top1_acc1 = accuracy_score(y_test1, y_pred1)
print("Top-1 Accuracy:", round(top1_acc1, 3))

# ---- Top-3 accuracy (model1) ----
# A row counts as a hit when its true label is among the three top predictions.
hits = [
    true_label in top3_preds1[row]
    for row, true_label in enumerate(y_test1)
]
correct = sum(hits)
top3_acc1 = correct / len(y_test1)
print("Top-3 Accuracy:", round(top3_acc1, 3))

Top-1 Accuracy: 0.75
Top-3 Accuracy: 0.94


In [None]:
# Confusion matrix and accuracy for the company model (y1).
from sklearn.metrics import confusion_matrix, accuracy_score

# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# produced the transposed matrix.
cm = confusion_matrix(y_test1, y_pred1)
print(cm)
accuracy_score(y_test1, y_pred1)

[[31  4  3  0  0  2  1]
 [ 0 28  0  0  0  0  0]
 [ 2  5 29  3  5  2  0]
 [ 7  2  0 38  2  3  0]
 [ 0  0  3  0 29  1  2]
 [ 0  0  6  0  5 34  4]
 [ 2  6  0  2  2  1 36]]


0.75

In [None]:
# Confusion matrix and accuracy for the role model (y2).
from sklearn.metrics import confusion_matrix, accuracy_score

# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# produced the transposed matrix.
cm2 = confusion_matrix(y_test2, y_pred2)
print(cm2)
accuracy_score(y_test2, y_pred2)

[[11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0
   0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0 18  3  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0
   0  3  0]
 [ 0  0  1 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0
   0  0  0]
 [ 0  3  0  0  7  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0  0  0  2  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0  0  0  0  0  0  9  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  5  1  0  0  0  0  0  0  0
   0  0  0]
 [ 0  1  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 10  0  0  0  0  0  

0.6966666666666667

In [None]:
# Top-1 / top-3 accuracy for the role model (model2).
# This cell was previously commented out, and its top-3 loop checked the
# company labels (y_test1 / top3_preds1) instead of the role ones.
from sklearn.metrics import accuracy_score

# ---- Top-1 Accuracy ----
y_pred2 = model2.predict(X_test)
top1_acc2 = accuracy_score(y_test2, y_pred2)
print("Role Top-1 Accuracy:", round(top1_acc2, 3))

# ---- Top-3 Accuracy ----
correct = 0
for i, true_label in enumerate(y_test2):
    if true_label in top3_preds2[i]:  # true role is among the top 3 predictions
        correct += 1

top3_acc2 = correct / len(y_test2)
print("Role Top-3 Accuracy:", round(top3_acc2, 3))