In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Load the job-roles CSV (decoded as latin1) into a DataFrame and print it.
# NOTE(review): the path is an absolute "/content/..." location — presumably a
# Colab upload; confirm the file is present before re-running elsewhere.
data = pd.read_csv("/content/IT_Job_Roles_Skills.csv", encoding='latin1')
print(data)

                                             Job Title  \
0                                       Admin Big Data   
1                          Ansible Operations Engineer   
2                            Artifactory Administrator   
3    Artificial Intelligence / Machine Learning Leader   
4    Artificial Intelligence / Machine Learning Sr....   
..                                                 ...   
488                 EFFECTS TECHNICAL DIRECTOR (FX TD)   
489                                      LAYOUT ARTIST   
490                                      AI RESEARCHER   
491                              AI SOFTWARE ARCHITECT   
492                    BUSINESS INTELLIGENCE DEVELOPER   

                                       Job Description  \
0    Responsible for managing and overseeing big da...   
1    Focuses on automating IT processes using Ansib...   
2    Manages the Artifactory repository for build a...   
3    Leads AI/ML projects and teams, defining strat...   
4    Senior r

In [27]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Job Title        493 non-null    object
 1   Job Description  493 non-null    object
 2   Skills           493 non-null    object
 3   Certifications   493 non-null    object
dtypes: object(4)
memory usage: 15.5+ KB


In [28]:
# Build the TF-IDF vectorizer: input is lower-cased and scikit-learn's
# built-in English stop-word list is removed.
# (TfidfVectorizer is already imported in the first cell; this import repeats it.)
# NOTE(review): with the default token pattern, one-character tokens such as
# "C" or "R" are presumably dropped — confirm against the Skills column.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

In [29]:
# Turn the "Skills" text into a TF-IDF feature matrix and use the job titles
# as the prediction target.
df = pd.DataFrame(data)
# fit_transform also learns the vocabulary that later vectorizer.transform calls reuse.
X = vectorizer.fit_transform(df["Skills"])
y = df["Job Title"]

In [30]:
# Learn the job-title -> integer mapping, then encode the target with it.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed seed keeps the split
# identical across kernel restarts so downstream results are repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [32]:
# Dimensions of the training split of the TF-IDF matrix.
x_train.shape

(345, 565)

In [33]:
# Dimensions of the held-out test split of the TF-IDF matrix.
x_test.shape

(148, 565)

In [34]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# .toarray() converts the TF-IDF matrix to a dense array before fitting
# (presumably because GaussianNB does not accept sparse input — confirm).
# NOTE(review): NB is not referenced again in the visible cells; confirm
# whether this model is still needed.
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
NB.fit(x_train.toarray(),y_train)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
models = {
    "Naive Bayes":        MultinomialNB(),
    # probability=True enables predict_proba; random_state pins the internal
    # shuffling used for the probability estimates so runs are repeatable.
    "SVM":                SVC(kernel='linear', probability=True, random_state=42),
    "Random Forest":      RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "kNN":                KNeighborsClassifier(n_neighbors=3),
    "Decision Tree":      DecisionTreeClassifier(random_state=42),
}

# Train all models on the full TF-IDF matrix with integer-encoded job titles.
for model in models.values():
    model.fit(X, y_encoded)



def predict_job(input_skills: str, top_n: int = 3):
    """Recommend a job title for a free-text skills string and print a report.

    Two approaches are combined:

    1. Cosine similarity between the TF-IDF vector of ``input_skills`` and
       every row of the training matrix ``X``.
    2. Each classifier in ``models``. The ``top_n`` most probable titles are
       listed when the model exposes ``predict_proba``; otherwise its single
       prediction is listed with a nominal score of 100.

    Every model's first choice plus the best cosine match cast one vote each.
    The title with the most votes is returned; ties keep first-voted order.

    Relies on the notebook globals ``vectorizer``, ``X``, ``df``, ``models``
    and ``le``.

    Args:
        input_skills: Skills text, e.g. a comma-separated list.
        top_n: Number of candidates to list per method; must be at least 1.

    Returns:
        The recommended job title.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    from collections import Counter

    # An empty candidate list would otherwise fail later with an opaque IndexError.
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    def score_bar(score):
        # One block character per 5 percentage points.
        return "‚ñà" * int(score / 5)

    # TF-IDF transform
    input_vec = vectorizer.transform([input_skills])
    if input_vec.nnz == 0:
        # No query term exists in the fitted vocabulary, so every similarity
        # is zero and the rankings below carry no signal about the input.
        print("\nWarning: none of the entered skills match the known vocabulary; "
              "the suggestions below are not meaningful.")

    job_titles = df['Job Title']

    # --- Method 1: Cosine Similarity (baseline) ---
    similarities = cosine_similarity(input_vec, X)[0]
    top_indices = similarities.argsort()[::-1][:top_n]

    print("\nüìå METHOD 1: TF-IDF Cosine Similarity")
    print("-" * 40)
    for rank, idx in enumerate(top_indices, 1):
        score = similarities[idx] * 100
        # idx is a row position in X, so the title is looked up positionally.
        print(f"  {rank}. {job_titles.iloc[idx]}")
        print(f"     Match: {score_bar(score)} {score:.1f}%")

    # --- Method 2: ML Models ---
    print("\nüìå METHOD 2: ML Model Predictions")
    print("-" * 40)

    model_results = {}
    for name, model in models.items():
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(input_vec)[0]
            top_idx = proba.argsort()[::-1][:top_n]
            # predict_proba columns follow model.classes_, so column positions
            # are mapped to encoded labels before decoding them to titles.
            titles = le.inverse_transform(model.classes_[top_idx])
            results = [(title, proba[i] * 100) for title, i in zip(titles, top_idx)]
        else:
            pred = model.predict(input_vec)[0]
            results = [(le.inverse_transform([pred])[0], 100.0)]

        model_results[name] = results
        print(f"\n  ü§ñ {name}:")
        for title, score in results:
            print(f"     ‚Üí {title}")
            print(f"        {score_bar(score)} {score:.1f}%")

    # --- Voting: Best Overall ---
    print("\n" + "=" * 60)
    print("üèÜ BEST MATCH (Majority Vote)")
    print("=" * 60)

    # One vote per model for its top-ranked title...
    votes = Counter(results[0][0] for results in model_results.values())

    # ...plus one vote for the best cosine-similarity match.
    cos_top = job_titles.iloc[top_indices[0]]
    votes[cos_top] += 1

    # most_common() sorts by count, keeping insertion order among ties.
    best = votes.most_common()
    for title, vote_count in best:
        stars = "‚≠ê" * vote_count
        print(f"  {stars} {title} ({vote_count} votes)")

    print(f"\n  ‚úÖ RECOMMENDED: {best[0][0]}")
    print("=" * 60)

    return best[0][0]


# ========== INTERACTIVE MODE ==========
# Print the usage banner once, then keep prompting until an exit word is typed.
print(
    "\nüí° HOW TO USE:",
    "   Type your skills separated by commas",
    "   Type 'quit' to exit\n",
    sep="\n",
)

while True:
    print("\n" + "-" * 60)
    user_input = input("üéØ Enter your skills: ").strip()

    # Blank entry: ask again without running a prediction.
    if not user_input:
        print("‚ö†Ô∏è  Please enter at least one skill!")
        continue

    # Any exit word (case-insensitive) ends the session.
    if user_input.lower() in ('quit', 'exit', 'q'):
        print("\nüëã Bye!\n")
        break

    result = predict_job(user_input)



üí° HOW TO USE:
   Type your skills separated by commas
   Type 'quit' to exit


------------------------------------------------------------
üéØ Enter your skills: python

üìå METHOD 1: TF-IDF Cosine Similarity
----------------------------------------
  1. PYTHON ARCHITECT
     Match: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 36.0%
  2. Entry Level Programmer
     Match: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 31.4%
  3. FULL STACK PYTHON DEVELOPER/PROGRAMMER/ENGINEER
     Match: ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 30.3%

üìå METHOD 2: ML Model Predictions
----------------------------------------

  ü§ñ Naive Bayes:
     ‚Üí Senior DevOps Engineer
         0.6%
     ‚Üí DevOps Engineer
         0.6%
     ‚Üí Machine Learning Engineer
         0.5%

  ü§ñ SVM:
     ‚Üí BUSINESS SYSTEMS ANALYST
         0.8%
     ‚Üí INFORMATION ARCHITECT
         0.8%
     ‚Üí MOBILE APP DEVELOPER
         0.7%

  ü§ñ Random Forest:
     ‚Üí PYTHON ARCHITECT
        ‚ñà‚ñà‚ñà 16.0%
     ‚Üí NATURAL LANGUAGE PROCESSING ENGINEER
        ‚ñà 6.0%
     ‚Ü

In [None]:
import pickle

# Persist the trained classifiers (file name and contents unchanged).
with open("job_matcher.pkl", "wb") as f:
    pickle.dump(models, f)

# The classifiers alone cannot be used in a new session: queries must be
# vectorised with the same fitted TF-IDF vocabulary and predictions decoded
# with the same label encoder, so both are saved alongside the models.
# NOTE: only unpickle these files from a trusted source — pickle.load can
# execute arbitrary code.
with open("job_matcher_preprocessing.pkl", "wb") as f:
    pickle.dump({"vectorizer": vectorizer, "label_encoder": le}, f)