In [9]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('Job Prediction By Resume.csv')

In [10]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Name,Gender,Course in UG,UG specialization? Major Subject (Eg; Mathematics),Interests,Skills,Average CGPA or Percentage obtained in under graduation,Certificate course title,Work in the past,First Job title in your current field of work,title job,Match Percentage
0,A.Uha Priya,Female,B.Sc,Computer Applications,Cloud computing,Python;SQL;Java,85.0,"Linux,Git",Yes,Software,Software Developer (Female Leader),58
1,Aadil,Male,B.E,Computer Science Engineering,Technology,"Critical Thinking, Analytic Thinking, SQL, Pro...",66.5,Microsoft certification,Yes,Computer Software Engineer,IT Specialist,57
2,Aakriti,Female,BA,Psychology,Understand human behaviour,People management;Communication skills,64.6,Resilience psychology,No,,Clinical Psychologist,70
3,Aanchal sharma,Female,MBA,Commerce,Sales/Marketing;Trading;Understand human behav...,Accounting Skills;Critical Thinking,75.525,No,Yes,Relationships manager,Marketing Strategist,100
4,Aangkeeta Sarkar,Female,B.Tech,Instrumentation Engineering,Technology,\nPLC Allen Bradley;PLC Ladder Logic;LabVIEW;B...,70.68,Extreme Productivity (Blinkist Summary),Yes,Plant Instrumentation Engineer,IT Specialist,91


In [11]:
# Count missing values per column to see which columns need imputation.
df.isna().sum()

Name                                                          0
Gender                                                        0
Course in UG                                                  0
 UG specialization? Major Subject (Eg; Mathematics)           0
Interests                                                     0
Skills                                                       21
Average CGPA or Percentage obtained in under graduation       0
Certificate course title                                     18
Work in the past                                              0
First Job title in your current field of work              4877
title job                                                     0
Match Percentage                                              0
dtype: int64

In [12]:
# Inspect the raw skill strings; entries are free text that mixes ';' and ',' as separators.
df['Skills'].unique()

array(['Python;SQL;Java',
       'Critical Thinking, Analytic Thinking, SQL, Programming, Work under Pressure, Logical Skills, Problem Solving skills',
       'People management;Communication skills', ...,
       'SQL, Problem Solving skills',
       'Programming Language skills;Critical Thinking;Analytical Skills;Editing;Data Visualization skills( Power Bi/ Tableau );SQL',
       'AI, Java, SQL, C++, R, Linux'], dtype=object)

In [13]:
# Impute missing skills with the most frequent value. The result is assigned
# back to the column: calling fillna(..., inplace=True) on `df['Skills']`
# mutates an intermediate Series, which pandas 2.x warns about and which no
# longer updates `df` once Copy-on-Write is enabled.
mode_value = df['Skills'].dropna().mode()[0]
df['Skills'] = df['Skills'].fillna(mode_value)

In [14]:
# Shorten the long survey column name. The source header ends with a trailing
# space, so the key below must keep that space for the rename to match.
df = df.rename(columns={'First Job title in your current field of work ': 'First Job in the field'})

In [15]:
# Confirm the rename took effect.
df.columns

Index(['Name', 'Gender', 'Course in UG',
       ' UG specialization? Major Subject (Eg; Mathematics)', 'Interests',
       'Skills', 'Average CGPA or Percentage obtained in under graduation',
       'Certificate course title', 'Work in the past',
       'First Job in the field', 'title job', 'Match Percentage'],
      dtype='object')

In [16]:
# Impute missing certificate titles with the most frequent value. Assign the
# filled Series back instead of using fillna(..., inplace=True) on the column
# selection, which acts on an intermediate object (deprecated in pandas 2.x and
# a no-op for `df` under Copy-on-Write).
mode_value = df['Certificate course title'].dropna().mode()[0]
df['Certificate course title'] = df['Certificate course title'].fillna(mode_value)

In [17]:
# Re-check missing counts after imputing 'Skills' and 'Certificate course title'.
df.isna().sum()

Name                                                          0
Gender                                                        0
Course in UG                                                  0
 UG specialization? Major Subject (Eg; Mathematics)           0
Interests                                                     0
Skills                                                        0
Average CGPA or Percentage obtained in under graduation       0
Certificate course title                                      0
Work in the past                                              0
First Job in the field                                     4877
title job                                                     0
Match Percentage                                              0
dtype: int64

In [18]:
# Impute missing first-job titles with the most frequent value. Assign the
# filled Series back instead of using fillna(..., inplace=True) on the column
# selection, which acts on an intermediate object (deprecated in pandas 2.x and
# a no-op for `df` under Copy-on-Write).
# NOTE(review): a large share of this column is missing, so every such row
# receives the same label; confirm that is acceptable before using the column
# as a feature.
mode_value = df['First Job in the field'].dropna().mode()[0]
df['First Job in the field'] = df['First Job in the field'].fillna(mode_value)

In [19]:
# Verify that no missing values remain.
df.isna().sum()

Name                                                       0
Gender                                                     0
Course in UG                                               0
 UG specialization? Major Subject (Eg; Mathematics)        0
Interests                                                  0
Skills                                                     0
Average CGPA or Percentage obtained in under graduation    0
Certificate course title                                   0
Work in the past                                           0
First Job in the field                                     0
title job                                                  0
Match Percentage                                           0
dtype: int64

In [20]:
# Preview the frame after imputation.
df.head()

Unnamed: 0,Name,Gender,Course in UG,UG specialization? Major Subject (Eg; Mathematics),Interests,Skills,Average CGPA or Percentage obtained in under graduation,Certificate course title,Work in the past,First Job in the field,title job,Match Percentage
0,A.Uha Priya,Female,B.Sc,Computer Applications,Cloud computing,Python;SQL;Java,85.0,"Linux,Git",Yes,Software,Software Developer (Female Leader),58
1,Aadil,Male,B.E,Computer Science Engineering,Technology,"Critical Thinking, Analytic Thinking, SQL, Pro...",66.5,Microsoft certification,Yes,Computer Software Engineer,IT Specialist,57
2,Aakriti,Female,BA,Psychology,Understand human behaviour,People management;Communication skills,64.6,Resilience psychology,No,Student (Unemployed),Clinical Psychologist,70
3,Aanchal sharma,Female,MBA,Commerce,Sales/Marketing;Trading;Understand human behav...,Accounting Skills;Critical Thinking,75.525,No,Yes,Relationships manager,Marketing Strategist,100
4,Aangkeeta Sarkar,Female,B.Tech,Instrumentation Engineering,Technology,\nPLC Allen Bradley;PLC Ladder Logic;LabVIEW;B...,70.68,Extreme Productivity (Blinkist Summary),Yes,Plant Instrumentation Engineer,IT Specialist,91


In [21]:
# --- 1. Data Preparation (Silent Mode) ---
# Small-dataset fallback: when fewer than `num_additional_rows + 10` rows are
# available, pad the frame with synthetic rows resampled from the existing ones.
# The previous version tested `isinstance(df[k], list)`, which is never true for
# a DataFrame column, so each synthetic "row" held whole Series objects and the
# text columns were never resampled.
num_additional_rows = 160
if 0 < df.shape[0] < num_additional_rows + 10:
    rng = np.random.default_rng(42)  # seeded so a re-run yields the same synthetic rows
    cgpa_col = 'Average CGPA or Percentage obtained in under graduation'
    text_cols = ['Interests', 'Skills', 'Work in the past']
    # Non-missing values that each text column can be resampled from.
    text_pools = {col: df[col].dropna().to_numpy() for col in text_cols}

    synthetic_rows = []
    for _ in range(num_additional_rows):
        # Start from a copy of a randomly chosen existing row (one scalar per column).
        df_row = df.iloc[rng.integers(0, len(df))].to_dict()
        # NOTE(review): 6.0-9.9 looks like a 10-point CGPA range, while the
        # previewed rows hold percentage-like values (e.g. 85.0, 66.5) -- confirm
        # the intended scale.
        df_row[cgpa_col] = rng.uniform(6.0, 9.9)
        for text_col in text_cols:
            pool = text_pools[text_col]
            # Roughly 10% of synthetic text cells are left missing on purpose.
            if rng.random() < 0.1 or pool.size == 0:
                df_row[text_col] = np.nan
            else:
                df_row[text_col] = rng.choice(pool)
        synthetic_rows.append(df_row)

    # Build the synthetic frame once instead of concatenating row by row.
    df = pd.concat([df, pd.DataFrame(synthetic_rows)], ignore_index=True)

# --- 2. Feature Engineering ---
# The target is the job title; the candidate name is an identifier, not a feature.
target_column_name = 'title job'
y = df[target_column_name]
X = df.loc[:, ~df.columns.isin([target_column_name, 'Name'])].copy()

# --- 3. Feature Definitions ---
# Columns are grouped by how the preprocessing step treats them.
categorical_features = [
    'Gender',
    'Course in UG',
    ' UG specialization? Major Subject (Eg; Mathematics)',  # leading space is part of the column name
    'Certificate course title',
]
numerical_features = [
    'Average CGPA or Percentage obtained in under graduation',
]
text_features = [
    'Interests',
    'Skills',
    'Work in the past',
]

# --- 4. Preprocessing Pipeline ---
def combine_text_columns(df_input):
    """Join the text feature columns of each row into one space-separated string.

    Missing values are replaced by empty strings before joining, and leading or
    trailing whitespace is stripped from the combined result.
    """
    text_only = df_input[text_features].fillna('').astype(str)
    joined = text_only.apply(lambda row: ' '.join(row), axis=1)
    return joined.str.strip()


# Text branch: merge the text columns, then vectorise unigrams and bigrams.
text_transformer = Pipeline(steps=[
    ('combiner', FunctionTransformer(combine_text_columns, validate=False)),
    ('tfidf', TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1, 2))),
])

# Numeric branch: median-impute, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Columns not listed in any branch are discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('num', numeric_transformer, numerical_features),
        ('text', text_transformer, text_features),
    ],
    remainder='drop',
)

# --- 5. Train/Test Split ---
# Split first so that nothing fitted below can see the held-out labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# --- 6. Model Pipeline ---
# Class weights are derived from the training labels only; computing them from
# the full `y` would let the test-set label distribution influence the model.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        min_samples_split=5,
        class_weight=class_weights,
        random_state=42
    ))
])

# --- 7. Evaluation ---
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# --- FINAL OUTPUT --- ONLY ACCURACY ---
print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Model Accuracy: 0.7497


In [22]:
def _prompt_float(prompt):
    """Prompt repeatedly until the user enters a finite number; return it as a float."""
    while True:
        raw = input(prompt).strip()
        try:
            value = float(raw)
        except ValueError:
            print(f"'{raw}' is not a number. Please enter a numeric value.")
            continue
        # float() accepts 'nan' and 'inf'; reject them so they never reach the model.
        if not np.isfinite(value):
            print("Please enter a finite numeric value.")
            continue
        return value


def get_job_prediction(model):
    """Collect candidate details interactively and return the predicted job.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and ``predict_proba`` that accepts
        a one-row DataFrame with the columns built below.

    Returns
    -------
    The first element of ``model.predict`` for the entered candidate. The
    prediction and the highest class probability are also printed.

    Notes
    -----
    A non-numeric or non-finite CGPA/percentage entry is re-prompted instead of
    raising ``ValueError`` and aborting the session.
    """
    print("\n=== Enter Candidate Details ===")

    # Prompts run in dict order; keys must match the training column names
    # exactly (including the leading space in the specialization column).
    input_data = {
        'Gender': [input("Gender (Male/Female/Other): ").strip()],
        'Course in UG': [input("Undergraduate Degree (e.g. B.E, B.Tech): ").strip()],
        ' UG specialization? Major Subject (Eg; Mathematics)': [input("Major/Specialization: ").strip()],
        'Certificate course title': [input("Certifications (if any): ").strip()],
        # NOTE(review): the prompt accepts both a 10-point and a 100-point scale;
        # confirm which scale the training column uses.
        'Average CGPA or Percentage obtained in under graduation': [
            _prompt_float("CGPA/Percentage (6.0-10.0 or 60-100): ")
        ],
        'Interests': [input("Professional Interests: ").strip()],
        'Skills': [input("Skills (comma separated): ").strip()],
        'Work in the past': [input("Work Experience: ").strip()]
    }

    input_df = pd.DataFrame(input_data)
    prediction = model.predict(input_df)[0]
    # Confidence is the largest class probability for this single row.
    probability = model.predict_proba(input_df)[0].max()

    print(f"\nPredicted Job: {prediction} (Confidence: {probability:.2f})")
    return prediction

# === Main Loop ===
# Keep predicting until the user answers anything other than 'y'.
if __name__ == "__main__":
    print("\n--- Job Prediction System ---")
    keep_going = True
    while keep_going:
        _ = get_job_prediction(model_pipeline)
        reply = input("\nMake another prediction? (y/n): ")
        keep_going = reply.strip().lower() == 'y'
    print("\nThank you for using the Job Prediction System!")



--- Job Prediction System ---

=== Enter Candidate Details ===


Gender (Male/Female/Other):  Male
Undergraduate Degree (e.g. B.E, B.Tech):  B.Tech
Major/Specialization:  Maths
Certifications (if any):  No
CGPA/Percentage (6.0-10.0 or 60-100):  68.76
Professional Interests:  Technology
Skills (comma separated):  PLC Allen Bradley;PLC Ladder Logic;LabVIEW
Work Experience:  Yes



Predicted Job: IT Specialist (Confidence: 0.13)



Make another prediction? (y/n):  n



Thank you for using the Job Prediction System!
