In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('UseFul_Dataset.csv')
df.head()

Unnamed: 0,Position,Job Skills,Experience_job,Job Location,Name,Skills,Experience_candidate,Location,Final Score
0,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",John Smith,"Python, Java, React, SQL, Git",2 years,"Boston, USA",92.0
1,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",Emily Johnson,"Python, C++, JavaScript, AWS, Git",1 years,"Los Angeles, USA",54.560971
2,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",William Carter,"C++, Kubernetes, Docker, Microservices",0 years,"Dallas, USA",40.0
3,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",Marco Romano,"JavaScript, Node.js, GraphQL, TypeScript",5 years,"Milan, Italy",35.0
4,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",Amelia Collins,"Swift, Kotlin, Mobile App Development",1 years,"Melbourne, Australia",30.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Position              128 non-null    object 
 1   Job Skills            128 non-null    object 
 2   Experience_job        128 non-null    object 
 3   Job Location          128 non-null    object 
 4   Name                  128 non-null    object 
 5   Skills                128 non-null    object 
 6   Experience_candidate  128 non-null    object 
 7   Location              128 non-null    object 
 8   Final Score           128 non-null    float64
dtypes: float64(1), object(8)
memory usage: 9.1+ KB


# Skill similarity feature

We will represent the job's required skills and the candidate's skills as vectors, compare their similarity, and store the resulting score as a new feature.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# TF-IDF vectorizer with scikit-learn's default settings; it is fitted on the skill strings in the next cell.
# NOTE(review): the default token_pattern presumably keeps only tokens of two or more word
# characters, so skills such as "C++" or "R" may contribute nothing to the vocabulary —
# confirm with vectorizer.get_feature_names_out() after fitting.
vectorizer = TfidfVectorizer()

In [None]:
# Learn one shared vocabulary from both text columns: all job-skill strings come first,
# followed by all candidate-skill strings, so the matrix can be cut in two afterwards.
all_skills = pd.concat([df["Job Skills"], df["Skills"]], ignore_index=True).tolist()
tfidf_matrix = vectorizer.fit_transform(all_skills)

In [None]:
# joblib is imported in this cell because the notebook's `import joblib` cell only
# appears further down; without it this cell raises NameError on a fresh kernel
# (Restart Kernel -> Run All).
import joblib

# Persist the fitted vectorizer so the same vocabulary can be reloaded later.
joblib.dump(vectorizer, "vectorizer.pkl")
print("Vectorizer saved successfully!")

Vectorizer saved successfully!


In [None]:
import pickle

In [None]:
# Serialise the fitted vectorizer with the standard-library pickle module.
# NOTE(review): this opens the same "vectorizer.pkl" path in "wb" mode that the earlier
# joblib.dump cell wrote to, so the file on disk ends up being this plain-pickle version —
# confirm that both saves are really needed.
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [None]:
# all_skills was built as [job rows..., candidate rows...], so cutting the matrix at
# len(df) recovers the two groups in their original row order.
job_skills_vectors = tfidf_matrix[:len(df)]        # rows 0 .. len(df)-1 -> job skills
candidate_skills_vectors = tfidf_matrix[len(df):]  # remaining rows      -> candidate skills

In [None]:
# Pair each job-skill row with the candidate-skill row at the same position and keep
# the single value of the resulting 1x1 cosine-similarity matrix.
df["Skill Match Score"] = [
    cosine_similarity(job_vec, candidate_vec)[0, 0]
    for job_vec, candidate_vec in zip(job_skills_vectors, candidate_skills_vectors)
]

In [None]:
df['Skill Match Score']

Unnamed: 0,Skill Match Score
0,1.000000
1,0.351861
2,0.000000
3,0.000000
4,0.000000
...,...
123,0.000000
124,0.305154
125,0.132401
126,0.000000


Next, we extract the country from each location and check whether the job and the candidate are in the same country: 1 if they are, 0 otherwise.


In [None]:
def extract_country(location):
  """Return the text after the last comma of ``location``, without surrounding whitespace.

  If ``location`` contains no comma, the whole string (stripped) is returned.
  """
  _, _, country = location.rpartition(',')
  return country.strip()

In [None]:
# Country of the job posting: the last comma-separated part of 'Job Location'
df['Job Country'] = df['Job Location'].map(extract_country)

In [None]:
df['Job Country']

Unnamed: 0,Job Country
0,USA
1,USA
2,USA
3,USA
4,USA
...,...
123,USA
124,USA
125,Indonesia
126,Indonesia


In [None]:
# Country of the candidate: the last comma-separated part of 'Location'
df['Candidate_Country'] = df['Location'].map(extract_country)

In [None]:
# Binary flag: 1 when the two country strings are equal, 0 otherwise
df['Same Country'] = df['Job Country'].eq(df['Candidate_Country']).astype(int)

In [None]:
df['Same Country']

Unnamed: 0,Same Country
0,1
1,1
2,1
3,0
4,0
...,...
123,0
124,0
125,0
126,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Position              128 non-null    object 
 1   Job Skills            128 non-null    object 
 2   Experience_job        128 non-null    object 
 3   Job Location          128 non-null    object 
 4   Name                  128 non-null    object 
 5   Skills                128 non-null    object 
 6   Experience_candidate  128 non-null    object 
 7   Location              128 non-null    object 
 8   Final Score           128 non-null    float64
 9   Skill Match Score     128 non-null    float64
 10  Job Country           128 non-null    object 
 11  Candidate_Country     128 non-null    object 
 12  Same Country          128 non-null    int64  
dtypes: float64(2), int64(1), object(10)
memory usage: 13.1+ KB


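Next, we check whether the candidate's experience matches the experience required by the job and store the result in a new column: 0 if the candidate has less experience than required, 1 if it falls within the required range, and 2 if the candidate has more.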

In [None]:
import re

In [None]:
def extract_years(exp):
  """Return the first run of digits found in ``str(exp)`` as an int, or 0 if there is none."""
  first_number = re.search(r'\d+', str(exp))
  if first_number is None:
    return 0
  return int(first_number.group())

In [None]:
def extract_experience_range(exp_range):
  """Parse a job's experience requirement into a ``(min_years, max_years)`` tuple.

  The input is converted with ``str()`` and every run of digits is collected.

  - exactly two numbers (e.g. ``"0-2 years"``): returned as ``(first, second)``
  - any other non-zero count (e.g. ``"3 years"``): ``(first, first)``
  - no digits at all (missing value, free text): ``(0, 0)``

  The last case used to raise ``IndexError``; returning zeros matches
  ``extract_years``, which also falls back to 0 when no number is present.
  """
  years = [int(number) for number in re.findall(r'\d+', str(exp_range))]
  if not years:
    # Nothing to parse: fall back to a zero-width range instead of crashing on years[0].
    return (0, 0)
  if len(years) == 2:
    return (years[0], years[1])
  # A single number (or more than two): use the first value for both bounds.
  return (years[0], years[0])

In [None]:
# Parse every job requirement into a (min, max) tuple, then unzip the tuples into two columns
experience_bounds = df['Experience_job'].map(extract_experience_range)
df['Experience_Min'], df['Experience_Max'] = zip(*experience_bounds)

# Replace the candidate's experience text with the first number it contains
df['Experience_candidate'] = df['Experience_candidate'].map(extract_years)

In [None]:
df['Experience_candidate']

Unnamed: 0,Experience_candidate
0,2
1,1
2,0
3,5
4,1
...,...
123,3
124,5
125,1
126,2


In [None]:
def classify_experience(min_exp, max_exp, candidate_exp):
    """Encode how the candidate's experience compares with the job's required range.

    Returns:
        0 if ``candidate_exp`` is below ``min_exp``,
        1 if it lies within ``[min_exp, max_exp]`` (bounds included),
        2 otherwise (more experience than required).
    """
    if candidate_exp < min_exp:
        return 0
    return 1 if min_exp <= candidate_exp <= max_exp else 2

In [None]:
# Classify each row from its parsed bounds and the candidate's years of experience
df['Experience Match'] = [
    classify_experience(low, high, candidate_years)
    for low, high, candidate_years in zip(
        df['Experience_Min'], df['Experience_Max'], df['Experience_candidate']
    )
]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Position              128 non-null    object 
 1   Job Skills            128 non-null    object 
 2   Experience_job        128 non-null    object 
 3   Job Location          128 non-null    object 
 4   Name                  128 non-null    object 
 5   Skills                128 non-null    object 
 6   Experience_candidate  128 non-null    int64  
 7   Location              128 non-null    object 
 8   Final Score           128 non-null    float64
 9   Skill Match Score     128 non-null    float64
 10  Job Country           128 non-null    object 
 11  Candidate_Country     128 non-null    object 
 12  Same Country          128 non-null    int64  
 13  Experience_Min        128 non-null    int64  
 14  Experience_Max        128 non-null    int64  
 15  Experience Match      1

In [None]:
df['Experience Match']

Unnamed: 0,Experience Match
0,1
1,1
2,1
3,2
4,1
...,...
123,2
124,2
125,1
126,1


In [None]:
df.head()

Unnamed: 0,Position,Job Skills,Experience_job,Job Location,Name,Skills,Experience_candidate,Location,Final Score,Skill Match Score,Job Country,Candidate_Country,Same Country,Experience_Min,Experience_Max,Experience Match
0,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",John Smith,"Python, Java, React, SQL, Git",2,"Boston, USA",92.0,1.0,USA,USA,1,0,2,1
1,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",Emily Johnson,"Python, C++, JavaScript, AWS, Git",1,"Los Angeles, USA",54.560971,0.351861,USA,USA,1,0,2,1
2,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",William Carter,"C++, Kubernetes, Docker, Microservices",0,"Dallas, USA",40.0,0.0,USA,USA,1,0,2,1
3,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",Marco Romano,"JavaScript, Node.js, GraphQL, TypeScript",5,"Milan, Italy",35.0,0.0,USA,Italy,0,0,2,2
4,Software Engineer,"Python, Java, React, SQL, Git",0-2 years,"New York, USA",Amelia Collins,"Swift, Kotlin, Mobile App Development",1,"Melbourne, Australia",30.0,0.0,USA,Australia,0,0,2,1


In [None]:
# Drop the raw location/experience columns and the intermediate helper columns;
# the derived features (Skill Match Score, Same Country, Experience Match) stay.
cols_to_drop = [
    'Experience_job',
    'Experience_candidate',
    'Job Location',
    'Job Country',
    'Candidate_Country',
    'Experience_Min',
    'Experience_Max',
    'Location',
]
new_df = df.drop(columns=cols_to_drop, errors='ignore')
new_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Position           128 non-null    object 
 1   Job Skills         128 non-null    object 
 2   Name               128 non-null    object 
 3   Skills             128 non-null    object 
 4   Final Score        128 non-null    float64
 5   Skill Match Score  128 non-null    float64
 6   Same Country       128 non-null    int64  
 7   Experience Match   128 non-null    int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 8.1+ KB


# Training

Now we will train the model.

In [None]:
# Target: the final score
y = new_df['Final Score']

# Features: whatever remains once the text/identifier columns and the target are removed
X = new_df.drop(
    columns=[
        "Position",
        "Job Skills",
        "Name",
        "Skills",
        "Final Score",
    ]
)

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Skill Match Score  128 non-null    float64
 1   Same Country       128 non-null    int64  
 2   Experience Match   128 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 3.1 KB


In [None]:
# Give the feature columns snake_case names; these are the names the model is fitted with
X = X.rename(
    columns={
        "Skill Match Score": "skill_match_score",
        "Same Country": "is_same_country",
        "Experience Match": "experience_match",
    }
)

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   skill_match_score  128 non-null    float64
 1   is_same_country    128 non-null    int64  
 2   experience_match   128 non-null    int64  
dtypes: float64(1), int64(2)
memory usage: 3.1 KB


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Random-forest regressor with 100 trees; random_state is fixed so refits are repeatable
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Fit the regressor on the training split (the three engineered features -> Final Score)
model.fit(X_train, y_train)

In [None]:
# Score the held-out split: mean absolute error and R² of the predicted final scores
y_test_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print("Test MAE:", test_mae)
print("Test R² Score:", test_r2)

Test MAE: 2.7184294451562065
Test R² Score: 0.9553281973068717


# Saving the model now!

In [None]:
import joblib

In [None]:
# Persist the trained model with joblib; the returned list of written filenames is the cell output
joblib.dump(model, "job_match_model.pkl")

['job_match_model.pkl']

In [None]:
# Serialise the trained model with the standard-library pickle module.
# NOTE(review): this opens the same "job_match_model.pkl" path in "wb" mode that the
# joblib.dump cell just wrote to, so the file on disk ends up being this plain-pickle
# version — confirm that both saves are really needed.
with open("job_match_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [None]:
print("Model trained on:", model.feature_names_in_)

Model trained on: ['skill_match_score' 'is_same_country' 'experience_match']
