Importing needed libraries and modules

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.sparse import hstack
import numpy as np

Loading Data and then treating null values

In [3]:
df = pd.read_csv('resume_data.csv')

In [4]:
threshold = 0.7
missing_percent = df.isnull().mean()
columns_to_keep = missing_percent[missing_percent < threshold].index.tolist()
df = df[columns_to_keep]

In [5]:
if 'career_objective' in df.columns:
    df.drop(columns=['career_objective'], inplace=True)

In [6]:
text_columns = df.select_dtypes(include='object').columns.tolist()
df[text_columns] = df[text_columns].fillna("Unknown")

In [7]:
numeric_columns = df.select_dtypes(include='number').columns.tolist()
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

Removing Outliers

In [8]:
for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

Normalization

In [9]:
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

TF - IDF Vectorization

In [10]:
text_features = ['skills', 'responsibilities', 'skills_required', 'related_skils_in_job']
vectorizers = {col: TfidfVectorizer(max_features=100) for col in text_features}
tfidf_matrices = [vectorizers[col].fit_transform(df[col]) for col in text_features]
X_text = hstack(tfidf_matrices)

Ridge regression

In [11]:
y = df['matched_score'].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

In [13]:
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

Evaluation

In [14]:
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [15]:
print("Model Evaluation:")
print(f"R² Score: {r2:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")

Model Evaluation:
R² Score: 0.406
MAE: 0.139
RMSE: 0.175


Testing on random sample input

In [16]:
sample_resume = {
    "skills": "Python, Machine Learning, Data Analysis, Pandas, Scikit-learn",
    "responsibilities": "Developed models for classification and regression tasks.",
    "skills_required": "Machine Learning, Python, Scikit-learn",
    "related_skils_in_job": "ML, Python, Data Science"
}

In [17]:
sample_tfidf = [vectorizers[key].transform([sample_resume[key]]) for key in text_features]
sample_combined = hstack(sample_tfidf)

In [18]:
predicted_score = model.predict(sample_combined)
print(f"Predicted Matched Score: {predicted_score[0]:.3f}")

Predicted Matched Score: 0.530
