In [1]:
import numpy as np 
import pandas as pd 
import warnings

import os
# Kaggle boilerplate: print the full path of every file found under the
# input directory so the dataset paths used below can be confirmed.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Silence every warning category for the rest of the session.
# NOTE(review): `Warning` is the base class of all warning categories, so this
# also hides deprecation and numerical warnings from pandas / scikit-learn —
# confirm that blanket suppression is intended.
warnings.simplefilter('ignore', Warning)

/kaggle/input/resume-dataset-for-resume-ranking-group-10/df_cleaned.csv
/kaggle/input/resume-dataset-for-resume-ranking-group-10/resume_data.csv


In [2]:
# Load the feature table from the Kaggle input directory
# (presumably the already-cleaned data, going by the file name).
csv_path = "/kaggle/input/resume-dataset-for-resume-ranking-group-10/df_cleaned.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to sanity-check the columns and value ranges.
df.head(n=5)

Unnamed: 0,matched_score,education_similarity,experience_years,cosine_similarity_skills,highest_degree,ed_req_encoded,exp_req_encoded
0,0.85,0.318784,5.5,0.0,4,4,1
1,0.75,0.375598,5.66,0.0,5,5,5
2,0.416667,0.093495,6.92,0.0,4,5,3
3,0.76,0.0,13.83,0.0,5,4,1
4,0.65,0.312103,17.33,0.0,4,4,4


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8560, 7)

In [5]:
# Per-column overview: dtype, non-null count, distinct values and null count.
# Every Series below is indexed by column name, so they align row-for-row;
# the index is then promoted to the leading 'feature' column.
column_stats = {
    'type': df.dtypes,
    'count': df.count(),
    'nunique': df.nunique(),
    'null': df.isna().sum(),
}
desc = pd.DataFrame(column_stats).rename_axis('feature').reset_index()

print(desc)

                    feature     type  count  nunique  null
0             matched_score  float64   8560      324     0
1      education_similarity  float64   8560     2265     0
2          experience_years  float64   8560      189     0
3  cosine_similarity_skills  float64   8560     2783     0
4            highest_degree    int64   8560        5     0
5            ed_req_encoded    int64   8560        2     0
6           exp_req_encoded    int64   8560        7     0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): the output shows a minimum of -1 for `highest_degree`, which
# looks like a placeholder for "unknown" — confirm how it should be treated
# before modelling.
df.describe()

Unnamed: 0,matched_score,education_similarity,experience_years,cosine_similarity_skills,highest_degree,ed_req_encoded,exp_req_encoded
count,8560.0,8560.0,8560.0,8560.0,8560.0,8560.0,8560.0
mean,0.661067,0.248449,11.84593,0.016344,4.370561,4.428855,3.035631
std,0.166082,0.195666,11.663942,0.037167,0.723529,0.494941,2.910274
min,0.0,0.0,0.08,0.0,-1.0,4.0,0.0
25%,0.583333,0.108173,3.5,0.0,4.0,4.0,1.0
50%,0.683333,0.194314,6.25,0.0,4.0,4.0,3.0
75%,0.793333,0.342125,19.17,0.017013,5.0,5.0,5.0
max,0.95,1.0,62.0,0.415107,6.0,5.0,15.0


# Feature Engineering

In [7]:
# Optional ablation: uncomment the next line to remove 'highest_degree' from
# the frame before the features/target separation and splits that follow.
# df = df.drop('highest_degree', axis=1)

# Splitting Data

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor  
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Separate the regression target from the predictor columns.
target_col = 'matched_score'
y = df[target_col]
X = df.drop(target_col, axis=1)

In [9]:
# First cut: 80% of the rows for training, the remaining 20% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second cut: halve the hold-out into validation (10%) and test (10%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Hyperparameter Testing and Cross-validation

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score

In [11]:
# Candidate hyperparameters: 5 * 3 * 3 * 3 = 135 combinations in total.
param_grid = dict(
    max_depth=[3, 5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Base estimator handed to the search; the fixed seed keeps runs repeatable.
model = DecisionTreeRegressor(random_state=42)

# Exhaustive search scored by negative MSE with 5-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Only the training split is used here; validation and test stay untouched.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


In [12]:
# Keep a handle on the winning estimator and report its hyperparameters.
best_model = grid_search.best_estimator_
print(f"Best parameters found:  {grid_search.best_params_}")

Best parameters found:  {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [13]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the tuned tree on the two held-out splits.
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_test)

# Report MSE and R² for validation first, then test.
for split_name, y_true, y_hat in (
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} MSE:", mean_squared_error(y_true, y_hat))
    print(f"{split_name} R²:", r2_score(y_true, y_hat))

Validation MSE: 0.02502211678275724
Validation R²: 0.15353157215952196
Test MSE: 0.022480421400161918
Test R²: 0.14086741754588206


# Model Training & Validation

In [None]:
# Baseline: a tree with default hyperparameters, fitted on the training split.
# This rebinds `model`, which earlier held the unfitted grid-search estimator.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
model

In [None]:
# Baseline tree on the validation split.
y_pred_val = model.predict(X_val)

val_mse = mean_squared_error(y_val, y_pred_val)
val_r2 = r2_score(y_val, y_pred_val)
print("Validation MSE:", val_mse)
print("Validation R²:", val_r2)

In [None]:
# Baseline tree on the test split.
y_pred_test = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test MSE:", test_mse)
print("Test R²:", test_r2)

# Model Evaluation & Interpretation

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Draw the top of the fitted tree; max_depth=3 limits the drawing to the
# first levels so the figure stays readable.
# NOTE(review): this visualises `model` (the default-parameter tree fitted
# above), not the tuned `best_model` — confirm that is the one to interpret.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    model,
    # Pass a plain list: scikit-learn 1.3.0 validates this argument strictly
    # and rejects a pandas Index, while a list is accepted by every release.
    feature_names=list(X.columns),
    filled=True,
    max_depth=3,  # show only top levels
    ax=ax,
)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each predictor with its importance in the fitted tree, highest first.
feature_names = X.columns
importances = model.feature_importances_

importance_table = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_importances = importance_table.sort_values('importance', ascending=False)

In [None]:
# Horizontal bar chart of the (up to) ten most important features.
fig, ax = plt.subplots(figsize=(10, 6))
top_features = feat_importances.head(10)
sns.barplot(x='importance', y='feature', data=top_features, palette='viridis', ax=ax)
ax.set_title('Top 10 Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

## Results

**Baseline model (default `DecisionTreeRegressor`, all features):** <br>
* Validation MSE: 0.03163491953894016
* Validation R²: -0.1291382964425114
* Test MSE: 0.033956491641380565
* Test R²: -0.20027776748477066

**Baseline model without `ed_req_encoded`:** <br>
* Validation MSE: 0.037017241361721305
* Validation R²: -0.3212483375760018
* Test MSE: 0.033562646318985215
* Test R²: -0.18635631207378434

**Baseline model without `highest_degree`:** <br>
* Validation MSE: 0.031032453953468948
* Validation R²: -0.10763462345207309
* Test MSE: 0.03426508278320597
* Test R²: -0.21118569904282358
  
**Baseline model without `highest_degree` and `ed_req_encoded`:** <br>
* Validation MSE: 0.03609284376311711
* Validation R²: -0.28825401532274997
* Test MSE: 0.03626021762509675
* Test R²: -0.2817087677728529

**Tuned model (grid search with 5-fold cross-validation), all features:** <br>
Best parameters found:  {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
* Validation MSE: 0.02502211678275724
* Validation R²: 0.15353157215952196
* Test MSE: 0.022480421400161918
* Test R²: 0.14086741754588206

**Tuned model (grid search with 5-fold cross-validation), without `highest_degree`:** <br>
Best parameters found:  {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
* Validation MSE: 0.025059793549272062
* Validation R²: 0.1522570119936174
* Test MSE: 0.02245146293945709
* Test R²: 0.14197412087169492