# ML-Driven Search Ranking Engine

In [1]:

!pip install pandas lightgbm scikit-learn

import pandas as pd
import numpy as np
from lightgbm import LGBMRanker
from sklearn.model_selection import KFold
import urllib.request
import os


def download_data(url, filename):
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(url, filename)
        print("Download complete.")

def load_svmlight_file(file_path):

    data = []
    qids = []

    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split(' ')
            relevance_score = int(parts[0])


            query_id_str = parts[1].split(':')[1]
            query_id = int(float(query_id_str))



            features = {int(p.split(':')[0]): float(p.split(':')[1]) for p in parts[2:]}

            data.append((query_id, relevance_score, features))


    df_data = []
    for qid, rel_score, feats in data:
        row = {'QueryId': qid, 'RelScore': rel_score}
        row.update(feats)
        df_data.append(row)

    df = pd.DataFrame(df_data)
    df = df.fillna(0)


    df = df.sort_values(by='QueryId').reset_index(drop=True)

    return df


data_url = "https://raw.githubusercontent.com/microsoft/LightGBM/master/examples/lambdarank/rank.train"
file_name = "mq2008_sample.txt"
download_data(data_url, file_name)

df = load_svmlight_file(file_name)


X = df.drop(['QueryId', 'RelScore'], axis=1)
y = df['RelScore']
qids = df['QueryId']
groups = df.groupby('QueryId').size().to_numpy()

print(f"Dataset loaded with {len(df)} documents across {len(groups)} queries.")
print(f"Number of features: {X.shape[1]}")


k_fold = KFold(n_splits=3, shuffle=True, random_state=42)
models = []

print("\nStarting 3-fold cross-validation...")

for train_index, test_index in k_fold.split(X, y, groups=qids):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


    groups_train = X_train.groupby(qids.iloc[train_index]).size().to_numpy()

    ranker = LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        n_estimators=50,
        learning_rate=0.1
    )

    ranker.fit(X_train, y_train, group=groups_train)
    models.append(ranker)

print("\nModel training complete for all folds.")


print("\nMaking predictions on a new, hypothetical query...")


num_features = X.shape[1]
new_query_data = pd.DataFrame(np.random.rand(3, num_features), columns=X.columns)


predictions = models[0].predict(new_query_data)
new_query_data['PredictedScore'] = predictions
ranked_results = new_query_data.sort_values(by='PredictedScore', ascending=False)

print("\nPredicted rankings for the new query:")
print(ranked_results[['PredictedScore']])

Downloading mq2008_sample.txt...
Download complete.
Dataset loaded with 3005 documents across 2 queries.
Number of features: 217

Starting 3-fold cross-validation...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6155
[LightGBM] [Info] Number of data points in the train set: 2003, number of used features: 214




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6142
[LightGBM] [Info] Number of data points in the train set: 2003, number of used features: 213
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6151
[LightGBM] [Info] Number of data points in the train set: 2004, number of used features: 212

Model training complete for all folds.

Making predictions on a new, hypothetical query...

Predicted rankings for the new query:
   PredictedScore
0       -3.820940
2       -5.000245
1       -5.146782
