In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


In [3]:
data_path = Path("/home/ae25872/codebase/proai/Common-Crawl---Autumn-2025/analysis_results_JSON_LLM.csv")
df_raw = pd.read_csv(data_path)

bool_cols = [
    "string_match_result",
    "Key_ID_match",
    "llm_found_embedded_link",
    "llm_parse_success",
]
numeric_cols = ["scraped_result_position"]
text_cols = ["scraped_result_url", "company_name"]
target_col = "llm_is_official_website"

# Drop rows without a target label
df = df_raw[df_raw[target_col].notna()].copy()

y = df[target_col].astype(int)

for col in bool_cols:
    df[col] = df[col].fillna(False).astype(int)

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

for col in text_cols:
    df[col] = df[col].fillna("")

feature_cols = numeric_cols + bool_cols + text_cols
X = df[feature_cols]

display(df[feature_cols + [target_col]].head())


Unnamed: 0,scraped_result_position,string_match_result,Key_ID_match,llm_found_embedded_link,llm_parse_success,scraped_result_url,company_name,llm_is_official_website
0,1,0,0,0,1,https://mockingbirdcinema.com/MockingbirdCinem...,MOCKINGBIRD ENTERTAINMENT LIMITED,True
1,3,0,0,0,1,https://www.mockingbirdfilmcompany.com/,MOCKINGBIRD ENTERTAINMENT LIMITED,True
2,1,1,0,0,1,https://www.pentasia.com/,PENTASIA LIMITED,False
3,2,1,0,0,1,https://www.pentasia.com/cm/contact-us,PENTASIA LIMITED,False
4,3,1,0,0,1,https://www.pentasia.com/cm/about-us,PENTASIA LIMITED,True


In [5]:
preprocess = ColumnTransformer(
    transformers=[
        (
            "url_tfidf",
            TfidfVectorizer(ngram_range=(1, 2), min_df=1, lowercase=True),
            "scraped_result_url",
        ),
        (
            "company_tfidf",
            TfidfVectorizer(ngram_range=(1, 2), min_df=1, lowercase=True, stop_words="english"),
            "company_name",
        ),
        (
            "numeric",
            "passthrough",
            numeric_cols + bool_cols,
        ),
    ],
    remainder="drop",
)

model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        (
            "to_dense",
            FunctionTransformer(lambda x: x.toarray() if hasattr(x, "toarray") else x, accept_sparse=True),
        ),
        (
            "classifier",
            MLPClassifier(
                hidden_layer_sizes=(64,),
                activation="relu",
                solver="adam",
                random_state=42,
                max_iter=150,
                early_stopping=True,
                n_iter_no_change=10,
                verbose=True,
            ),
        ),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

model.fit(X_train, y_train)


Iteration 1, loss = 0.69912867
Validation score: 0.600000
Iteration 2, loss = 0.68863716
Validation score: 0.600000
Iteration 3, loss = 0.67855057
Validation score: 0.500000
Iteration 4, loss = 0.66881790
Validation score: 0.500000
Iteration 5, loss = 0.65934118
Validation score: 0.500000
Iteration 6, loss = 0.64990531
Validation score: 0.500000
Iteration 7, loss = 0.64053589
Validation score: 0.400000
Iteration 8, loss = 0.63122254
Validation score: 0.400000
Iteration 9, loss = 0.62190230
Validation score: 0.500000
Iteration 10, loss = 0.61256495
Validation score: 0.500000
Iteration 11, loss = 0.60319532
Validation score: 0.500000
Iteration 12, loss = 0.59378650
Validation score: 0.600000
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


0,1,2
,steps,"[('preprocess', ...), ('to_dense', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('url_tfidf', ...), ('company_tfidf', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,func,<function <la...x7ac9e95deb60>
,inverse_func,
,validate,False
,accept_sparse,True
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,hidden_layer_sizes,"(64,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,150
,shuffle,True


In [6]:
y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.613
              precision    recall  f1-score   support

           0       0.80      0.27      0.40        15
           1       0.58      0.94      0.71        16

    accuracy                           0.61        31
   macro avg       0.69      0.60      0.56        31
weighted avg       0.68      0.61      0.56        31



In [7]:
probas = model.predict_proba(X)[:, 1]
preds = model.predict(X)

results = df.copy()
results["pred_score"] = probas
results["pred_label"] = preds

best_per_company = (
    results.sort_values(["company_number", "pred_score"], ascending=[True, False])
    .groupby("company_number")
    .first()
    .reset_index()[
        [
            "company_number",
            "company_name",
            "scraped_result_url",
            "pred_score",
            "pred_label",
            "llm_official_url",
        ]
    ]
)

output_path = Path("/home/ae25872/codebase/proai/Common-Crawl---Autumn-2025/predicted_official_websites.csv")
best_per_company.to_csv(output_path, index=False)

print(f"Saved per-company website predictions to {output_path}")
best_per_company.head(10)


Saved per-company website predictions to /home/ae25872/codebase/proai/Common-Crawl---Autumn-2025/predicted_official_websites.csv


Unnamed: 0,company_number,company_name,scraped_result_url,pred_score,pred_label,llm_official_url
0,675001,ABB CABLE MANAGEMENT PRODUCTS LIMITED,https://www.legalentityidentifier.co.uk/leicer...,0.47076,0,
1,1018080,C SPARKS & SONS LIMITED,https://sparkstransport.com/,0.542785,1,https://www.sparks-transport.co.uk/
2,1698730,MOUNTCURZON SECURITIES LIMITED,https://open.endole.co.uk/insight/company/0169...,0.490505,0,
3,1943761,HYDRAGOLD LIMITED,https://open.endole.co.uk/insight/company/0194...,0.527588,1,http://www.amhursthotel.co.uk
4,2781890,HOLDERS COMPONENTS LIMITED,https://holderstechnology.com/,0.553193,1,https://holderstechnology.com/
5,2865241,CROFT BUILDING LIMITED,https://open.endole.co.uk/insight/company/0286...,0.537169,1,http://www.croftbc.com
6,2872800,BESTSTREAM LIMITED,https://open.endole.co.uk/insight/company/0287...,0.508731,1,
7,3136191,HOT WATER SALES LIMITED,http://www.hbs.jabunyan.co.uk/,0.528174,1,
8,3286370,PEDAL EXPRESS LTD,https://www.companywall.co.uk/business/pedal-e...,0.481185,0,
9,3664260,FENGARI LIMITED,https://open.endole.co.uk/insight/company/0366...,0.52467,1,
