# URL CLASSIFICATION USING TRADITIONAL ML MODEL


In [1]:
# Install dependencies
!pip install pandas scikit-learn joblib



In [2]:
# Importing the dependencies
import pandas as pd
from io import StringIO

In [3]:
# Upload your CSV file
# The result is kept in `uploaded`; the next cell indexes it by file name to
# obtain the raw bytes of the uploaded file.
from google.colab import files
uploaded = files.upload()


Saving product_vs_nonproduct_urls.csv to product_vs_nonproduct_urls.csv


In [4]:
# Load uploaded CSV
# Only the first uploaded file is used: its bytes are decoded as UTF-8 and
# parsed into a DataFrame, then the column names are printed for inspection.
uploaded_names = list(uploaded.keys())
file_name = uploaded_names[0]
csv_text = uploaded[file_name].decode('utf-8')
df = pd.read_csv(StringIO(csv_text))
print("CSV Columns:", df.columns.tolist())


CSV Columns: ['url', 'label']


In [5]:
# Auto-detect label column (e.g., 'label', 'type', etc.)
# The first column containing at least one known class name wins.
# Values are stripped AND lower-cased before matching so that detection agrees
# with the normalisation used when the labels are encoded (a value such as
# ' Product ' must be recognised as a label here as well).
known_labels = ['product', 'category', 'other']
label_column = None
for col in df.columns:
    normalized_values = df[col].astype(str).str.strip().str.lower()
    if normalized_values.isin(known_labels).any():
        label_column = col
        break

In [6]:
# Training happens only when a label column was detected; otherwise the
# notebook is limited to prediction.
TRAIN_MODEL = label_column is not None
if not TRAIN_MODEL:
    print("No label column found. Proceeding with prediction only.")
else:
    print(f"Detected label column: '{label_column}'")
    # Binary target: 1 when the value is 'product' (ignoring case and
    # surrounding whitespace), 0 for everything else.
    raw_labels = df[label_column].astype(str)
    df['label'] = raw_labels.str.strip().str.lower().eq('product').astype('int64')

Detected label column: 'label'


In [7]:
# Build the model input text: "<title> <url>" when a title column exists,
# otherwise the URL alone. Missing values are replaced by empty strings.
url_text = df['url'].fillna('')
if 'title' not in df.columns:
    df['text'] = url_text
else:
    df['text'] = df['title'].fillna('') + ' ' + url_text


In [8]:
# Optional: clean NaNs and duplicates
# 1) drop rows without a URL, 2) keep only http(s) URLs,
# 3) keep the first row for each distinct URL.
df = df.dropna(subset=['url'])
has_http_scheme = df['url'].str.startswith(('http://', 'https://'))
df = df[has_http_scheme].drop_duplicates(subset='url')

In [9]:
# Model training
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [10]:
if TRAIN_MODEL:
    from sklearn.model_selection import train_test_split

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the
    # split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['label'],
        test_size=0.2,
        random_state=42,
    )

    # TF-IDF over unigrams and bigrams (capped at 5000 features) feeding a
    # logistic-regression classifier.
    tfidf_step = ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000))
    clf_step = ('clf', LogisticRegression(max_iter=1000))
    pipeline = Pipeline(steps=[tfidf_step, clf_step])

    pipeline.fit(X_train, y_train)

In [12]:
# Evaluation
# Only possible when a model was trained in this session: without a label
# column there is no held-out split (X_test / y_test) and no fitted pipeline
# yet, so running this cell unguarded would fail with a NameError.
if TRAIN_MODEL:
    from sklearn.metrics import classification_report, accuracy_score

    y_pred = pipeline.predict(X_test)
    print("\n Evaluation Report:\n")
    # `labels` is passed explicitly so both target names stay valid even when
    # the test split happens to contain a single class; `zero_division=0`
    # reports 0 (without a warning) for a class that has no predictions.
    print(classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['non-product', 'product'],
        zero_division=0,
    ))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
else:
    print("Skipping evaluation: no labelled data available.")


 Evaluation Report:

              precision    recall  f1-score   support

 non-product       0.67      1.00      0.80         8
     product       1.00      0.67      0.80        12

    accuracy                           0.80        20
   macro avg       0.83      0.83      0.80        20
weighted avg       0.87      0.80      0.80        20

Accuracy: 80.00%


In [14]:
import joblib

if TRAIN_MODEL:
    # Save model and offer it as a browser download.
    joblib.dump(pipeline, 'product_url_classifier.pkl')
    from google.colab import files
    files.download('product_url_classifier.pkl')
else:
    # Load pretrained model if no labels.
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code - only load model files from a trusted source.
    try:
        pipeline = joblib.load('product_url_classifier.pkl')
    except FileNotFoundError as exc:
        # Only a missing file means "no pre-trained model"; any other error
        # (corrupt file, version mismatch, interrupt) propagates unchanged
        # instead of being hidden behind a misleading message.
        raise RuntimeError("No label column and no pre-trained model found.") from exc
    print("Loaded existing model for prediction.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Predict on full dataset
predicted_classes = pipeline.predict(df['text'])
df['prediction'] = predicted_classes
# A prediction equal to 1 is shown as 'product'; any other value is shown as
# 'non-product'.
is_product = df['prediction'] == 1
df['predicted_label'] = is_product.map({True: 'product', False: 'non-product'})

In [17]:
# Show preview: the first ten URLs together with their predicted label.
preview = df[['url', 'predicted_label']].head(10)
print("\n Sample Predictions:")
print(preview)


 Sample Predictions:
                                                 url predicted_label
0                https://www.amazon.in/dp/B09G99CW2T         product
1  https://www.amazon.in/Apple-iPhone-13-128GB-Bl...         product
2  https://www.amazon.in/Noise-Launched-Bluetooth...         product
3                https://www.amazon.in/dp/B0C3XGJWQ7         product
4                https://www.amazon.in/dp/B07WFPMKD7         product
5                https://www.amazon.in/dp/B09SH994JW         product
6                https://www.amazon.in/dp/B08444S68M         product
7                https://www.amazon.in/dp/B07DJHXTLJ         product
8                https://www.amazon.in/dp/B07VJYZF59         product
9                https://www.amazon.in/dp/B08444S5JD         product


In [18]:
# Export predictions
# Write the URL / predicted-label pairs to CSV (without the index) and offer
# the file as a browser download.
predictions_path = 'url_predictions.csv'
predictions = df[['url', 'predicted_label']]
predictions.to_csv(predictions_path, index=False)
print("Predictions saved to url_predictions.csv")
files.download(predictions_path)

Predictions saved to url_predictions.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

RETRAINING ON NEW DATA

In [30]:
# RETRAINING ON NEW DATA
# Install required libraries (skip if already installed)
!pip install -q scikit-learn joblib pandas

In [20]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
from google.colab import files
import io

In [21]:
# Upload training CSV (must include 'url' and 'label')
print("Upload TRAINING file (must include 'url' and 'label')")
uploaded = files.upload()
# Use the first uploaded file and parse its raw bytes as CSV.
train_file = next(iter(uploaded))
train_bytes = uploaded[train_file]
df = pd.read_csv(io.BytesIO(train_bytes))


Upload TRAINING file (must include 'url' and 'label')


Saving New_product_urls.csv to New_product_urls.csv


In [22]:
# Normalize column names: trim surrounding whitespace, then lower-case.
stripped_columns = df.columns.str.strip()
df.columns = stripped_columns.str.lower()

In [23]:
# Both 'url' and 'label' are mandatory; stop early if either is missing.
missing_columns = {'url', 'label'} - set(df.columns)
if missing_columns:
    raise ValueError("Your TRAINING CSV must contain 'url' and 'label' columns.")

In [24]:
# Check class distribution: number of rows per label value.
class_counts = df['label'].value_counts()
print("\nClass Distribution:\n", class_counts)


Class Distribution:
 label
other       345
category    333
product     332
Name: count, dtype: int64


In [25]:
# Train/test split
# Clean before splitting, in line with the cleaning applied in the first part
# of this notebook:
#   * rows without a URL or a label cannot be vectorized / fitted (a NaN URL
#     makes the TF-IDF step raise);
#   * a URL that occurs more than once could land in both splits and inflate
#     the evaluation scores.
# The cleaned rows go into a new frame so `df` itself is left untouched and the
# cell can be re-run safely.
train_df = df.dropna(subset=['url', 'label']).drop_duplicates(subset='url')
X_train, X_test, y_train, y_test = train_test_split(
    train_df['url'].astype(str),
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

In [26]:
# Define pipeline (TF-IDF + Logistic Regression)
# Step 'tfidf' vectorizes the URL strings with default settings; step 'clf'
# is the classifier trained on those features.
tfidf_step = ('tfidf', TfidfVectorizer())
clf_step = ('clf', LogisticRegression(max_iter=1000))
pipeline = Pipeline(steps=[tfidf_step, clf_step])

In [27]:
# Train model
# Fits the pipeline on the training split (X_train / y_train) produced by the
# train/test split cell.
pipeline.fit(X_train, y_train)

In [28]:
# Evaluate on the held-out split and print the per-class report.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print("\n Evaluation Report:\n", report)



 Evaluation Report:
               precision    recall  f1-score   support

    category       1.00      1.00      1.00        64
       other       1.00      1.00      1.00        72
     product       1.00      1.00      1.00        66

    accuracy                           1.00       202
   macro avg       1.00      1.00      1.00       202
weighted avg       1.00      1.00      1.00       202



In [29]:
# Save model as .pkl
# Persist the fitted pipeline, offer it as a browser download, then confirm.
model_path = 'url_classifier.pkl'
joblib.dump(pipeline, model_path)
files.download(model_path)
print(" Model saved as 'url_classifier.pkl'")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 Model saved as 'url_classifier.pkl'


BATCH PREDICTION

In [31]:
# BATCH PREDICTION
# Upload BATCH file (with 'url' column only)
print("\n Upload batch prediction CSV (must have 'url' column)")
uploaded_batch = files.upload()
# Use the first uploaded file and parse its raw bytes as CSV.
batch_file = next(iter(uploaded_batch))
batch_bytes = uploaded_batch[batch_file]
batch_df = pd.read_csv(io.BytesIO(batch_bytes))


 Upload batch prediction CSV (must have 'url' column)


Saving product_vs_nonproduct_urls.csv to product_vs_nonproduct_urls (1).csv


In [32]:
# Normalize the batch column names: trim surrounding whitespace, then
# lower-case.
stripped_batch_columns = batch_df.columns.str.strip()
batch_df.columns = stripped_batch_columns.str.lower()

In [33]:
# The batch data must provide a 'url' column; stop early otherwise.
has_url_column = 'url' in batch_df.columns
if not has_url_column:
    raise ValueError("Your BATCH prediction CSV must contain a 'url' column.")

In [34]:
# Predict using trained model
# A missing URL (NaN) is not a valid document for the TF-IDF step and would
# make predict() raise for the whole batch. Only rows that have a URL are
# scored; the remaining rows keep a missing predicted_label.
has_url = batch_df['url'].notna()
batch_df['predicted_label'] = pd.Series(pd.NA, index=batch_df.index, dtype='object')
if has_url.any():
    batch_df.loc[has_url, 'predicted_label'] = pipeline.predict(
        batch_df.loc[has_url, 'url'].astype(str)
    )

In [35]:
# Export results
# Write the batch frame (including predictions) to CSV without the index,
# offer it as a browser download, then confirm.
batch_output_path = 'batch_predictions.csv'
batch_df.to_csv(batch_output_path, index=False)
files.download(batch_output_path)
print("Predictions saved to 'batch_predictions.csv'")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predictions saved to 'batch_predictions.csv'
