In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression  # Example model
from sklearn.ensemble import RandomForestClassifier #Another Example Model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE  # Install: pip install imbalanced-learn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read the labelled URL table, give its three columns stable names, then keep
# only the URL and category columns and discard rows with missing values.
dataset = pd.read_csv('url_classification.csv')
dataset.columns = ['sr_no', 'website_url', 'category']
df = dataset[['website_url', 'category']].dropna()

In [9]:
# TensorFlow is imported here rather than in the main import cell; in the cells
# visible in this notebook it is only used for environment diagnostics.
import tensorflow as tf
# Show which TensorFlow build this kernel picked up.
print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.18.0


In [10]:
# List the GPU devices TensorFlow can see; an empty list means none were detected.
print("GPUs Available: ", tf.config.list_physical_devices('GPU'))


GPUs Available:  []


In [11]:
# Combined environment check: repeats the TensorFlow version and GPU queries
# that the two preceding cells already ran separately.
import tensorflow as tf

# Check TensorFlow version
print("TensorFlow Version:", tf.__version__)

# Check GPU availability
print("GPUs Available:", tf.config.list_physical_devices('GPU'))



TensorFlow Version: 2.18.0
GPUs Available: []


In [12]:
# Configure the GPU (if one exists) and run a small smoke-test computation.
# set_memory_growth does not force GPU use; it only makes TensorFlow allocate
# GPU memory on demand instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print("GPU is now enabled for TensorFlow.")
    except RuntimeError as e:
        print(f"Failed to set memory growth: {e}")
else:
    print("No GPU detected; TensorFlow will run on the CPU.")

# Run the sample computation on a device that actually exists. Requesting
# '/GPU:0' on a CPU-only machine still produces a result (it is placed on the
# CPU), so the message names the device that was really selected.
device_name = '/GPU:0' if physical_devices else '/CPU:0'
with tf.device(device_name):
    a = tf.constant([1.0, 2.0, 3.0])
    b = tf.constant([4.0, 5.0, 6.0])
    result = tf.reduce_sum(a * b)
    print(f"Computation result on {device_name}:", result.numpy())


GPU Computation Result: 32.0


In [13]:
# Print every local device TensorFlow reports, with its type and memory limit.
# NOTE(review): tensorflow.python.* is an internal module path rather than the
# tf.config API used in the cells above — confirm it is still wanted here.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2568442150476920164
xla_global_id: -1
]


In [None]:

def preprocess_url(url):
    """Turn a URL into a whitespace-separated string of word tokens.

    The leading scheme (http:// or https://) and a leading "www." / "wwwN."
    host prefix are dropped, every remaining character that is neither a word
    character nor whitespace becomes a space, and runs of whitespace are
    collapsed.

    Args:
        url (str): The URL to clean.

    Returns:
        str: The cleaned, space-separated text.
    """
    url = url.strip()
    # Both prefixes are anchored to the start of the string. Unanchored, the
    # "www" pattern also fires inside host names and mangles them
    # (e.g. "awww.example.com" would become "aexample.com").
    url = re.sub(r"^https?://", "", url, flags=re.IGNORECASE)
    url = re.sub(r"^www\d?\.", "", url, flags=re.IGNORECASE)
    url = re.sub(r"[^\w\s]", " ", url)
    url = re.sub(r"\s+", " ", url).strip()
    return url

def extract_features(url):
    """Derive a text feature and three numeric features from a raw URL.

    Call this on the original URL, before preprocess_url: once punctuation has
    been replaced by spaces, urlparse can no longer find the query string, so
    query_count would always be 0.

    Args:
        url (str): Raw URL, with or without a scheme.

    Returns:
        tuple: (text, query_count, url_length, num_count) where text is
        "<domain> <path>" with path separators turned into spaces,
        query_count is the number of '&'-separated query parameters,
        url_length is len(url) and num_count is the number of digit
        characters in url.
    """
    # urlparse only fills netloc when the URL has a scheme or starts with '//'.
    has_scheme = re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url) is not None
    try:
        parsed = urlparse(url if has_scheme or url.startswith("//") else "//" + url)
        netloc, path, query = parsed.netloc, parsed.path, parsed.query
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets);
        # fall back to treating everything before '?' as the path.
        netloc = ""
        path, _, query = url.partition("?")
    domain = netloc.replace("www.", "")
    path = path.replace("/", " ").strip()
    query_count = len(query.split('&')) if query else 0
    url_length = len(url)
    num_count = sum(c.isdigit() for c in url)
    return f"{domain} {path}", query_count, url_length, num_count

# Numeric features are taken from the raw URLs, i.e. before preprocess_url
# strips the '?', '&' and '/' characters they depend on. A list of tuples is
# much cheaper than building one pd.Series per row.
raw_urls = df['website_url']
numeric_cols = ["query_count", "url_length", "num_count"]
df[numeric_cols] = pd.DataFrame(
    [extract_features(u)[1:] for u in raw_urls],
    columns=numeric_cols,
    index=df.index,
)

# Only now reduce each URL to the token text consumed by the vectorizer.
df['website_url'] = raw_urls.apply(preprocess_url)

# X keeps the numeric columns as well: the ColumnTransformer in the next cell
# selects columns by name, which needs a DataFrame rather than a single Series.
X = df[['website_url'] + numeric_cols]
y = df['category']

KeyboardInterrupt: 

In [None]:

# Split data (stratified): hold out 20% of the rows as a test set, keeping the
# class proportions of y in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature Engineering Pipeline
stemmer = SnowballStemmer("english")
def stemmed_tokenizer(text):
    """Split text on non-alphanumeric characters and stem every token.

    Returns the Snowball-stemmed tokens as a list, in their original order.
    """
    words = re.sub(r"[^a-zA-Z0-9]", " ", text).split()
    return list(map(stemmer.stem, words))

# Text branch: stemmed 1-3 gram TF-IDF features, reduced to the k best by chi2.
# TfidfVectorizer already produces TF-IDF weights, so it is not followed by a
# second TfidfTransformer (that would re-weight the weights).
text_pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 3), tokenizer=stemmed_tokenizer)),
    ('select', SelectKBest(chi2, k=5000))  # Adjust k as needed
])

# Numeric branch: rescale each count/length column to the [0, 1] range.
num_pipeline = Pipeline([
    ('scaler', MinMaxScaler())
])

# Route the URL text and the numeric columns to their respective branches.
preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'website_url'),
    ('num', num_pipeline, ['query_count', 'url_length', 'num_count'])
])

# chi2 feature selection is supervised, so the labels must be supplied when
# fitting; the test split is only transformed with the fitted preprocessor.
X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)


# Apply SMOTE *after* transformation
# The oversampler is fitted on the vectorised training matrix only; the test
# split is never resampled.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

# Model Training and Evaluation (Logistic Regression Example)
# The classifier sits in a one-step Pipeline so that the 'clf__' parameter
# names in the optional grid search below resolve.
pipeline = Pipeline([
    ('clf', LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)) # liblinear supports both the l1 and l2 penalties listed in the grid below
])

# Grid Search (Optional - uncomment to use)
# param_grid = {
#     'clf__C': [0.1, 1, 10],
#     'clf__penalty': ['l1', 'l2']
# }
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs = -1)
# grid_search.fit(X_train_resampled, y_train_resampled)
# best_model = grid_search.best_estimator_

# Fit on the resampled training matrix, then score on the untouched test matrix.
pipeline.fit(X_train_resampled, y_train_resampled)
y_pred = pipeline.predict(X_test_transformed)

print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))  # NOTE(review): zero_division=1 scores undefined precision/recall as 1.0 rather than 0 — confirm this is intended
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# New URL Predictions
new_urls = [
    "https://www.espn.com/nba/scores",
    "http://netflix.com/show/98765",
    "https://github.com/tensorflow/models",
    "https://www.bbc.com/news/politics",
    "renewable-resources.netlify.app"
]

# Text column: the cleaned token string. Numeric columns: taken from the raw
# URL via extract_features, so query_count reflects the actual query string
# instead of being hard-coded to 0 and the counting logic is not duplicated.
new_urls_processed = [preprocess_url(url) for url in new_urls]
new_numeric = pd.DataFrame(
    [extract_features(url)[1:] for url in new_urls],
    columns=["query_count", "url_length", "num_count"],
)
new_df = pd.DataFrame({"website_url": new_urls_processed}).join(new_numeric)
new_urls_transformed = preprocessor.transform(new_df)
new_predictions = pipeline.predict(new_urls_transformed)

print("\n🔹 Predictions on New URLs:")
for url, category in zip(new_urls, new_predictions):
    print(f"{url} → {category}")


#Random Forest Classifier
# Second model, trained on the same resampled training matrix and scored on
# the same transformed test matrix as the logistic regression above.
pipeline_rf = Pipeline([
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=42))
])

pipeline_rf.fit(X_train_resampled, y_train_resampled)
y_pred_rf = pipeline_rf.predict(X_test_transformed)

print("\n✅ Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\n📊 Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf, zero_division=1))  # NOTE(review): zero_division=1 scores undefined precision/recall as 1.0 rather than 0 — confirm this is intended
print("\nRandom Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


# Reuses new_urls_transformed, the sample-URL matrix built in the
# logistic-regression section of this cell.
new_predictions_rf = pipeline_rf.predict(new_urls_transformed)

print("\n🔹 Random Forest Predictions on New URLs:")
for url, category in zip(new_urls, new_predictions_rf):
    print(f"{url} → {category}")

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler  
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
from urllib.parse import urlparse
from nltk.stem.snowball import SnowballStemmer

# Load dataset: name the three columns, keep only the URL and category
# columns, and discard rows that contain missing values.
dataset = pd.read_csv('url_classification.csv')
dataset.columns = ['sr_no', 'website_url', 'category']
df = dataset[['website_url', 'category']].dropna(axis=0)

# Preprocess URLs
def preprocess_url(url):
    """Turn a URL into a whitespace-separated string of word tokens.

    The leading scheme (http:// or https://) and a leading "www." / "wwwN."
    host prefix are dropped, every remaining character that is neither a word
    character nor whitespace becomes a space, and runs of whitespace are
    collapsed.

    Args:
        url (str): The URL to clean.

    Returns:
        str: The cleaned, space-separated text.
    """
    url = url.strip()
    # Both prefixes are anchored to the start of the string. Unanchored, the
    # "www" pattern also fires inside host names and mangles them
    # (e.g. "awww.example.com" would become "aexample.com").
    url = re.sub(r"^https?://", "", url, flags=re.IGNORECASE)  # Remove protocol
    url = re.sub(r"^www\d?\.", "", url, flags=re.IGNORECASE)   # Remove www prefix
    url = re.sub(r"[^\w\s]", " ", url)                         # Remove special characters
    url = re.sub(r"\s+", " ", url).strip()                     # Remove extra spaces
    return url

# Extract features
def extract_features(url):
    """Derive a text feature and three numeric features from a raw URL.

    Call this on the original URL, before preprocess_url: once punctuation has
    been replaced by spaces, urlparse can no longer find the query string, so
    query_count would always be 0.

    Args:
        url (str): Raw URL, with or without a scheme.

    Returns:
        tuple: (text, query_count, url_length, num_count) where text is
        "<domain> <path>" with path separators turned into spaces,
        query_count is the number of '&'-separated query parameters,
        url_length is len(url) and num_count is the number of digit
        characters in url.
    """
    # urlparse only fills netloc when the URL has a scheme or starts with '//'.
    has_scheme = re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url) is not None
    try:
        parsed = urlparse(url if has_scheme or url.startswith("//") else "//" + url)
        netloc, path, query = parsed.netloc, parsed.path, parsed.query
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets);
        # fall back to treating everything before '?' as the path.
        netloc = ""
        path, _, query = url.partition("?")
    domain = netloc.replace("www.", "")
    path = path.replace("/", " ").strip()
    query_count = len(query.split('&')) if query else 0
    url_length = len(url)
    num_count = sum(c.isdigit() for c in url)
    return f"{domain} {path}", query_count, url_length, num_count

# Numeric features are taken from the raw URLs, i.e. before preprocess_url
# strips the '?', '&' and '/' characters they depend on. A list of tuples is
# much cheaper than building one pd.Series per row.
raw_urls = df['website_url']
numeric_cols = ["query_count", "url_length", "num_count"]
df[numeric_cols] = pd.DataFrame(
    [extract_features(u)[1:] for u in raw_urls],
    columns=numeric_cols,
    index=df.index,
)

# Only now reduce each URL to the token text consumed by the vectorizer.
df['website_url'] = raw_urls.apply(preprocess_url)

# Split into training and test data: hold out a fixed number of rows from
# every category as the test set and train on whatever remains.
# NOTE(review): sample() raises if a category has fewer rows than
# test_samples_per_class — confirm every category is large enough.
test_samples_per_class = 2000
test_dfs = [
    df[df["category"] == label].sample(test_samples_per_class, random_state=42)
    for label in df["category"].unique()
]

test_data = pd.concat(test_dfs)
train_data = df.drop(test_data.index)

# Separate the feature columns from the target for both parts.
X_train, y_train = train_data.drop(columns=['category']), train_data['category']
X_test, y_test = test_data.drop(columns=['category']), test_data['category']

# Compute class weights based on frequency
# The weights are computed from y_train as it is before any resampling;
# class_weight_dict maps each class label to its 'balanced' weight.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# Resampling (Undersampling majority classes)
# NOTE(review): with its default strategy RandomUnderSampler should leave every
# class at the size of the smallest one, which would make the weights above
# redundant for the resampled data — confirm.
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Custom tokenizer with SnowballStemmer
stemmer = SnowballStemmer("english")
def stemmed_tokenizer(text):
    """Split text on non-alphanumeric characters and stem every token.

    Returns the Snowball-stemmed tokens as a list, in their original order.
    """
    words = re.sub(r"[^a-zA-Z0-9]", " ", text).split()  # special characters act as separators
    return list(map(stemmer.stem, words))

# Define transformation pipeline
# TfidfVectorizer already produces TF-IDF weights, so it is not followed by a
# second TfidfTransformer (that would re-weight the weights).
text_pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words=None, ngram_range=(1, 3), tokenizer=stemmed_tokenizer)),  # Use 1-3 grams
])

# Numeric branch: rescale each count/length column to the [0, 1] range.
num_pipeline = Pipeline([
    ('scaler', MinMaxScaler())
])

# Route the URL text and the numeric columns to their respective branches.
preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'website_url'),
    ('num', num_pipeline, ['query_count', 'url_length', 'num_count'])
])

# XGBoost model
from sklearn.preprocessing import LabelEncoder  # local import: only this section needs it

# XGBClassifier expects class labels encoded as integers 0..n_classes-1, so the
# category labels are encoded for training and predictions are decoded again.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train_resampled)

# scale_pos_weight is a single number meant for binary problems and cannot take
# a per-class dict. The training rows were already undersampled above, so no
# class weighting is passed to the model.
xgb_model = XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_),
                          eval_metric='mlogloss')

# Final Pipeline
pipeline = Pipeline([
    ('features', preprocessor),
    ('clf', xgb_model)
])

# Train the Model
pipeline.fit(X_train_resampled, y_train_encoded)

# Predictions (decoded back to the original category labels)
y_pred = label_encoder.inverse_transform(pipeline.predict(X_test).astype(int))

# Evaluation
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# Passing labels explicitly keeps the matrix rows/columns in the same order as
# the tick labels, even if some class never appears in y_test or y_pred.
class_names = label_encoder.classes_
plt.figure(figsize=(12,6))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Test new URLs
new_urls = [
    "https://www.espn.com/nba/scores",
    "http://netflix.com/show/98765",
    "https://github.com/tensorflow/models",
    "https://www.bbc.com/news/politics",
    "renewable-resources.netlify.app"
]

# Text column: the cleaned token string. Numeric columns: taken from the raw
# URL via extract_features, so query_count reflects the actual query string
# instead of being hard-coded to 0.
new_urls_processed = [preprocess_url(url) for url in new_urls]
new_numeric = pd.DataFrame(
    [extract_features(url)[1:] for url in new_urls],
    columns=["query_count", "url_length", "num_count"],
)
new_features = pd.DataFrame({"website_url": new_urls_processed}).join(new_numeric)
predictions = label_encoder.inverse_transform(pipeline.predict(new_features).astype(int))

print("\n🔹 Predictions on New URLs:")
for url, category in zip(new_urls, predictions):
    print(f"{url} → {category}")
