In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the college dataset from disk.
# The CSV is expected to already contain the merged Karnataka and
# Tamil Nadu records.
csv_path = 'Karnataka_Colleges.csv'
df = pd.read_csv(csv_path)

In [None]:
# Replace every missing value with 0 so the later encoding, numeric
# conversion and sums never see NaN.
# Rebinding `df` (rather than `inplace=True`) yields the same contents while
# following the pandas recommendation to avoid in-place mutation.
df = df.fillna(0)

In [None]:
# Label-encode the two text-valued columns: every value is first rendered as
# a string, then replaced by the integer code of its category.
for column in ('XFO', 'TLS_Version'):
    as_category = df[column].astype(str).astype('category')
    df[column] = as_category.cat.codes

In [None]:
# Header-related columns that are used as model inputs.
features = [
    'HSTS',
    'HSTS_max_age',
    'CSP',
    'XCTO',
    'XFO',
    'Referrer_Policy',
    'Perm_Policy',
    'TLS_Version',
    'Cookie_HttpOnly',
    'Cookie_Secure',
]

# Feature matrix: all rows, restricted to the columns listed above.
X = df.loc[:, features]

In [None]:
# Coerce every feature column to an integer; values that cannot be parsed as
# numbers become NaN and are then replaced by 0.
X = X.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Security score = number of features that are switched on (value > 0).
# Summing the raw values instead would let any single column holding a large
# integer (presumably HSTS_max_age — TODO confirm its range) pass the
# `>= 4` labelling threshold on its own, regardless of the other headers.
# Counting keeps each column's contribution at 0 or 1.
# NOTE(review): XFO and TLS_Version hold category codes, so this assumes code 0
# corresponds to the "missing" value — confirm against the data.
# NOTE(review): the label derived from this score is a deterministic function
# of the model's own input features; confirm that this is intended.
df['Security_Score'] = X.gt(0).sum(axis=1)


In [None]:
# Binary target: 1 (secure) when the security score is at least 4,
# otherwise 0 (not secure).
meets_threshold = df['Security_Score'].ge(4)
df['Label'] = meets_threshold.astype('int64')
y = df['Label']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the
# training split (`fit` returns the fitted estimator itself).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Print the text classification report for the test split.
# The heading's emoji had been corrupted by a UTF-8/cp1252 mix-up; it is
# restored here so the output no longer shows garbled characters.
print("\n✅ Classification Report:\n")
print(classification_report(y_test, y_pred))

In [None]:
# Print the confusion matrix for the test split (true labels vs. predictions).
# The heading's emoji had been corrupted by a UTF-8/cp1252 mix-up; it is
# restored here so the output no longer shows garbled characters.
print("\n📊 Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt

# Importance score the fitted model exposes for each input column.
importances = model.feature_importances_
feature_names = X.columns

# Horizontal bar chart with one bar per feature, drawn through the explicit
# figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(feature_names, importances)
ax.set_xlabel("Feature Importance")
ax.set_title("Which Headers Affect Security Classification Most")
plt.show()
