In [None]:
# Cell 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from joblib import dump
import time


In [None]:
# Cell 2: Define Preprocessing Function
def preprocess_df(df):
    """One-hot encode non-numeric columns and median-impute numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.get_dummies``: each ``object``/``bool``
        column is replaced by indicator columns (first level dropped), and
        NaNs in every numeric column are replaced by that column's median.

    Notes
    -----
    A numeric column that is entirely NaN has a NaN median, so it is left
    unfilled.
    """
    # One-hot encode (rather than drop) the columns that cannot be cast to float.
    non_numeric_columns = df.select_dtypes(include=['object', 'bool']).columns.tolist()
    df = pd.get_dummies(df, columns=non_numeric_columns, drop_first=True)

    # Select by the generic 'number' kind so float32/int32/etc. columns are
    # imputed too; matching only float64/int64 silently left their NaNs in place.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))

    return df


In [None]:
# Cell 3: Load and Preprocess the Data
print("Loading initial dataset...")
df = pd.read_csv('Merged_Network_dataset.csv')  # Update the path if hosted on Google Drive

# Rows without a label cannot be used for supervised training; drop them
# instead of letting the preprocessing step impute a made-up class value.
df = df.dropna(subset=['type'])

# Set the label aside before preprocessing: preprocess_df one-hot encodes every
# object/bool column, so a string-valued 'type' would be replaced by 'type_*'
# indicator columns and the later df['type'] lookup would raise KeyError.
labels = df.pop('type')
df = preprocess_df(df)
df['type'] = labels  # re-attached unchanged so later cells can still read df['type']
print("Dataset loaded and preprocessed.")


In [None]:
# Cell 4: Setup Data for Model Training
# Separate the 'type' label from the predictors, then hold out 25% for testing.
X = df.drop(columns='type')
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Cell 5: Apply RFE and Identify Important Features
print("Fitting model for RFE...")
rfe_model = RandomForestClassifier(random_state=42)
# Adjust n_features_to_select to keep more or fewer columns.
rfe = RFE(estimator=rfe_model, n_features_to_select=15)
rfe.fit(X_train, y_train)
# Boolean mask over the training columns marking the features RFE retained.
features_to_keep = X.columns[rfe.get_support()]
print("Selected features:", features_to_keep)


In [None]:
# Cell 6: Train Final Model Using Selected Features
# Narrow both splits to the columns RFE kept, then fit a fresh forest on them.
X_train, X_test = X_train[features_to_keep], X_test[features_to_keep]
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate Model
# Score the held-out split: overall accuracy plus the per-class report.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Testing Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)


In [None]:
# Cell 7: Save the Model for Future Use
# Persist the fitted classifier with joblib; the path is relative to the
# notebook's working directory.
dump(rf_classifier, filename='random_forest_model.joblib')
print("Model saved.")
