In [6]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
import numpy as np

In [7]:
# Step 2: Load the dataset
file_path = 'structured_dataset.xlsx'  # Update with the correct path if needed
data = pd.read_excel(file_path)

In [9]:
# Step 3: Preprocess the data
# Drop irrelevant columns
processed_data = data.drop(columns=['Record_ID', 'Name', 'Comments'])

# Handle missing values: Fill numeric columns with mean and categorical with mode
for column in processed_data.columns:
    if processed_data[column].dtype in [np.float64, np.int64]:
        processed_data[column] = processed_data[column].fillna(processed_data[column].mean())
    else:
        processed_data[column] = processed_data[column].fillna(processed_data[column].mode()[0])


# Convert 'Date_Joined' to numerical form (days since a reference date)
reference_date = datetime(2000, 1, 1)
processed_data['Date_Joined'] = (
    pd.to_datetime(processed_data['Date_Joined']) - reference_date
).dt.days

# Encode categorical columns using LabelEncoder
label_encoders = {}
for column in processed_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    processed_data[column] = le.fit_transform(processed_data[column])
    label_encoders[column] = le


In [3]:
# Step 3: Split dataset into features and target
def split_features_target(data, target_column):
    """
    Split the dataset into features (X) and target (y).
    """
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    return X, y

In [4]:
# Step 4: Train and test the model
def train_and_test_model(X, y):
    """
    Train and test a Random Forest Classifier.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return accuracy


In [5]:
# Main Execution
if __name__ == "__main__":
    # Specify the dataset file path
    dataset_path = "structured_dataset.xlsx"  # Update with your dataset's path
    target_column = "TargetColumn"  # Replace with the name of your target column

    # Step 1: Load the dataset
    data = load_dataset(dataset_path)
    if data is not None:
        print("Dataset Preview:\n", data.head())

        # Step 2: Preprocess the data
        print("\nPreprocessing the dataset...")
        data = preprocess_data(data, target_column)
        
        # Step 3: Split features and target
        print("\nSplitting features and target...")
        X, y = split_features_target(data, target_column)

        # Step 4: Train and test the model
        print("\nTraining and testing the model...")
        accuracy = train_and_test_model(X, y)
        print(f"Model Accuracy: {accuracy * 100:.2f}%")
    else:
        print("Dataset loading failed.")

Error loading dataset: name 'pd' is not defined
Dataset loading failed.
