In [4]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Training script: reads the NPI dataset, derives the model inputs, fits a
# random forest that predicts whether a row has at least one survey attempt,
# reports accuracy, and persists the model plus the artifacts needed to
# reproduce the same preprocessing at prediction time.

# --- Load -------------------------------------------------------------------
df = pd.read_excel("dummy_npi_data.xlsx", sheet_name="Dataset")

# --- Derived columns --------------------------------------------------------
# Hour-of-day component of the login timestamp.
login_timestamps = pd.to_datetime(df["Login Time"])
df["Login Hour"] = login_timestamps.dt.hour
# Binary label: 1 when the row has more than zero survey attempts, else 0.
df["Target"] = df["Count of Survey Attempts"].gt(0).astype(int)

# --- Categorical encoding ---------------------------------------------------
# One independent LabelEncoder per categorical column; each column is replaced
# in place by its integer codes and the fitted encoder is kept for reuse.
# NOTE(review): the encoders are fit on the full frame, before the train/test
# split. Values not present here will presumably be rejected by the encoder's
# transform at prediction time — confirm the consumer handles that.
categorical_features = ["State", "Region", "Speciality"]
encoders = {name: LabelEncoder() for name in categorical_features}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])

# --- Model inputs -----------------------------------------------------------
# Encoded categoricals followed by the two numeric columns, in this order.
features = [*categorical_features, "Login Hour", "Usage Time (mins)"]
X, y = df[features], df["Target"]

# --- Hold-out split ---------------------------------------------------------
# 80/20 split with a fixed seed so repeated runs produce the same partition.
# NOTE(review): the split is not stratified on the target — confirm the class
# balance makes that acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Fit --------------------------------------------------------------------
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# --- Evaluate ---------------------------------------------------------------
# Mean accuracy on the training partition first, then on the held-out one.
train_accuracy, test_accuracy = (
    model.score(inputs, labels)
    for inputs, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# --- Persist ----------------------------------------------------------------
# Written in this order: the fitted model, the per-column encoders, and the
# column index of the training frame (so a consumer can line up its inputs
# with the order the model was trained on).
for artifact, filename in (
    (model, "random_forest_npi_model.pkl"),
    (encoders, "label_encoders.pkl"),
    (X_train.columns, "feature_names.pkl"),
):
    joblib.dump(artifact, filename)

print("Training complete! Model saved as random_forest_npi_model.pkl")


Training Accuracy: 1.0000
Test Accuracy: 0.8950
Training complete! Model saved as random_forest_npi_model.pkl
