In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import numpy as np
import joblib

# ✅ Load the data
file_path = 'Lead Scoring.csv'
data = pd.read_csv(file_path)

# ✅ Save Lead Number for reference
lead_numbers = data['Lead Number']

# ✅ Drop unnecessary columns
data_cleaned = data.drop(['Prospect ID', 'Lead Number'], axis=1)

# ✅ Fill missing values using .loc (Future-proof for pandas 3.0)
for col in data_cleaned.columns:
    if data_cleaned[col].dtype == 'object':
        data_cleaned.loc[:, col] = data_cleaned[col].fillna('Unknown')
    else:
        data_cleaned.loc[:, col] = data_cleaned[col].fillna(data_cleaned[col].mean())

# ✅ Encode categorical columns using Label Encoding
label_encoder = LabelEncoder()
for col in data_cleaned.columns:
    if data_cleaned[col].dtype == 'object':
        data_cleaned.loc[:, col] = label_encoder.fit_transform(data_cleaned[col])

# ✅ Split data into features and target
X = data_cleaned.drop('Converted', axis=1)
y = data_cleaned['Converted']

# ✅ Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ✅ Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ✅ Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ✅ Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# ✅ Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print(f"\n✅ Accuracy: {accuracy * 100:.2f}%")
print(f"✅ ROC-AUC Score: {roc_auc:.2f}")
print("\n✅ Classification Report:\n", report)

# ✅ Extract Feature Importance
feature_importance = model.feature_importances_
features = X.columns

importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# ✅ Normalize importance to get feature weights
importance_df['Weight'] = (importance_df['Importance'] / importance_df['Importance'].sum()) * 100

# ✅ Define the Scoring Function
def calculate_score(probability, feature_values):
    base_score = probability * 100
    feature_contribution = np.dot(feature_values, importance_df['Weight'].values)
    final_score = min(100, max(0, base_score + feature_contribution))
    return final_score

# ✅ Generate Scores for Test Set
test_probabilities = model.predict_proba(X_test)[:, 1]
test_features = X_test
scores = [calculate_score(prob, features) for prob, features in zip(test_probabilities, test_features)]

# ✅ Attach Scores and Lead Number to Results
results = pd.DataFrame(X_test, columns=features)
results['Lead Number'] = lead_numbers.iloc[y_test.index].values
results['Probability'] = test_probabilities
results['Lead_Score'] = scores

# ✅ Define Lead Quality
def lead_quality(score):
    if score >= 70:
        return 'High'
    elif score >= 40:
        return 'Medium'
    else:
        return 'Low'

results['Lead_Quality'] = results['Lead_Score'].apply(lead_quality)

# ✅ Print Results in Terminal (Without Predicted Conversion)
print("\n🔍 LEAD SCORING RESULTS:")
print(results[['Lead Number', 'Probability', 'Lead_Score', 'Lead_Quality']].to_string(index=False))

# ✅ Print Top 5 Important Features
print("\n📌 TOP 5 IMPORTANT FEATURES:")
print(importance_df.head().to_string(index=False))

# ✅ Save the Model and Encoders
joblib.dump(model, 'lead_scoring_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# ✅ Save Full Results to CSV (Optional)
results.to_csv('lead_scoring_results.csv', index=False)

print("\n✅ Lead scoring results saved to 'lead_scoring_results.csv'")



✅ Accuracy: 93.45%
✅ ROC-AUC Score: 0.98

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95      1107
           1       0.94      0.89      0.92       741

    accuracy                           0.93      1848
   macro avg       0.94      0.93      0.93      1848
weighted avg       0.93      0.93      0.93      1848


🔍 LEAD SCORING RESULTS:
 Lead Number  Probability  Lead_Score Lead_Quality
      615582     0.300000   19.984386          Low
      588939     0.120000   49.291836       Medium
      621242     0.000000   23.367918          Low
      589803     0.140000    0.431538          Low
      651441     0.000000    0.000000          Low
      650484     0.540000   13.002397          Low
      657038     0.980000  100.000000         High
      649329     0.170000    0.000000          Low
      587559     0.030000    0.000000          Low
      643897     1.000000  100.000000         High
      588006     0