In [9]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("/content/dhan_setu_modified_updated (1).csv")

# Drop duplicates and handle missing values
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Merge overlapping classes (2 -> 3 and 5 -> 6)
df["Investment Recommendation"] = df["Investment Recommendation"].replace({2: 3, 5: 6})

# Encode categorical features
label_encoders = {}
categorical_cols = ["Occupation", "Risk Tolerance", "Investment Duration", "Financial Goals", "Liquidity Needs", "Investment Recommendation"]

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoder for later use

# Define features and target
X = df.drop(columns=["Investment Recommendation"])
y = df["Investment Recommendation"]

# Scale numerical features
scaler = StandardScaler()
X[X.select_dtypes(include=["int64", "float64"]).columns] = scaler.fit_transform(X.select_dtypes(include=["int64", "float64"]))

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train an XGBoost Classifier
xgb = XGBClassifier(n_estimators=160, max_depth=10, learning_rate=0.1, subsample=1, random_state=42, eval_metric="mlogloss")
xgb.fit(X_train, y_train)

# Make predictions
y_pred = xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Updated Model Accuracy: {accuracy:.2f}")

# Display classification report and confusion matrix
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save the trained model and encoders
joblib.dump(xgb, "investment_classifier_updated.pkl")
joblib.dump(label_encoders, "label_encoders_updated.pkl")
joblib.dump(scaler, "scaler_updated.pkl")

print("Updated Model and encoders saved successfully!")


Updated Model Accuracy: 0.78

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       403
           1       0.56      0.54      0.55       403
           2       0.56      0.58      0.57       403
           3       1.00      1.00      1.00       403
           4       1.00      1.00      1.00       403
           5       0.69      0.61      0.65       403
           6       0.65      0.72      0.68       403

    accuracy                           0.78      2821
   macro avg       0.78      0.78      0.78      2821
weighted avg       0.78      0.78      0.78      2821


Confusion Matrix:
 [[403   0   0   0   0   0   0]
 [  0 217 186   0   0   0   0]
 [  0 170 233   0   0   0   0]
 [  0   0   0 403   0   0   0]
 [  0   0   0   0 403   0   0]
 [  0   0   0   0   0 247 156]
 [  0   0   0   0   0 113 290]]
Updated Model and encoders saved successfully!


In [34]:
import joblib
import pandas as pd

# Load the trained model and encoders
model = joblib.load("investment_classifier_updated.pkl")
label_encoders = joblib.load("label_encoders_updated.pkl")
scaler = joblib.load("scaler_updated.pkl")

# Function to encode categorical inputs
def encode_input(column_name, value):
    if column_name in label_encoders:
        # Handle unseen labels by returning a default value or raising a warning
        try:
            return label_encoders[column_name].transform([value])[0]
        except ValueError:
            print(f"Warning: Unseen label '{value}' encountered for column '{column_name}'. Returning -1.")
            return -1  # Or any other appropriate default value
    return value

# User inputs (Replace these values with real inputs)#23,Teacher,Medium,Long-term,Wealth Building,500000,High
user_data = {
    "Age": 38,
    "Occupation": "Doctor",  # Categorical
    "Risk Tolerance": "High",  # Categorical
    "Investment Duration": "Mid-Term",  # Categorical
    "Financial Goals": "Wealth Building",  # Categorical
    "Investment Amount": 600000,
    "Liquidity Needs": "High",  # Categorical
    "Annual Income":1800000
}

# Convert inputs into a DataFrame
input_df = pd.DataFrame([user_data])

# Encode categorical inputs
for col in input_df.columns:
    input_df[col] = encode_input(col, input_df[col].values[0])

# Scale numerical features
input_df[input_df.select_dtypes(include=["int64", "float64"]).columns] = scaler.transform(input_df)

# Make prediction
predicted_class = model.predict(input_df)
decoded_prediction = label_encoders["Investment Recommendation"].inverse_transform(predicted_class)

print("\n🔹 Recommended Investment Option:", decoded_prediction[0])


🔹 Recommended Investment Option: Stocks: 50%; Mutual Funds - Small Cap: 30%; Mutual Funds - Mid Cap (Lumpsum): 20%
