<a href="https://colab.research.google.com/github/ShailenderGoyal/Team_2_-Salesforce_Hackathon/blob/main/Enigmatrix_ML_APIs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install Dependencies
!pip install xgboost lightgbm shap fastapi uvicorn pyngrok scikit-learn pandas numpy joblib nest_asyncio -q
!pip install fastapi uvicorn nest_asyncio pyngrok langchain sentence-transformers google-generativeai

In [None]:
# Cell 2: Import Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import shap
import joblib
import nest_asyncio
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from pyngrok import ngrok
import json
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Cell 3: Generate Synthetic Data
# Per-feature loading, listed in model-input column order. The same numbers are
# used twice further down: as the correlation loadings of the synthetic-data
# generator and as the weights of the weighted sum behind `credit_score`
# (negative entries lower the score).
features = dict(
    cdr_call_frequency=0.35,
    cdr_call_duration=0.30,
    num_contacts=0.25,
    upi_transaction_frequency=0.40,
    avg_mobile_wallet_balance=0.38,
    bill_payment_timeliness=0.45,
    residential_stability=0.30,
    health_insurance=0.28,
    app_usage_hours_per_week=0.20,
    number_of_livestock=0.16,
    shg_repayment_consistency=0.42,
    age_years=0.10,
    num_dependents=-0.10,
    subsidy_inflow_regularity=0.35,
    land_holdings_acres=0.38,
    num_two_wheelers=0.25,
    num_four_wheelers=0.32,
    gold_holding_value=0.28,
    num_formal_loans=0.30,
    num_informal_loans=-0.35,
    marital_status=0.05,
)

def generate_updated_synthetic_data(n=10000, seed=42):
    """Build a reproducible synthetic table with one column per key of ``features``.

    Latent values come from a zero-mean multivariate normal whose covariance
    between two different columns is the product of their ``features``
    loadings (1.0 on the diagonal). Every latent column is then mapped onto
    its own value range; ``marital_status`` is replaced by an independent
    0/1 draw with probabilities 0.4 / 0.6.

    Args:
        n: Number of rows to generate.
        seed: Passed to ``np.random.seed`` (this reseeds NumPy's global generator).

    Returns:
        pandas.DataFrame with ``n`` rows and the keys of ``features`` as columns.
    """
    np.random.seed(seed)

    columns = list(features)
    loadings = np.array([features[name] for name in columns])
    covariance = np.outer(loadings, loadings)
    np.fill_diagonal(covariance, 1.0)

    latent = np.random.multivariate_normal(np.zeros(len(columns)), covariance, size=n)
    z = pd.DataFrame(latent, columns=columns)  # untouched latent draws
    df = z.copy()                              # receives the transformed columns

    def unit_range(series):
        # Min-max rescale so the smallest value maps to 0 and the largest to 1.
        return (series - series.min()) / (series.max() - series.min())

    def bounded(series, lower, upper, whole=False):
        # Clamp to [lower, upper] (None = no bound); optionally round to whole numbers.
        limited = np.clip(series, lower, upper)
        return limited.round() if whole else limited

    df['cdr_call_frequency'] = bounded(np.exp(z['cdr_call_frequency']) * 5, 5, 150, whole=True)
    df['cdr_call_duration'] = bounded(np.exp(z['cdr_call_duration']) * 2, 5, 600)
    df['num_contacts'] = bounded(np.exp(z['num_contacts']) * 2, 10, 250, whole=True)
    df['upi_transaction_frequency'] = bounded(np.exp(z['upi_transaction_frequency']) * 2, 5, 250, whole=True)
    df['avg_mobile_wallet_balance'] = bounded(1500 + 500 * z['avg_mobile_wallet_balance'], 0, None)
    df['bill_payment_timeliness'] = unit_range(z['bill_payment_timeliness'])
    df['residential_stability'] = bounded(5 + 2 * z['residential_stability'], 0, None)
    df['health_insurance'] = bounded(1 + z['health_insurance'], 0, 1)
    df['app_usage_hours_per_week'] = bounded(np.exp(z['app_usage_hours_per_week']), 0, 80)
    df['number_of_livestock'] = bounded(np.exp(z['number_of_livestock']), 0, 50, whole=True)
    df['shg_repayment_consistency'] = unit_range(z['shg_repayment_consistency'])
    df['age_years'] = bounded(40 + 10 * z['age_years'], 18, 90, whole=True)
    df['num_dependents'] = bounded(z['num_dependents'] + 2, 0, 10, whole=True)
    df['subsidy_inflow_regularity'] = unit_range(z['subsidy_inflow_regularity'])
    df['land_holdings_acres'] = bounded(np.exp(z['land_holdings_acres']), 0, 20)
    df['num_two_wheelers'] = bounded(z['num_two_wheelers'] + 1, 0, 3, whole=True)
    df['num_four_wheelers'] = bounded(z['num_four_wheelers'], 0, 2, whole=True)
    df['gold_holding_value'] = bounded(10 + 5 * z['gold_holding_value'], 0, None)
    df['num_formal_loans'] = bounded(z['num_formal_loans'] + 1, 0, 10, whole=True)
    df['num_informal_loans'] = bounded(z['num_informal_loans'] + 1, 0, 5, whole=True)

    # Drawn last so the random stream is consumed in the same order as before.
    df['marital_status'] = np.random.choice([0, 1], size=n, p=[0.4, 0.6])

    return df

# Generate the table, then derive the target: a weighted sum of the min-max
# scaled features (weights taken from `features`), stretched onto 1..100 and
# rounded to an integer.
df = generate_updated_synthetic_data()

scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[list(features)])
weights = np.array([*features.values()])
weighted_sum = scaled.dot(weights)

df['credit_score'] = (
    1 + 99 * (weighted_sum - weighted_sum.min()) / (weighted_sum.max() - weighted_sum.min())
)
df['credit_score'] = df['credit_score'].round().astype(int)

In [None]:
# Cell 4: Train-Test Split
# Target is `credit_score`; every other column is a model input.
# 20 % of the rows are held out, with a fixed seed so the split is repeatable.
y = df['credit_score']
X = df.drop(columns=y.name)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Cell 5: Train Ensemble Model
def _fit_preferring_gpu(label, make_gpu_model, make_cpu_model, gpu_errors):
    """Fit a regressor on the GPU when possible, otherwise on the CPU.

    Args:
        label: Name shown in the fallback message.
        make_gpu_model: Zero-argument factory returning an unfitted GPU-configured regressor.
        make_cpu_model: Zero-argument factory returning an unfitted CPU regressor.
        gpu_errors: Exception type(s) the library raises when GPU training is not available.

    Returns:
        The fitted regressor (trained on ``X_train`` / ``y_train``).
    """
    try:
        return make_gpu_model().fit(X_train, y_train)
    except gpu_errors as exc:
        # No usable GPU (or a CPU-only build): keep the notebook runnable end to end.
        print(f"{label}: GPU training unavailable ({exc}); training on CPU instead.")
        return make_cpu_model().fit(X_train, y_train)


# `tree_method='gpu_hist'` / `gpu_id` are deprecated since XGBoost 2.0;
# `device='cuda'` together with the `hist` tree method is the supported spelling.
xgb_model = _fit_preferring_gpu(
    "XGBoost",
    lambda: xgb.XGBRegressor(tree_method='hist', device='cuda'),
    lambda: xgb.XGBRegressor(),
    xgb.core.XGBoostError,
)
lgb_model = _fit_preferring_gpu(
    "LightGBM",
    lambda: lgb.LGBMRegressor(device='gpu'),
    lambda: lgb.LGBMRegressor(),
    lgb.basic.LightGBMError,
)

In [None]:
# Cell 6: SHAP Explanation
# Global importance from the XGBoost model: mean |SHAP value| per feature over
# the first 100 held-out rows, min-max rescaled onto [1, 100] and rounded to
# two decimals, keyed by column name.
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test.iloc[:100])
mean_abs_shap = np.mean(np.abs(shap_values.values), axis=0)
scaler_shap = MinMaxScaler(feature_range=(1, 100))
importance_scaled = scaler_shap.fit_transform(mean_abs_shap[:, np.newaxis]).ravel()
feature_importance_dict = {
    column: score
    for column, score in zip(X.columns, importance_scaled.round(2))
}


In [None]:
# Cell 7: Save Models
# Persist both regressors and the SHAP-based importance table to the working directory.
joblib.dump(value=xgb_model, filename="xgb_model.pkl")
joblib.dump(value=lgb_model, filename="lgb_model.pkl")
joblib.dump(value=feature_importance_dict, filename="feature_importance.pkl")

In [None]:
pip install langchain_community

In [None]:
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument

import google.generativeai as genai
import os
import importlib
import datetime

# Add your Gemini API key. A key already exported as GOOGLE_API_KEY is kept;
# the placeholder below is only used when the variable is not set.
os.environ.setdefault("GOOGLE_API_KEY", "your_key_here")
class RAGGeminiSystem:
    """Retrieval-augmented question answering on top of FAISS and Gemini.

    Documents are split into overlapping chunks, embedded with a
    sentence-transformers model and stored in a FAISS vector store. A question
    is answered by retrieving the closest chunks and handing them to Gemini
    as context.
    """

    # Returned when Gemini yields no usable text. It is never written back to
    # the vector store, so it cannot be retrieved as "knowledge" later.
    FALLBACK_ANSWER = "I don't have enough information to answer that question."

    def __init__(self):
        """Set up the text splitter and the Gemini client.

        The embedding model is loaded lazily on first use.

        Raises:
            RuntimeError: if the Gemini client cannot be initialised.
        """
        self.embedding_model = None
        self.vector_store = None

        self.embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )

        self._setup_gemini_api()
        print("RAG-Gemini system initialized.")

    def _setup_gemini_api(self):
        """Configure ``google.generativeai`` from the GOOGLE_API_KEY environment variable.

        Raises:
            RuntimeError: if the key is missing or configuration fails
                (the original error is chained as the cause).
        """
        try:
            api_key = os.getenv("GOOGLE_API_KEY")
            if not api_key:
                raise ValueError("Missing GOOGLE_API_KEY environment variable.")
            genai.configure(api_key=api_key)

            self.gemini_model = genai.GenerativeModel("models/gemini-2.0-flash-lite")
        except Exception as e:
            raise RuntimeError(f"Gemini API initialization failed: {str(e)}") from e

    def _load_embedding_model(self):
        """Create the HuggingFace embedding model on first call; later calls are no-ops.

        Raises:
            RuntimeError: if the embedding backend cannot be imported.
        """
        if self.embedding_model is None:
            print("🔄 Loading embedding model...")
            try:
                importlib.invalidate_caches()
                self.embedding_model = HuggingFaceEmbeddings(
                    model_name=self.embedding_model_name
                )
                print("Embedding model loaded.")
            except ImportError as e:
                raise RuntimeError(
                    "sentence-transformers is not installed. Run: pip install sentence-transformers"
                ) from e

    def add_documents(self, documents):
        """Chunk, embed and index a batch of documents.

        Args:
            documents: Iterable of dicts with a ``"content"`` string and an
                optional ``"metadata"`` dict (a missing or empty value is
                treated as ``{}``).

        Returns:
            dict with ``"status"`` (``"success"`` or ``"error"``) and a
            human-readable ``"message"``. Errors are reported in the return
            value rather than raised.
        """
        try:
            self._load_embedding_model()

            all_chunks = []
            for doc in documents:
                metadata = doc.get("metadata") or {}
                for chunk in self.text_splitter.split_text(doc["content"]):
                    # Each chunk gets its own copy so editing one chunk's
                    # metadata cannot leak into its siblings.
                    all_chunks.append(
                        LangchainDocument(page_content=chunk, metadata=dict(metadata))
                    )

            if not all_chunks:
                return {"status": "error", "message": "No valid document content to index."}

            if self.vector_store is None:
                self.vector_store = FAISS.from_documents(all_chunks, self.embedding_model)
            else:
                self.vector_store.add_documents(all_chunks)

            return {
                "status": "success",
                "message": f"Added {len(documents)} documents with {len(all_chunks)} total chunks."
            }

        except Exception as e:
            return {"status": "error", "message": f"Failed to add documents: {str(e)}"}

    def answer_question(self, question, top_k=3, store_response=True):
        """Answer ``question`` from the indexed documents using Gemini.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve as context.
            store_response: When true, a genuine Gemini answer is indexed as a
                new document (source ``"gemini_response"``). The canned
                fallback answer is never stored.

        Returns:
            On success: ``{"status": "success", "answer": str, "sources": [...]}``
            where each source has the chunk ``content`` and ``metadata``.
            On failure (including an empty knowledge base):
            ``{"status": "error", "message": str}``.
        """
        try:
            if self.vector_store is None:
                return {"status": "error", "message": "Knowledge base is empty."}

            self._load_embedding_model()

            docs = self.vector_store.similarity_search(question, k=top_k)
            combined_context = "\n\n".join(doc.page_content for doc in docs)

            prompt = (
                f"You are a helpful assistant with access to the following context:\n\n"
                f"{combined_context}\n\n"
                f"Based on the above information, answer the following question and respond cleanly without any markup symbols:\n"
                f"{question}"
            )

            # Use Gemini to generate response
            response = self.gemini_model.generate_content(prompt)

            text = getattr(response, "text", None)
            generated = text.strip() if text else ""
            answer = generated or self.FALLBACK_ANSWER

            # Optionally add Gemini's answer to the vector DB. Only real model
            # output is stored; indexing the fallback would pollute retrieval.
            if store_response and generated:
                metadata = {
                    "source": "gemini_response",
                    "question": question,
                    "timestamp": datetime.datetime.now().isoformat()
                }
                self.add_documents([{"content": generated, "metadata": metadata}])

            return {
                "status": "success",
                "answer": answer,
                "sources": [{"content": doc.page_content, "metadata": doc.metadata} for doc in docs]
            }

        except Exception as e:
            return {"status": "error", "message": f"Failed to answer question: {str(e)}"}


In [None]:
# Cell 8: Define FastAPI Inference App with Credit Scoring + RAG-Gemini Support
!pip install fastapi uvicorn nest_asyncio pyngrok shap sentence-transformers langchain google-generativeai

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn
import nest_asyncio
from pyngrok import ngrok
import shap
import numpy as np
import pandas as pd

# === Credit Model Setup ===
# SHAP explainer for the XGBoost half of the ensemble; reused by every /predict call.
explainer = shap.Explainer(xgb_model)

# Initialize FastAPI and CORS
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    # Wildcard origins/methods/headers must not be combined with credentials
    # (see the FastAPI CORS docs). The endpoints below use no cookies or auth
    # headers, so credentialed cross-origin requests are switched off.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Credit Prediction Schema ===
class InputFeatures(BaseModel):
    # Request body for POST /predict: one applicant's raw (unscaled) feature values.
    # Field names and their order mirror the keys of the `features` dict defined
    # earlier in the notebook, i.e. the columns the models were trained on.
    # /predict turns these fields into a one-row DataFrame in declaration order,
    # so keep this list in sync with `features`.
    cdr_call_frequency: float
    cdr_call_duration: float
    num_contacts: float
    upi_transaction_frequency: float
    avg_mobile_wallet_balance: float
    bill_payment_timeliness: float
    residential_stability: float
    health_insurance: float
    app_usage_hours_per_week: float
    number_of_livestock: float
    shg_repayment_consistency: float
    age_years: float
    num_dependents: float
    subsidy_inflow_regularity: float
    land_holdings_acres: float
    num_two_wheelers: float
    num_four_wheelers: float
    gold_holding_value: float
    num_formal_loans: float
    num_informal_loans: float
    marital_status: int  # generated as 0 or 1 in the synthetic training data

@app.post("/predict")
def predict_credit_score(features: InputFeatures):
    """Score one applicant with the XGBoost + LightGBM ensemble.

    Note: the parameter shadows the module-level ``features`` dict inside this
    function only.

    Args:
        features: Validated request body with one value per model input.

    Returns:
        dict with
        - ``prediction``: equal-weight average of the two model predictions;
        - ``shap_scores``: per-feature |SHAP| from the XGBoost model, scaled
          so the most influential feature of this request maps to 100 and a
          feature with no contribution maps to 1.
    """
    # Pydantic v2 renamed .dict() to .model_dump(); use whichever is available.
    to_dict = getattr(features, "model_dump", None) or features.dict
    input_df = pd.DataFrame([to_dict()])

    # Predict using ensemble
    xgb_pred = xgb_model.predict(input_df)[0]
    lgbm_pred = lgb_model.predict(input_df)[0]
    final_pred = (0.5 * xgb_pred) + (0.5 * lgbm_pred)

    # SHAP explanation
    shap_values = explainer(input_df)
    abs_vals = np.abs(shap_values.values[0])
    # Guard against division by zero when every contribution is exactly 0.
    max_val = abs_vals.max() if abs_vals.max() != 0 else 1.0

    normalized_scores = {
        feature: int(1 + 99 * (val / max_val))
        for feature, val in zip(input_df.columns, abs_vals)
    }

    return {
        "prediction": float(final_pred),
        "shap_scores": normalized_scores
    }



# Single module-level RAG instance used by both /rag endpoints below, so
# documents added through one request are available to later questions.
# Constructing it configures the Gemini client from GOOGLE_API_KEY.
rag_system = RAGGeminiSystem()

class DocumentItem(BaseModel):
    # One document submitted to POST /rag/add_documents.
    content: str  # text that is split into chunks and indexed
    metadata: dict = {}  # optional; attached to the chunks produced from `content`

class AddDocumentsRequest(BaseModel):
    # Request body for POST /rag/add_documents: a batch of documents to index.
    documents: list[DocumentItem]

class AskQuestionRequest(BaseModel):
    # Request body for POST /rag/ask_question.
    question: str
    top_k: int = 3  # number of chunks retrieved from the vector store as context
    store_response: bool = True  # when true, the generated answer is added to the index

@app.post("/rag/add_documents")
async def add_documents(request: AddDocumentsRequest):
    # Convert the validated items to the plain dicts the RAG system expects,
    # then return its status payload unchanged.
    payload = []
    for item in request.documents:
        payload.append({"content": item.content, "metadata": item.metadata})
    return rag_system.add_documents(payload)

@app.post("/rag/ask_question")
async def ask_question(request: AskQuestionRequest):
    # Forward the request fields to the shared RAG system and return its
    # result dict (answer plus sources, or an error status) as the response.
    result = rag_system.answer_question(
        request.question,
        request.top_k,
        request.store_response,
    )
    return result




In [None]:
#deployment
import nest_asyncio
import uvicorn
import os
from pyngrok import ngrok


# Port shared by the ngrok tunnel and the uvicorn server (kept in one place so
# the two cannot drift apart).
PORT = 8056

# Prefer a token exported as NGROK_AUTH_TOKEN; otherwise replace the placeholder
# with your own token. Avoid committing a real token to the notebook.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "your_actual_token_here")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# Create a public URL
public_url = ngrok.connect(PORT)
print(f"Public URL: {public_url}")

# Apply nest_asyncio to allow nested event loops (the notebook kernel already runs one)
nest_asyncio.apply()

# Start the FastAPI server; this call blocks the cell until the server is stopped
uvicorn.run(app, host="0.0.0.0", port=PORT)