In [1]:
import numpy as np
import pandas as pd

# Function to create the data structure
def data_structure(n, seed=42):
    """Build a synthetic table of ``n`` regulated entities.

    The frame has seven columns: a sequential 'Entity ID' (1..n) and six
    categorical risk factors. Each factor is sampled independently with the
    fixed probabilities listed below, so combinations such as an infraction
    type of 'None' together with a timeline of 'Within past year' can occur.

    Parameters
    ----------
    n : int
        Number of entities (rows) to generate.
    seed : int, optional
        Seed for the private random generator. The default (42) reproduces
        the data the notebook has always generated.

    Returns
    -------
    pandas.DataFrame
        One row per entity.
    """
    # Use a private legacy generator instead of np.random.seed(): it yields the
    # same stream for the same seed, but does not reseed NumPy's global RNG
    # behind the caller's back.
    rng = np.random.RandomState(seed)

    # Draw order matters for reproducibility: columns are sampled top to bottom.
    data = {
        'Entity ID': np.arange(1, n + 1),
        'Number of Clients Served Annually': rng.choice(['<200', '201-500', '>500'], n, p=[0.3, 0.4, 0.3]),
        'Past Infraction History Type': rng.choice(['None', 'Minor Infractions', 'Major Infractions'], n, p=[0.6, 0.3, 0.1]),
        'Past Infraction History Timeline': rng.choice(['None', 'Within past year', '1-3 years ago'], n, p=[0.6, 0.2, 0.2]),
        'Public Complaints Last Quarter': rng.choice(['None', 'Minor', 'Major'], n, p=[0.7, 0.2, 0.1]),
        'Quarterly Public Sentiment Analysis': rng.choice(['None', 'Flagged'], n, p=[0.8, 0.2]),
        'Previous Inspection Results': rng.choice(['Pass', 'Fail', 'None'], n, p=[0.5, 0.1, 0.4])
    }

    return pd.DataFrame(data)

# Function to calculate the risk score
# Points awarded for each value of each risk factor. Defined once at module
# level so it is not rebuilt for every row when used with DataFrame.apply.
# NOTE(review): '1-3 years ago' scores higher than 'Within past year' — confirm
# that older infractions are really meant to weigh more than recent ones.
_RISK_SCORE_MAP = {
    'Number of Clients Served Annually': {'<200': 1, '201-500': 2, '>500': 3},
    'Past Infraction History Type': {'None': 1, 'Minor Infractions': 2, 'Major Infractions': 3},
    'Past Infraction History Timeline': {'None': 1, 'Within past year': 2, '1-3 years ago': 3},
    'Public Complaints Last Quarter': {'None': 1, 'Minor': 2, 'Major': 3},
    'Quarterly Public Sentiment Analysis': {'None': 1, 'Flagged': 2},
    'Previous Inspection Results': {'Pass': 1, 'Fail': 2, 'None': 1}
}


def calculate_risk_score(row):
    """Return the additive risk score for one entity.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a value for each of the six risk-factor columns; any
        other keys (for example 'Entity ID') are ignored.

    Returns
    -------
    int
        Sum of the per-factor points; ranges from 6 (all lowest) to 16.

    Raises
    ------
    KeyError
        If a factor is missing from ``row`` or holds a value that has no
        entry in the score table.
    """
    return sum(points[row[category]] for category, points in _RISK_SCORE_MAP.items())

# Function to return the DataFrame with risk scores
def dummydf(n=5000):
    """Create the synthetic entity table and attach a 'Risk Score' column.

    ``n`` is the number of rows to generate (5000 by default). The returned
    DataFrame is the one produced by ``data_structure`` with one extra column
    holding ``calculate_risk_score`` evaluated on each row.
    """
    entities = data_structure(n)

    # axis=1 hands each row (not each column) to the scoring function.
    row_scores = entities.apply(calculate_risk_score, axis=1)
    entities['Risk Score'] = row_scores

    return entities

# Build the synthetic dataset with dummydf's default size (n=5000).
# The modelling cells below read this module-level `df`.
df = dummydf()
# Uncomment to preview the first rows: df.head()


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target is the integer risk score; predictors are every column except the
# row identifier and the target itself.
y = df['Risk Score']
X = df.drop(columns=['Entity ID', 'Risk Score'])

# One-hot encode the categorical predictors
X_encoded = pd.get_dummies(X)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest that treats each distinct score as a class
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The remaining metrics are averaged across classes, weighted by class support
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")


Accuracy: 0.97
Precision: 0.97
Recall: 0.97
F1 Score: 0.97


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Same target/predictor selection as the classifier cell, refit as regression
y = df['Risk Score']
X = df.drop(columns=['Entity ID', 'Risk Score'])

# One-hot encode the categorical predictors
X_encoded = pd.get_dummies(X)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest that predicts the score as a continuous value.
# This rebinds `model`, replacing the classifier from the previous cell.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")


Mean Squared Error: 0.01
R2 Score: 0.99


In [4]:
import joblib

# Persist whatever `model` currently refers to. When the cells run in order
# this is the RandomForestRegressor from the previous cell, not the classifier.
# The file is written to the current working directory.
joblib.dump(model, 'risk_model.pkl')


['risk_model.pkl']

In [5]:
# Streamlit App
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder

# Load the trained model from the current working directory.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
model = joblib.load('risk_model.pkl')

# Function to preprocess input data
def preprocess_input(data, model):
    """One-hot encode ``data`` and align it with the model's training columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw categorical inputs, one column per risk factor.
    model : fitted estimator
        Must expose ``feature_names_in_`` (set by scikit-learn when the
        estimator was fitted on a DataFrame).

    Returns
    -------
    pandas.DataFrame
        The dummy-encoded inputs with exactly the training columns, in
        training order. Categories absent from ``data`` are filled with 0.

    Raises
    ------
    AttributeError
        If ``model`` does not record its training feature names.
    """
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        raise AttributeError(
            "model has no 'feature_names_in_'; fit it on a DataFrame so the "
            "training column names are recorded"
        )

    data_encoded = pd.get_dummies(data)
    # Align on the training column *names*. The previous code used
    # feature_importances_.argsort(), which yields integer rank positions that
    # never match a dummy column, so every input collapsed to all zeros.
    return data_encoded.reindex(columns=list(feature_names), fill_value=0)

# Streamlit app
st.title('Regulatory Risk Prediction Tool')

# Collect the entity's attributes
st.header('Enter Entity Details')
# The ID is collected for display only; nothing below reads it.
entity_id = st.text_input('Entity ID')

# Widget label -> selectable values, in display order. Each label is also
# used as the column name of the single-row input frame.
field_options = {
    'Number of Clients Served Annually': ['<200', '201-500', '>500'],
    'Past Infraction History Type': ['None', 'Minor Infractions', 'Major Infractions'],
    'Past Infraction History Timeline': ['None', 'Within past year', '1-3 years ago'],
    'Public Complaints Last Quarter': ['None', 'Minor', 'Major'],
    'Quarterly Public Sentiment Analysis': ['None', 'Flagged'],
    'Previous Inspection Results': ['Pass', 'Fail', 'None'],
}

# One select box per risk factor; the chosen values form a one-row DataFrame
input_data = pd.DataFrame(
    {label: [st.selectbox(label, choices)] for label, choices in field_options.items()}
)

# Encode the selection the same way on every rerun, before the button check
input_data_encoded = preprocess_input(input_data, model)

# Only predict once the user asks for it
if st.button('Predict Risk Score'):
    risk_score = model.predict(input_data_encoded)[0]
    st.write(f'Predicted Risk Score: {risk_score}')

# Data visualization section
st.header('Data Visualization')

# Placeholder for data visualization
# You can add code here to visualize data using Plotly, Matplotlib, or other libraries

# IPython shell escape: starts the Streamlit server on app.py (relative path).
# NOTE(review): no visible cell writes app.py — presumably this cell's code is
# also saved there separately; confirm. Running the st.* calls above inside the
# kernel is what produces the "without `streamlit run`" warnings in the output.
!streamlit run app.py


2024-05-22 19:37:17.091 
  command:

    streamlit run /Users/mahnaz/vscodeProjects/dataTalent/regulatory proj/venv/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-05-22 19:37:17.094 Session state does not function when running a script without `streamlit run`


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://192.168.2.76:8502[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m


In [None]:
 # Streamlit App with Data Visualization Dashboard
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder
import plotly.express as px

# Load the trained model from the current working directory.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
model = joblib.load('risk_model.pkl')

# Function to preprocess input data
def preprocess_input(data, model):
    """One-hot encode ``data`` and align it with the model's training columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw categorical inputs, one column per risk factor.
    model : fitted estimator
        Must expose ``feature_names_in_`` (set by scikit-learn when the
        estimator was fitted on a DataFrame).

    Returns
    -------
    pandas.DataFrame
        The dummy-encoded inputs with exactly the training columns, in
        training order. Categories absent from ``data`` are filled with 0.

    Raises
    ------
    AttributeError
        If ``model`` does not record its training feature names.
    """
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        raise AttributeError(
            "model has no 'feature_names_in_'; fit it on a DataFrame so the "
            "training column names are recorded"
        )

    data_encoded = pd.get_dummies(data)
    # Align on the training column *names*. The previous code used
    # feature_importances_.argsort(), which yields integer rank positions that
    # never match a dummy column, so every input collapsed to all zeros.
    return data_encoded.reindex(columns=list(feature_names), fill_value=0)

# Streamlit app
def _risk_level(score):
    """Bucket a numeric risk score: below 7 is Low, 7 to 12 is Moderate, above 12 is High."""
    if score < 7:
        return 'Low'
    if score <= 12:
        return 'Moderate'
    return 'High'


st.title('Regulatory Risk Prediction Tool and Dashboard')

# Sidebar for navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Prediction", "Data Visualization"])

if page == "Prediction":
    st.header('Enter Entity Details for Prediction')
    # The ID is collected for display only; nothing below reads it.
    entity_id = st.text_input('Entity ID')

    # Widget label -> selectable values, in display order. Each label is also
    # used as the column name of the single-row input frame.
    field_options = {
        'Number of Clients Served Annually': ['<200', '201-500', '>500'],
        'Past Infraction History Type': ['None', 'Minor Infractions', 'Major Infractions'],
        'Past Infraction History Timeline': ['None', 'Within past year', '1-3 years ago'],
        'Public Complaints Last Quarter': ['None', 'Minor', 'Major'],
        'Quarterly Public Sentiment Analysis': ['None', 'Flagged'],
        'Previous Inspection Results': ['Pass', 'Fail', 'None'],
    }

    # One select box per risk factor; the chosen values form a one-row DataFrame
    input_data = pd.DataFrame(
        {label: [st.selectbox(label, choices)] for label, choices in field_options.items()}
    )

    # Encode the selection the same way on every rerun, before the button check
    input_data_encoded = preprocess_input(input_data, model)

    # Only predict once the user asks for it
    if st.button('Predict Risk Score'):
        risk_score = model.predict(input_data_encoded)[0]
        st.write(f'Predicted Risk Score: {risk_score}')

elif page == "Data Visualization":
    st.header('Data Visualization Dashboard')

    # Regenerate the synthetic dataset. dummydf is neither defined nor imported
    # in this cell; it comes from the first notebook cell.
    df = dummydf()

    # Label every entity with its risk bucket, then count entities per bucket
    df['Risk Level'] = df['Risk Score'].apply(_risk_level)
    risk_counts = (
        df['Risk Level']
        .value_counts()
        .rename_axis('Risk Level')
        .reset_index(name='Count')
    )

    # Share of entities in each risk bucket
    st.subheader('Proportion of Risk Levels')
    pie_fig = px.pie(risk_counts, names='Risk Level', values='Count', title='Proportion of Risk Levels')
    st.plotly_chart(pie_fig)

    # Histogram of the raw scores
    st.subheader('Risk Level Distribution')
    hist_fig = px.histogram(df, x='Risk Score', nbins=10, title='Risk Score Distribution')
    st.plotly_chart(hist_fig)

    # Drill-down: optionally restrict the table to a single risk bucket
    st.subheader('Drill-Down on Entities')
    risk_level_filter = st.selectbox('Select Risk Level to View Details', options=['All', 'Low', 'Moderate', 'High'])

    filtered_df = df if risk_level_filter == 'All' else df[df['Risk Level'] == risk_level_filter]

    st.dataframe(filtered_df)

    # Additional visualizations can be added here

    # Additional visualizations can be added here


2024-05-21 11:00:42.149 
  command:

    streamlit run /Users/mahnaz/vscodeProjects/dataTalent/regulatory proj/venv/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-05-21 11:00:42.151 Session state does not function when running a script without `streamlit run`
