In [None]:
import random
import pandas as pd

# Number of synthetic entities (rows) to generate
num_entities = 5000

# Candidate values for every attribute, each paired with the sampling
# weights that are handed to random.choices when a row is generated.

# Annual-client bands as (low, high, weight); random_clients() draws from
# low..high with both ends included.
clients_range = [
    (0, 200, 40),
    (201, 500, 30),
    (501, 1000, 30),
]

# Severity of the recorded infraction
infraction_types = [
    'None',
    'Minor',
    'Major',
]
infraction_type_probs = [
    0.5,
    0.3,
    0.2,
]

# How recently the infraction happened
infraction_timelines = [
    'None',
    'Within past year',
    'Within past 1-3 years',
]
infraction_timeline_probs = [
    0.5,
    0.25,
    0.25,
]

# Severity of public complaints
complaints = [
    'None',
    'Minor',
    'Major',
]
complaints_probs = [
    0.6,
    0.25,
    0.15,
]

# Sentiment-analysis outcome
sentiments = [
    'None',
    'Flagged',
]
sentiments_probs = [
    0.7,
    0.3,
]

# Outcome of the most recent inspection ('None' = no result recorded)
inspection_results = [
    'Pass',
    'Fail',
    'None',
]
inspection_results_probs = [
    0.5,
    0.25,
    0.25,
]

# Function to generate a random number of clients
def random_clients(ranges=None):
    """Draw a random annual client count from weighted, inclusive bands.

    Args:
        ranges: Optional iterable of ``(low, high, weight)`` tuples. One band
            is selected with probability proportional to its ``weight``; the
            count is then drawn uniformly from ``low..high`` (both ends
            included, as ``random.randint`` does). When omitted, the
            module-level ``clients_range`` is used.

    Returns:
        An integer client count that lies inside the selected band.
    """
    # Look up the module-level default only when no bands were supplied.
    bands = list(clients_range if ranges is None else ranges)
    weights = [weight for _, _, weight in bands]
    low, high, _ = random.choices(bands, weights=weights)[0]
    return random.randint(low, high)

# Calculate the risk score for a single entity
def _risk_score(annual_clients, infraction_type, infraction_timeline,
                public_complaints, sentiment_analysis, inspection_result):
    """Return the total risk score: the sum of the points of each attribute.

    Points per attribute:
        annual_clients:      1 (<= 200), 2 (201-500), 3 (> 500)
        infraction_type:     0 ('None'), 2 ('Minor'), 3 (anything else)
        infraction_timeline: 0 ('None'), 3 ('Within past year'), 2 (anything else)
        public_complaints:   0 ('None'), 2 ('Minor'), 3 (anything else)
        sentiment_analysis:  0 ('None'), 3 (anything else)
        inspection_result:   0 ('Pass'), 3 ('Fail'), 1 (anything else)
    """
    score = 0
    # The thresholds mirror the bands in clients_range (0-200, 201-500,
    # 501-1000). Exactly 200 clients belongs to the lowest band, so the
    # comparison must be "<=": a strict "<" scores 200 as the middle band.
    score += 1 if annual_clients <= 200 else 2 if annual_clients <= 500 else 3
    score += 0 if infraction_type == 'None' else 2 if infraction_type == 'Minor' else 3
    # A more recent infraction weighs more than an older one.
    score += 0 if infraction_timeline == 'None' else 3 if infraction_timeline == 'Within past year' else 2
    score += 0 if public_complaints == 'None' else 2 if public_complaints == 'Minor' else 3
    score += 0 if sentiment_analysis == 'None' else 3
    # A missing inspection result still adds one point.
    score += 0 if inspection_result == 'Pass' else 3 if inspection_result == 'Fail' else 1
    return score


# Generate the data: one row per entity, attributes sampled independently
data = []
for i in range(1, num_entities + 1):
    # IDs are zero-padded to at least four digits: E0001, E0002, ...
    entity_id = f"E{i:04d}"
    annual_clients = random_clients()
    # NOTE(review): type and timeline are drawn independently, so a row can
    # have infraction type 'None' together with a non-'None' timeline (and
    # vice versa). Confirm whether the two should be sampled jointly.
    infraction_type = random.choices(infraction_types, infraction_type_probs)[0]
    infraction_timeline = random.choices(infraction_timelines, infraction_timeline_probs)[0]
    public_complaints = random.choices(complaints, complaints_probs)[0]
    sentiment_analysis = random.choices(sentiments, sentiments_probs)[0]
    inspection_result = random.choices(inspection_results, inspection_results_probs)[0]

    risk_score = _risk_score(
        annual_clients, infraction_type, infraction_timeline,
        public_complaints, sentiment_analysis, inspection_result,
    )

    data.append([
        entity_id, annual_clients, infraction_type, infraction_timeline,
        public_complaints, sentiment_analysis, inspection_result, risk_score
    ])

# Assemble the generated rows into a table; the column order matches the
# order in which each row's values were appended.
df = pd.DataFrame(
    data=data,
    columns=[
        'Entity ID',
        'Annual Clients',
        'Infraction Type',
        'Infraction Timeline',
        'Public Complaints',
        'Sentiment Analysis',
        'Inspection Results',
        'Total Risk Score',
    ],
)

# Write the table to data.csv, leaving out the DataFrame index
df.to_csv(path_or_buf='data.csv', index=False)


: 

In [None]:
# Preview the first five rows of the generated table
df.head(n=5)

: 

: 