# Dummy Data for Project

In [16]:
import pandas as pd
import numpy as np

np.random.seed(42)

# Number of unique clients and total issues
num_clients = 1200
total_issues = 10000

# Countries and their distribution
primary_countries = ['US', 'UK', 'Canada']
additional_countries = ['Germany', 'France', 'Australia', 'Japan', 'India', 'Brazil', 'South Africa']
all_countries = primary_countries + additional_countries


# Distribution: 50% US, 20% UK, 15% Canada, 15% for all other countries evenly
#country_distribution
raw_distribution = [0.5, 0.2, 0.15] + [(0.15 / len(additional_countries)) for _ in additional_countries]
normalized_distribution = np.array(raw_distribution) / np.sum(raw_distribution)

# Sample countries for clients
client_countries = np.random.choice(all_countries, num_clients, p=normalized_distribution)

# Country-specific states/provinces
country_states = {
    'US': ['California', 'Texas', 'Florida', 'New York', 'Illinois', 'Pennsylvania', 'Ohio', 'Georgia', 'North Carolina', 'Michigan'],
    'Canada': ['Ontario', 'Quebec', 'British Columbia', 'Alberta', 'Manitoba', 'Saskatchewan', 'Nova Scotia', 'New Brunswick', 'Newfoundland and Labrador', 'Prince Edward Island'],
    'UK': ['England', 'Scotland', 'Wales', 'Northern Ireland'],
    'Germany': ['Bavaria', 'Berlin', 'Hamburg'],
    'France': ['Île-de-France', 'Provence-Alpes-Côte d\'Azur'],
    'Australia': ['New South Wales', 'Victoria'],
    'Japan': ['Tokyo', 'Osaka'],
    'India': ['Maharashtra', 'Delhi'],
    'Brazil': ['São Paulo', 'Rio de Janeiro'],
    'South Africa': ['Gauteng', 'Western Cape'],
}

# Assign states based on country
def assign_state(country):
    states = country_states.get(country, ['Generic State'])
    return np.random.choice(states)

client_states = [assign_state(country) for country in client_countries]

# Generate other client data
client_ids = range(1, num_clients + 1)
client_names = [f"Client {i}" for i in client_ids]
money = np.random.uniform(1, 1000, num_clients).round(2)  # MONEY for each client

# DataFrame
clients_df = pd.DataFrame({
    'Client ID': client_ids,
    'Client Name': client_names,
    'Country': client_countries,
    'State': client_states,
    'Money ($M)': money,
})


# Date generation and concatenation segment
dates_2022 = pd.date_range(start="2022-01-01", end="2022-12-31").strftime("%m/%d/%Y")
dates_2023 = pd.date_range(start="2023-01-01", end="2023-12-31").strftime("%m/%d/%Y")
dates_2024_series = pd.Series(pd.date_range(start="2024-01-01", end="2024-12-31").strftime("%m/%d/%Y"))
dates_2025 = pd.date_range(start="2025-01-01", end="2025-12-31").strftime("%m/%d/%Y")
dates_2026 = pd.date_range(start="2026-01-01", end="2026-12-31").strftime("%m/%d/%Y")

# Duplicate 2024 dates to increase their representation
dates_2024_duplicated = pd.concat([dates_2024_series, dates_2024_series])

# Combine all dates into a single Series and shuffle
all_dates = pd.concat([pd.Series(dates_2022), pd.Series(dates_2023), dates_2024_duplicated, pd.Series(dates_2025), pd.Series(dates_2026)]).sample(frac=1, random_state=42)

# Select the first 10,000 dates
final_dates = all_dates[:10000].tolist()


# Statuses with most closed
statuses = np.random.choice(["Open", "Closed"], total_issues, p=[0.35, 0.65])


# Generate 10,000 total issues
issues_per_client = np.random.multinomial(total_issues, [1/num_clients]*num_clients)

issues_list = []
issue_counter = 0  # Counter to assign dates and statuses sequentially


for client_index, num_issues in enumerate(issues_per_client, start=1):
    for issue in range(num_issues):
        client_row = clients_df.loc[clients_df['Client ID'] == client_index].copy()
        client_row['Issue ID'] = f'Issue {issue_counter + 1}'
        client_row['Issue Date'] = dates[issue_counter]
        client_row['Status'] = statuses[issue_counter]
        client_row['Issue Value'] = np.random.randint(1, 100)  # Example issue-specific detail
        issues_list.append(client_row)
        issue_counter += 1  # Increment counter for each issue


# Combine all issues into a single DataFrame
all_issues_df = pd.concat(issues_list, ignore_index=True)

# Generate 'Type' for each issue based on specified distribution
types = ['Method 1', 'Method 2', 'Method 3']
type_distribution = [0.45, 0.45, 0.10]
investor_types = np.random.choice(types, total_issues, p=type_distribution)

# Add 'Type' column to all_issues_df
all_issues_df['Type'] = investor_types

# Now all_issues_df includes 'Issue Date', 'Status', and 'Type' columns
print(all_issues_df.head())


all_issues_df.to_csv(r'.....', index=False)


   Client ID Client Name Country State  Money ($M) Issue ID  Issue Date  \
0          1    Client 1      US  Ohio      654.52  Issue 1  05/31/2023   
1          1    Client 1      US  Ohio      654.52  Issue 2  03/24/2024   
2          1    Client 1      US  Ohio      654.52  Issue 3  05/13/2024   
3          1    Client 1      US  Ohio      654.52  Issue 4  11/05/2023   
4          1    Client 1      US  Ohio      654.52  Issue 5  12/20/2022   

   Status  Issue Value      Type  
0    Open           12  Method 1  
1  Closed           88  Method 2  
2  Closed           77  Method 2  
3    Open           66  Method 2  
4    Open           54  Method 2  
