In [17]:
import pandas as pd
import numpy as np

# Baseline synthetic dataset: every feature is drawn independently from a
# uniform distribution. The legacy global NumPy seed is fixed so that
# re-running this cell reproduces exactly the same rows.
np.random.seed(42)

n_committees = 5000

# Sequential identifiers 1..n_committees.
committee_ids = np.arange(n_committees) + 1

# Integer-valued features. `randint` excludes the upper bound, so e.g.
# committee_members falls in 10..99.
# NOTE: the order of these draws determines the generated values; keep it.
committee_members = np.random.randint(low=10, high=100, size=n_committees)
recent_posts = np.random.randint(low=5, high=60, size=n_committees)
events = np.random.randint(low=5, high=30, size=n_committees)

# Ratings drawn uniformly between 1.0 and 10.0, rounded to one decimal place.
user_rating = np.random.uniform(low=1.0, high=10.0, size=n_committees).round(1)

# Assemble one row per committee.
df = pd.DataFrame(
    dict(
        committee_id=committee_ids,
        committee_members=committee_members,
        recent_posts=recent_posts,
        events=events,
        user_rating=user_rating,
    )
)

# Persist without the pandas index (committee_id already identifies each row),
# then report where the file went.
df.to_csv('committee_data.csv', index=False)
print("Dataset saved as 'committee_data.csv'")


Dataset saved as 'committee_data.csv'


In [43]:
import pandas as pd
import numpy as np

# Set seed for reproducibility (legacy global NumPy RNG; the order of the
# random draws below determines the generated values, so keep it stable)
np.random.seed(42)

# Define the number of committees
n_committees = 5000

# Committee ID (simple sequential numbers, 1..n_committees)
committee_ids = np.arange(1, n_committees + 1)

# Committee members: normal distribution (mean 30, sd 15), truncated to integers
committee_members = np.random.normal(loc=30, scale=15, size=n_committees).astype(int)
# Clip the values to the range [3, 100]
committee_members = np.clip(committee_members, 3, 100)

# Recent posts (Gamma distribution, right-skewed)
shape = 2.0  # Shape parameter of the Gamma distribution
scale = 10   # Scale parameter; the unclipped mean is shape * scale
recent_posts = np.random.gamma(shape, scale, size=n_committees)
# Clip the values to the range [3.5, 60]
recent_posts = np.clip(recent_posts, 3.5, 60)

# Events: baseline count from a normal distribution (mean 15, sd 5),
# truncated to integers. The range limit is applied further down, once the
# member-dependent adjustment has been added.
events = np.random.normal(loc=15, scale=5, size=n_committees).astype(int)

# User rating: normal distribution centered on 7.0 (sd 2.0), one decimal place
user_rating = np.random.normal(loc=7.0, scale=2.0, size=n_committees).round(1)
user_rating = np.clip(user_rating, 1.0, 10.0)  # Ensure values between 1.0 and 10.0

# Introduce correlation between committee members and events:
# committees with more than 30 members get extra events, smaller ones fewer.
events = events + (committee_members - 30) // 3
# Clip AFTER the adjustment. Clipping before it (as this cell previously did)
# let the final column leave the intended range and even go negative
# (e.g. baseline 5 with 3 members -> 5 + (-9) = -4 events).
events = np.clip(events, 5, 30)  # Ensure final values between 5 and 30

# Create DataFrame
df = pd.DataFrame({
    'committee_id': committee_ids,
    'committee_members': committee_members,
    'recent_posts': recent_posts,
    'events': events,
    'user_rating': user_rating
})

# Saving the dataset to a CSV file
df.to_csv('committee_data_realistic.csv', index=False)

# Confirming the file is saved
print("Dataset saved as 'committee_data_realistic.csv'")


Dataset saved as 'committee_data_realistic.csv'
