In [None]:
!pip install faker
import random

import numpy as np
import pandas as pd
from faker import Faker

# Faker instance that supplies the generated names, e-mails, dates, etc.
fake = Faker()

# User Table: one row per account, with IDs running from U0001 to U1500.
user_columns = ['User_ID', 'Username', 'Subscription_Plan', 'Email', 'Country', 'Date_of_Birth', 'Number_of_Devices']
subscription_plans = ['Basic', 'Standard', 'Premium']
data_users = [
    [
        f'U{n:04d}',                      # zero-padded to four digits
        fake.user_name(),
        random.choice(subscription_plans),
        fake.email(),
        fake.country(),
        fake.date_of_birth(minimum_age=18, maximum_age=80),
        random.randint(1, 5),             # number of devices, 1-5 inclusive
    ]
    for n in range(1, 1501)
]
users_df = pd.DataFrame(data_users, columns=user_columns)

# Content Table: 500 entries with IDs running from CID0001 to CID0500.
genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi']
data_content = []
for content_id in (f'CID{n:04d}' for n in range(1, 501)):
    row = [
        content_id,
        fake.sentence(nb_words=3),     # short generated sentence used as the title
        random.choice(genres),
        random.randint(1950, 2024),    # release year, inclusive bounds
        random.randint(30, 180),       # duration, inclusive bounds
    ]
    data_content.append(row)
content_df = pd.DataFrame(
    data_content,
    columns=['Content_ID', 'Title', 'Genre', 'Release_Year', 'Duration'],
)

# Watch History Table: 1,000 events, each pairing a random user ID
# (U0001-U1500) with a random content ID (CID0001-CID0500).
completion_statuses = ['Not Started', 'In Progress', 'Completed']
data_history = [
    [
        f'U{random.randint(1, 1500):04d}',
        f'CID{random.randint(1, 500):04d}',
        random.choice(completion_statuses),
        random.randint(1, 200),            # minutes watched, 1-200 inclusive
    ]
    for _ in range(1000)
]
watch_history_df = pd.DataFrame(
    data_history,
    columns=['User_ID', 'Content_ID', 'Completion_Status', 'Minutes_Watched'],
)

# Transactions Table: exactly one transaction per user ID U0001-U1500.
transaction_statuses = ['Paid', 'Pending', 'Failed']
amounts = [5.99, 9.99, 14.99]
payment_methods = ['Credit Card', 'PayPal', 'Bank Transfer', 'Google Pay', 'Apple Pay', 'Cryptocurrency']
data_transactions = [
    [
        f'U{n:04d}',
        fake.date_between(start_date='-3y', end_date='today'),  # within the last three years
        random.choice(transaction_statuses),
        random.choice(amounts),
        random.choice(payment_methods),
    ]
    for n in range(1, 1501)
]
transactions_df = pd.DataFrame(
    data_transactions,
    columns=['User_ID', 'Transaction_Date', 'Transaction_Status', 'Amount', 'Payment_Method'],
)


#Missing and Duplicate Data
# Blank out `missing_count` randomly chosen rows in every column of each table
# (50 for users, 20 for content, 30 for watch history, 40 for transactions).
# Rows are drawn independently per column, so the gaps do not line up across columns.
for df, missing_count in zip([users_df, content_df, watch_history_df, transactions_df], [50, 20, 30, 40]):
    # With replace=False, asking for more rows than the table holds raises
    # ValueError, so never request more than are available.
    n_missing = min(missing_count, len(df))
    for col in df.columns:
        # Integer columns cannot store NaN. Upcast them to float explicitly
        # (the dtype the assignment would produce anyway) because newer pandas
        # versions deprecate the silent int -> float upcast on assignment.
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = df[col].astype('float64')
        missing_rows = np.random.choice(df.index, n_missing, replace=False)
        df.loc[missing_rows, col] = np.nan


# Append randomly sampled existing rows to each table as exact duplicates
# (20 users, 10 content items, 15 watch-history rows, 25 transactions).
# ignore_index=True renumbers the result so the index stays 0..n-1.
duplicate_users = users_df.sample(n=20, replace=False)
users_df = pd.concat([users_df, duplicate_users], ignore_index=True)

duplicate_content = content_df.sample(n=10, replace=False)
content_df = pd.concat([content_df, duplicate_content], ignore_index=True)

duplicate_history = watch_history_df.sample(n=15, replace=False)
watch_history_df = pd.concat([watch_history_df, duplicate_history], ignore_index=True)

duplicate_transactions = transactions_df.sample(n=25, replace=False)
transactions_df = pd.concat([transactions_df, duplicate_transactions], ignore_index=True)


# Show the first five rows of every table as a quick sanity check.
for table in (users_df, content_df, watch_history_df, transactions_df):
    print(table.head())


# Write each table to its own CSV file in the current working directory,
# omitting the DataFrame index column.
csv_outputs = {
    'users.csv': users_df,
    'content.csv': content_df,
    'watch_history.csv': watch_history_df,
    'transactions.csv': transactions_df,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)

print("CSV files successfully created.")
