In [None]:
import pandas as pd
import numpy as np
import faker
import random
from datetime import datetime, timedelta
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CustomerGenerator:
    """
    Build a table of synthetic customers from Faker output plus random draws.
    """

    def __init__(self, num_customers=1000, seed=None):
        """
        Initialize the CustomerGenerator with a specified number of customers.

        :param num_customers: Number of customers to generate.
        :param seed: Optional seed. When given, this generator's random draws
            and its Faker instance are seeded so the output can be reproduced.
            ``None`` (default) keeps drawing from the shared ``random`` module.
        """
        self.fake = faker.Faker()
        self.num_customers = num_customers
        # Unseeded: keep using the module-level `random` functions. Seeded: use
        # a private generator so other code's random state is left alone.
        self._rng = random if seed is None else random.Random(seed)
        if seed is not None:
            self.fake.seed_instance(seed)

    @staticmethod
    def _email_token(text):
        """Lower-case ``text`` and keep only letters and digits (e.g. "O'Neil" -> "oneil")."""
        return "".join(ch for ch in text.lower() if ch.isalnum())

    def format_phone_number(self):
        """
        Format a phone number in the (XXX) XXX-XXXX format.

        The ten digits used are the LAST ten of the main number, so a leading
        country/trunk prefix such as "001" or "+1" does not end up as the area
        code, and anything after an "x" (an extension) is ignored. If Faker
        returns fewer than ten digits, the groups are filled with whatever
        digits are available.

        :return: Formatted phone number.
        """
        raw_number = self.fake.phone_number()
        # Drop an extension such as "x1234" before collecting digits.
        main_part = raw_number.lower().split("x", 1)[0]
        digits = "".join(ch for ch in main_part if ch.isdigit())
        # Digits in front of the last ten are a prefix, not part of the number.
        digits = digits[-10:]
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:10]}"

    def random_date(self, start_date, end_date, include_time=False):
        """
        Generate a random date between start_date and end_date (both inclusive).

        :param start_date: Start date in the format 'YYYY-MM-DD'.
        :param end_date: End date in the format 'YYYY-MM-DD'.
        :param include_time: When True, also add a random time of day to the
            chosen date. The default (False) returns midnight, as before.
        :return: Random date as a datetime object.
        :raises ValueError: If a date cannot be parsed or end_date is before
            start_date.
        """
        try:
            start_dt = datetime.strptime(start_date, "%Y-%m-%d")
            end_dt = datetime.strptime(end_date, "%Y-%m-%d")
        except ValueError as e:
            logging.error(f"Date parsing error: {e}")
            raise

        span_days = (end_dt - start_dt).days
        if span_days < 0:
            message = f"end_date {end_date} is earlier than start_date {start_date}"
            logging.error(f"Invalid date range: {message}")
            raise ValueError(message)

        chosen = start_dt + timedelta(days=self._rng.randint(0, span_days))
        if include_time:
            # 86399 seconds = 23:59:59, so the result stays on the chosen day.
            chosen += timedelta(seconds=self._rng.randint(0, 86399))
        return chosen

    def generate_customers(self):
        """
        Generate a DataFrame containing customer data.

        Columns: customer_id, name, email, phone, address, credit_score,
        join_date (datetime, ascending), last_update (string
        'YYYY-MM-DD HH:MM:SS') and customer_type.

        :return: DataFrame with customer data.
        """
        try:
            count = self.num_customers

            # Ask Faker for first and last names separately. Splitting a full
            # name on whitespace can pick up a title or suffix as the "first"
            # or "last" name and leak it into the e-mail address.
            first_names = [self.fake.first_name() for _ in range(count)]
            last_names = [self.fake.last_name() for _ in range(count)]
            names = [f"{first} {last}" for first, last in zip(first_names, last_names)]

            addresses = [self.fake.address().replace('\n', ', ') for _ in range(count)]

            email_providers = ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "aol.com"]
            emails = [
                f"{self._email_token(first)}.{self._email_token(last)}@{self._rng.choice(email_providers)}"
                for first, last in zip(first_names, last_names)
            ]

            phone_numbers = [self.format_phone_number() for _ in range(count)]

            # Sorted so that join dates never decrease as customer_id increases.
            join_dates = sorted(self.random_date("2017-01-01", "2022-12-31") for _ in range(count))

            # include_time=True so the HH:MM:SS part is not always 00:00:00.
            last_updates = [
                self.random_date("2023-01-01", "2023-12-31", include_time=True).strftime("%Y-%m-%d %H:%M:%S")
                for _ in range(count)
            ]

            credit_scores = [self._rng.randint(500, 850) for _ in range(count)]
            customer_ids = [f"C{index+1000:04d}" for index in range(count)]
            customer_types = self._rng.choices(["regular", "VIP"], k=count, weights=[0.8, 0.2])

            # Create DataFrame
            customers_df = pd.DataFrame({
                "customer_id": customer_ids,
                "name": names,
                "email": emails,
                "phone": phone_numbers,
                "address": addresses,
                "credit_score": credit_scores,
                "join_date": pd.to_datetime(join_dates),
                "last_update": last_updates,
                "customer_type": customer_types
            })

            logging.info("Customers generated successfully.")
            return customers_df

        except Exception as e:
            logging.error(f"Error generating customers: {e}")
            raise

# Example Usage:
# Build 1,000 synthetic customers. `customers_df` is reused by later cells
# (preview, CSV export and transaction generation).
customer_gen = CustomerGenerator(num_customers=1000)
customers_df = customer_gen.generate_customers()


2024-07-30 13:19:09,830 - INFO - Customers generated successfully.


In [None]:
# Show the generated customer table as this cell's output.
customers_df

Unnamed: 0,customer_id,name,email,phone,address,credit_score,join_date,last_update,customer_type
0,C1000,John Hood,john.hood@hotmail.com,(001) 349-5707,"092 Hannah Cape Apt. 377, Beverlyview, NV 47130",651,2017-01-03,2023-04-22 00:00:00,regular
1,C1001,Jennifer Odonnell,jennifer.odonnell@outlook.com,(001) 377-2435,"133 English Street, West Clayton, GU 35895",734,2017-01-04,2023-06-07 00:00:00,VIP
2,C1002,David Lopez,david.lopez@yahoo.com,(001) 321-7133,"4788 Luke Harbors, West Robertland, MT 67368",739,2017-01-05,2023-01-04 00:00:00,regular
3,C1003,Kristen Mcneil,kristen.mcneil@gmail.com,(437) 469-8181,"3546 Mcdonald Centers Suite 610, Cherylburgh, ...",798,2017-01-09,2023-10-15 00:00:00,regular
4,C1004,Danielle Clark,danielle.clark@gmail.com,(443) 234-8399,"3407 Elizabeth Shoals Apt. 378, Martinburgh, I...",536,2017-01-10,2023-09-21 00:00:00,regular
...,...,...,...,...,...,...,...,...,...
995,C1995,Michael Anderson,michael.anderson@gmail.com,(399) 409-6565,"74877 Jennifer Parkway Apt. 398, South Luis, O...",716,2022-12-25,2023-10-02 00:00:00,regular
996,C1996,Claire Alvarez,claire.alvarez@yahoo.com,(962) 676-6243,"6590 Davis Gardens Suite 270, Jonesstad, NM 38632",645,2022-12-26,2023-11-16 00:00:00,regular
997,C1997,Jesse Torres,jesse.torres@hotmail.com,(988) 378-6363,"Unit 7900 Box 4872, DPO AA 84224",589,2022-12-27,2023-12-30 00:00:00,regular
998,C1998,James Ward,james.ward@yahoo.com,(756) 850-7297,"000 John Way Apt. 826, Millsfort, FM 18829",641,2022-12-29,2023-10-27 00:00:00,regular


In [None]:
import os

# Where the customer table is written. The default is the original author's
# local path; set the CUSTOMERS_CSV_PATH environment variable to export
# somewhere else without editing this cell.
customers_csv_path = os.environ.get(
    "CUSTOMERS_CSV_PATH",
    r"C:\Users\sahill\Desktop\Big data\CapstoneProject\Data\Customers.csv",
)
customers_df.to_csv(customers_csv_path, index=False, header=True)

# Branches data

In [None]:
import pandas as pd
import random
import faker
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class BranchGenerator:
    """
    Build a table of synthetic bank branches placed in a fixed set of cities.
    """

    def __init__(self, num_branches=15, seed=None):
        """
        Initialize the BranchGenerator with a specified number of branches.

        :param num_branches: Number of branches to generate.
        :param seed: Optional seed. When given, this generator's random draws
            and its Faker instance are seeded so the output can be reproduced.
            ``None`` (default) keeps drawing from the shared ``random`` module.
        """
        self.fake = faker.Faker()
        self.num_branches = num_branches
        # Unseeded: keep using the module-level `random` functions. Seeded: use
        # a private generator so other code's random state is left alone.
        self._rng = random if seed is None else random.Random(seed)
        if seed is not None:
            self.fake.seed_instance(seed)

        # Candidate branch locations and the timezone label reported for each.
        self.location_timezone_mapping = {
            "New York": "EST",
            "Los Angeles": "PST",
            "Chicago": "CST",
            "Houston": "CST",
            "Phoenix": "MST",
            "Philadelphia": "EST",
            "San Antonio": "CST",
            "San Diego": "PST",
            "Dallas": "CST",
            "San Jose": "PST",
            "London": "GMT",
            "Berlin": "CET",
            "Paris": "CET",
            "Madrid": "CET",
            "Rome": "CET",
            "Vienna": "CET",
            "Sydney": "AEST",
            "Melbourne": "AEST",
            "Brisbane": "AEST",
            "Perth": "AWST"
        }

        # Currency reported for each location, so that a branch's currency
        # agrees with where it is (previously it was drawn at random).
        self.location_currency_mapping = {
            "New York": "USD",
            "Los Angeles": "USD",
            "Chicago": "USD",
            "Houston": "USD",
            "Phoenix": "USD",
            "Philadelphia": "USD",
            "San Antonio": "USD",
            "San Diego": "USD",
            "Dallas": "USD",
            "San Jose": "USD",
            "London": "GBP",
            "Berlin": "EUR",
            "Paris": "EUR",
            "Madrid": "EUR",
            "Rome": "EUR",
            "Vienna": "EUR",
            "Sydney": "AUD",
            "Melbourne": "AUD",
            "Brisbane": "AUD",
            "Perth": "AUD"
        }

    def generate_branches(self):
        """
        Generate a DataFrame containing branch data.

        Each branch gets a sequential id ("B0000", ...), a Faker city name
        followed by " Branch", a distinct location sampled from
        ``location_timezone_mapping``, that location's timezone, and that
        location's currency. A location missing from
        ``location_currency_mapping`` falls back to a random pick among
        USD/EUR/GBP/AUD.

        :return: DataFrame with branch data.
        :raises ValueError: If num_branches exceeds the number of locations.
        """
        try:
            # Locations are sampled without replacement, so there cannot be
            # more branches than locations.
            if self.num_branches > len(self.location_timezone_mapping):
                raise ValueError("Number of branches exceeds available locations")

            branch_ids = [f"B{index:04d}" for index in range(self.num_branches)]
            branch_names = [f"{self.fake.city()} Branch" for _ in range(self.num_branches)]
            branch_locations = self._rng.sample(list(self.location_timezone_mapping.keys()), self.num_branches)
            branch_timezones = [self.location_timezone_mapping[loc] for loc in branch_locations]

            fallback_currencies = ["USD", "EUR", "GBP", "AUD"]
            branch_currencies = [
                self.location_currency_mapping.get(loc) or self._rng.choice(fallback_currencies)
                for loc in branch_locations
            ]

            branches_df = pd.DataFrame({
                "branch_id": branch_ids,
                "name": branch_names,
                "location": branch_locations,
                "timezone": branch_timezones,
                "currency": branch_currencies
            })

            logging.info("Branches generated successfully.")
            return branches_df

        except ValueError as e:
            logging.error(f"ValueError: {e}")
            raise
        except Exception as e:
            logging.error(f"Error generating branches: {e}")
            raise

# Example Usage:
# Uses the default of 15 branches. `branches_df` is reused by later cells
# (CSV export and transaction generation).
branch_gen = BranchGenerator()
branches_df = branch_gen.generate_branches()
print(branches_df)


2024-07-30 13:19:13,282 - INFO - Branches generated successfully.


   branch_id                           name      location timezone currency
0      B0000              Keithmouth Branch      San Jose      PST      EUR
1      B0001             Nathanhaven Branch          Rome      CET      EUR
2      B0002             Melindafort Branch     San Diego      PST      GBP
3      B0003            New Ruthport Branch        London      GMT      USD
4      B0004             Wrightshire Branch        Madrid      CET      USD
5      B0005         Lake Lindaville Branch       Houston      CST      GBP
6      B0006          Port Hollyview Branch        Sydney     AEST      AUD
7      B0007            East Heather Branch  Philadelphia      EST      GBP
8      B0008            South Steven Branch        Berlin      CET      AUD
9      B0009  North Christopherhaven Branch         Perth     AWST      EUR
10     B0010          Port Frankbury Branch   Los Angeles      PST      GBP
11     B0011             Thomasburgh Branch      Brisbane     AEST      USD
12     B0012

In [None]:
import os

# Where the branch table is written. The default is the original author's
# local path; set the BRANCHES_CSV_PATH environment variable to export
# somewhere else without editing this cell.
branches_csv_path = os.environ.get(
    "BRANCHES_CSV_PATH",
    r"C:\Users\sahill\Desktop\Big data\CapstoneProject\Data\Branches.csv",
)
branches_df.to_csv(branches_csv_path, index=False, header=True)

# Transaction data

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.datasets import make_regression
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class TransactionGenerator:
    """
    Build synthetic transactions for existing customers and branches, then
    optionally inject nulls and duplicate rows.

    NOTE: a later cell in this notebook defines ``TransactionGenerator`` again
    (adding inactive-customer handling); once both cells have run, that later
    definition is the one in effect.
    """

    # Column order of the frame returned by generate_transactions().
    _COLUMNS = [
        "transaction_id", "customer_id", "branch_id", "channel",
        "transaction_type", "amount", "currency", "timestamp", "status",
    ]

    def __init__(self, num_transactions=10000, seed=None):
        """
        Initialize the TransactionGenerator with a specified number of transactions.

        :param num_transactions: Number of transactions to generate.
        :param seed: Optional seed. When given, all random draws made by this
            object use private, seeded generators so the output can be
            reproduced. ``None`` (default) keeps using the shared ``random``
            and ``numpy.random`` state.
        """
        self.num_transactions = num_transactions
        self._rng = random if seed is None else random.Random(seed)
        # None means "use numpy's global state" (also how pandas and
        # make_regression interpret random_state=None).
        self._np_state = None if seed is None else np.random.RandomState(seed)

    def _numpy_rng(self):
        """Return the object to draw numpy random numbers from."""
        return np.random if self._np_state is None else self._np_state

    def _build_amounts(self):
        """Return one amount per transaction: |regression target| clipped to [1, 10000], with 5% large outliers."""
        _, targets = make_regression(
            n_samples=self.num_transactions, n_features=1, noise=10, random_state=self._np_state
        )
        amounts = np.clip(np.abs(targets), 1, 10000)

        np_rng = self._numpy_rng()
        num_outliers = int(0.05 * len(amounts))
        outlier_positions = np_rng.choice(len(amounts), num_outliers, replace=False)
        amounts[outlier_positions] = np_rng.uniform(10000, 100000, size=num_outliers)
        return np.round(amounts, 2)

    def _build_status_values(self):
        """Return exactly num_transactions shuffled statuses (10% pending, 5% denied, rest completed)."""
        total = self.num_transactions
        num_pending = int(0.1 * total)
        num_denied = int(0.05 * total)
        # "completed" absorbs the rounding remainder BEFORE shuffling, so the
        # list always has `total` entries and the remainder is not stuck at
        # the end.
        num_completed = total - num_pending - num_denied
        statuses = ["completed"] * num_completed + ["pending"] * num_pending + ["denied"] * num_denied
        self._rng.shuffle(statuses)
        return statuses

    @staticmethod
    def _channel_options(timestamp):
        """Channels available at ``timestamp``; "branch" only Mon-Fri between 09:00 and 16:59."""
        if timestamp.weekday() < 5 and 9 <= timestamp.hour < 17:
            return ["ATM", "web", "mobile", "branch"]
        return ["ATM", "web", "mobile"]

    @staticmethod
    def _transaction_type_options(timestamp):
        """Transaction types to choose from for the hour of ``timestamp``."""
        hour = timestamp.hour
        if hour < 6:
            return ["withdrawal", "deposit"]
        if hour < 12:
            return ["payment", "deposit"]
        if hour < 18:
            return ["transfer", "withdrawal"]
        return ["payment", "withdrawal"]

    def generate_transactions(self, customers_df, branches_df):
        """
        Generate transactions based on customer and branch data.

        Timestamps start after 2018-01-01 and strictly increase; each step is
        1-120 minutes. A transaction is assigned to a random customer whose
        join date is not after the timestamp; if no customer has joined yet,
        that transaction is skipped. ``customers_df`` is not modified.

        :param customers_df: DataFrame containing customer information
            (needs "customer_id" and "join_date"; join dates may be strings).
        :param branches_df: DataFrame containing branch information
            (needs "branch_id" and "currency").
        :return: DataFrame containing generated transactions.
        :raises KeyError: If an expected column is missing.
        """
        try:
            join_dates = pd.to_datetime(customers_df["join_date"])
            branch_ids = branches_df["branch_id"].tolist()
            branch_currencies = branches_df.set_index("branch_id")["currency"].to_dict()

            # Customers sorted by join date: at any timestamp the eligible
            # customers are a prefix of this list, found by binary search
            # instead of filtering the whole frame on every iteration.
            roster = pd.DataFrame({
                "customer_id": customers_df["customer_id"].to_numpy(),
                "join_date": join_dates.to_numpy(),
            }).dropna(subset=["join_date"]).sort_values("join_date", kind="mergesort")
            roster_ids = roster["customer_id"].tolist()
            roster_joined = pd.DatetimeIndex(roster["join_date"])

            transaction_ids = [f"T{index:04d}" for index in range(5000, 5000+self.num_transactions)]
            amounts = self._build_amounts()
            status_values = self._build_status_values()

            rng = self._rng
            transaction_data = []
            transaction_timestamp = pd.Timestamp("2018-01-01")  # Starting point

            for idx in range(self.num_transactions):
                transaction_timestamp = (
                    transaction_timestamp
                    + timedelta(minutes=rng.randint(1, 60))
                    + timedelta(minutes=rng.randint(0, 60))
                )

                # Number of customers with join_date <= transaction_timestamp.
                eligible_count = int(roster_joined.searchsorted(transaction_timestamp, side="right"))
                if eligible_count == 0:
                    continue

                customer_id = roster_ids[rng.randrange(eligible_count)]
                branch_id = rng.choice(branch_ids)

                transaction_data.append({
                    "transaction_id": transaction_ids[idx],
                    "customer_id": customer_id,
                    "branch_id": branch_id,
                    "channel": rng.choice(self._channel_options(transaction_timestamp)),
                    "transaction_type": rng.choice(self._transaction_type_options(transaction_timestamp)),
                    "amount": amounts[idx],
                    "currency": branch_currencies[branch_id],
                    "timestamp": transaction_timestamp,
                    "status": status_values[idx]
                })

            # Explicit columns keep the schema even when no row was generated.
            transactions_df = pd.DataFrame(transaction_data, columns=self._COLUMNS)
            logging.info("Transactions generated successfully.")
            return transactions_df

        except KeyError as e:
            logging.error(f"Missing expected column: {e}")
            raise
        except Exception as e:
            logging.error(f"Error generating transactions: {e}")
            raise

    def add_null_values(self, transactions_df):
        """
        Introduce null values into specific columns of the transactions DataFrame.

        For each of amount/currency/channel a random share (below 3%) of the
        rows actually present is set to null; every row with a null in any of
        them gets status "pending". The frame is modified in place and
        returned.

        :param transactions_df: DataFrame containing transactions.
        :return: DataFrame with null values added.
        """
        columns_with_nulls = ["amount", "currency", "channel"]

        if transactions_df.empty:
            logging.info("No transactions available; no null values added.")
            return transactions_df

        np_rng = self._numpy_rng()
        # Based on the real row count: generate_transactions may return fewer
        # rows than num_transactions, and sampling without replacement fails
        # if more rows are requested than exist.
        row_count = len(transactions_df)

        for col in columns_with_nulls:
            null_prob = np_rng.uniform(0, 0.03)
            null_indices = np_rng.choice(transactions_df.index, int(null_prob * row_count), replace=False)
            transactions_df.loc[null_indices, col] = None

        has_null = transactions_df[columns_with_nulls].isna().any(axis=1)
        transactions_df.loc[has_null, "status"] = "pending"

        logging.info("Null values added successfully.")
        return transactions_df

    def add_duplicates(self, transactions_df):
        """
        Add duplicate transactions to the DataFrame.

        Copies of randomly chosen pending rows are appended with their status
        changed to "completed" or "denied". The number of copies is below 2%
        of num_transactions and at most the number of pending rows.

        :param transactions_df: DataFrame containing transactions.
        :return: DataFrame with duplicates added.
        """
        num_duplicates = int(self.num_transactions * self._numpy_rng().uniform(0, 0.02))

        if transactions_df.empty:
            logging.info("No transactions available; no duplicates added.")
            return transactions_df

        pending_transactions = transactions_df[transactions_df["status"] == "pending"]

        if num_duplicates > 0 and not pending_transactions.empty:
            sample_size = min(num_duplicates, len(pending_transactions))
            duplicates_to_add = pending_transactions.sample(
                n=sample_size, replace=True, random_state=self._np_state
            ).copy()
            duplicates_to_add["status"] = [
                self._rng.choice(["completed", "denied"]) for _ in range(len(duplicates_to_add))
            ]
            transactions_df = pd.concat([transactions_df, duplicates_to_add], ignore_index=True)

        logging.info("Duplicates added successfully.")
        return transactions_df



In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.datasets import make_regression
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class TransactionGenerator:
    """
    Build synthetic transactions for existing customers and branches (with a
    share of customers treated as inactive after a cutoff date), then
    optionally inject nulls and duplicate rows.
    """

    # Column order of the frame returned by generate_transactions().
    _COLUMNS = [
        "transaction_id", "customer_id", "branch_id", "channel",
        "transaction_type", "amount", "currency", "timestamp", "status",
    ]

    def __init__(self, num_transactions=10000, seed=None):
        """
        Initialize the TransactionGenerator with a specified number of transactions.

        :param num_transactions: Number of transactions to generate.
        :param seed: Optional seed. When given, all random draws made by this
            object use private, seeded generators so the output can be
            reproduced. ``None`` (default) keeps using the shared ``random``
            and ``numpy.random`` state.
        """
        self.num_transactions = num_transactions
        self._rng = random if seed is None else random.Random(seed)
        # None means "use numpy's global state" (also how pandas and
        # make_regression interpret random_state=None).
        self._np_state = None if seed is None else np.random.RandomState(seed)

    def _numpy_rng(self):
        """Return the object to draw numpy random numbers from."""
        return np.random if self._np_state is None else self._np_state

    def _build_amounts(self):
        """Return one amount per transaction: |regression target| clipped to [1, 10000], with 5% large outliers."""
        _, targets = make_regression(
            n_samples=self.num_transactions, n_features=1, noise=10, random_state=self._np_state
        )
        amounts = np.clip(np.abs(targets), 1, 10000)

        np_rng = self._numpy_rng()
        num_outliers = int(0.05 * len(amounts))
        outlier_positions = np_rng.choice(len(amounts), num_outliers, replace=False)
        amounts[outlier_positions] = np_rng.uniform(10000, 100000, size=num_outliers)
        return np.round(amounts, 2)

    def _build_status_values(self):
        """Return exactly num_transactions shuffled statuses (10% pending, 5% denied, rest completed)."""
        total = self.num_transactions
        num_pending = int(0.1 * total)
        num_denied = int(0.05 * total)
        # "completed" absorbs the rounding remainder. Truncating each share
        # separately can leave the list shorter than num_transactions (e.g.
        # 7 -> 5 + 0 + 0), which made status_values[idx] raise IndexError.
        num_completed = total - num_pending - num_denied
        statuses = ["completed"] * num_completed + ["pending"] * num_pending + ["denied"] * num_denied
        self._rng.shuffle(statuses)
        return statuses

    @staticmethod
    def _customer_pool(roster):
        """Split a join-date-sorted roster into (customer ids, DatetimeIndex of join dates)."""
        return roster["customer_id"].tolist(), pd.DatetimeIndex(roster["join_date"])

    @staticmethod
    def _channel_options(timestamp):
        """Channels available at ``timestamp``; "branch" only Mon-Fri between 09:00 and 16:59."""
        if timestamp.weekday() < 5 and 9 <= timestamp.hour < 17:
            return ["ATM", "web", "mobile", "branch"]
        return ["ATM", "web", "mobile"]

    @staticmethod
    def _transaction_type_options(timestamp):
        """Transaction types to choose from for the hour of ``timestamp``."""
        hour = timestamp.hour
        if hour < 6:
            return ["withdrawal", "deposit"]
        if hour < 12:
            return ["payment", "deposit"]
        if hour < 18:
            return ["transfer", "withdrawal"]
        return ["payment", "withdrawal"]

    def generate_transactions(self, customers_df, branches_df, inactive_customer_ratio=0.1,
                              start_timestamp="2018-01-01", inactive_cutoff_date="2024-05-30"):
        """
        Generate transactions based on customer and branch data, with some customers marked as inactive.

        Timestamps start after ``start_timestamp`` and strictly increase; each
        step is 1-120 minutes. A transaction is assigned to a random customer
        whose join date is not after the timestamp and who is either active or
        transacting on/before ``inactive_cutoff_date``; if there is no such
        customer the transaction is skipped. ``customers_df`` is not modified
        (no "is_active" column is added to it).

        NOTE: with the default start, cutoff and 10000 transactions, the last
        timestamp is no later than mid-April 2020 (10000 steps of at most 120
        minutes), so the cutoff is never reached and inactive customers are
        not filtered out. Pass a later ``start_timestamp`` or an earlier
        ``inactive_cutoff_date`` for the inactive flag to have an effect.

        :param customers_df: DataFrame containing customer information
            (needs "customer_id" and "join_date"; join dates may be strings).
        :param branches_df: DataFrame containing branch information
            (needs "branch_id" and "currency").
        :param inactive_customer_ratio: Ratio of customers to be marked as inactive.
        :param start_timestamp: Point in time the first transaction follows.
        :param inactive_cutoff_date: Last moment at which inactive customers
            can still transact.
        :return: DataFrame containing generated transactions.
        :raises KeyError: If an expected column is missing.
        :raises ValueError: If inactive_customer_ratio is outside [0, 1].
        """
        try:
            if not 0 <= inactive_customer_ratio <= 1:
                raise ValueError("inactive_customer_ratio must be between 0 and 1")

            join_dates = pd.to_datetime(customers_df["join_date"])
            branch_ids = branches_df["branch_id"].tolist()
            branch_currencies = branches_df.set_index("branch_id")["currency"].to_dict()

            transaction_ids = [f"T{index:04d}" for index in range(5000, 5000+self.num_transactions)]
            amounts = self._build_amounts()
            status_values = self._build_status_values()

            rng = self._rng
            customer_ids = customers_df["customer_id"].tolist()
            num_inactive_customers = int(len(customer_ids) * inactive_customer_ratio)
            # A set gives constant-time membership checks.
            inactive_customers = set(rng.sample(customer_ids, num_inactive_customers))

            # Customers sorted by join date: at any timestamp the eligible
            # customers are a prefix of the list, found by binary search
            # instead of filtering the whole frame on every iteration. Built
            # as a separate frame so the caller's customers_df stays untouched.
            roster = pd.DataFrame({
                "customer_id": customers_df["customer_id"].to_numpy(),
                "join_date": join_dates.to_numpy(),
            }).dropna(subset=["join_date"]).sort_values("join_date", kind="mergesort")
            is_active = ~roster["customer_id"].isin(list(inactive_customers))

            all_ids, all_joined = self._customer_pool(roster)
            active_ids, active_joined = self._customer_pool(roster[is_active])

            cutoff = pd.Timestamp(inactive_cutoff_date)
            transaction_timestamp = pd.Timestamp(start_timestamp)
            transaction_data = []

            for idx in range(self.num_transactions):
                transaction_timestamp = (
                    transaction_timestamp
                    + timedelta(minutes=rng.randint(1, 60))
                    + timedelta(minutes=rng.randint(0, 60))
                )

                # Up to the cutoff everyone may transact; afterwards only
                # active customers.
                if transaction_timestamp <= cutoff:
                    pool_ids, pool_joined = all_ids, all_joined
                else:
                    pool_ids, pool_joined = active_ids, active_joined

                # Number of pool customers with join_date <= transaction_timestamp.
                eligible_count = int(pool_joined.searchsorted(transaction_timestamp, side="right"))
                if eligible_count == 0:
                    continue

                customer_id = pool_ids[rng.randrange(eligible_count)]
                branch_id = rng.choice(branch_ids)

                transaction_data.append({
                    "transaction_id": transaction_ids[idx],
                    "customer_id": customer_id,
                    "branch_id": branch_id,
                    "channel": rng.choice(self._channel_options(transaction_timestamp)),
                    "transaction_type": rng.choice(self._transaction_type_options(transaction_timestamp)),
                    "amount": amounts[idx],
                    "currency": branch_currencies[branch_id],
                    "timestamp": transaction_timestamp,
                    "status": status_values[idx]
                })

            # Explicit columns keep the schema even when no row was generated.
            transactions_df = pd.DataFrame(transaction_data, columns=self._COLUMNS)
            logging.info("Transactions generated successfully.")
            return transactions_df

        except KeyError as e:
            logging.error(f"Missing expected column: {e}")
            raise
        except Exception as e:
            logging.error(f"Error generating transactions: {e}")
            raise

    def add_null_values(self, transactions_df):
        """
        Introduce null values into specific columns of the transactions DataFrame.

        For each of amount/currency/channel a random share (below 3%) of the
        rows actually present is set to null; every row with a null in any of
        them gets status "pending". The frame is modified in place and
        returned.

        :param transactions_df: DataFrame containing transactions.
        :return: DataFrame with null values added.
        """
        columns_with_nulls = ["amount", "currency", "channel"]

        if transactions_df.empty:
            logging.info("No transactions available; no null values added.")
            return transactions_df

        np_rng = self._numpy_rng()
        # Based on the real row count: generate_transactions may return fewer
        # rows than num_transactions, and sampling without replacement fails
        # if more rows are requested than exist.
        row_count = len(transactions_df)

        for col in columns_with_nulls:
            null_prob = np_rng.uniform(0, 0.03)
            null_indices = np_rng.choice(transactions_df.index, int(null_prob * row_count), replace=False)
            transactions_df.loc[null_indices, col] = None

        has_null = transactions_df[columns_with_nulls].isna().any(axis=1)
        transactions_df.loc[has_null, "status"] = "pending"

        logging.info("Null values added successfully.")
        return transactions_df

    def add_duplicates(self, transactions_df):
        """
        Add duplicate transactions to the DataFrame.

        Copies of randomly chosen pending rows are appended with their status
        changed to "completed" or "denied". The number of copies is below 2%
        of num_transactions and at most the number of pending rows.

        :param transactions_df: DataFrame containing transactions.
        :return: DataFrame with duplicates added.
        """
        num_duplicates = int(self.num_transactions * self._numpy_rng().uniform(0, 0.02))

        if transactions_df.empty:
            logging.info("No transactions available; no duplicates added.")
            return transactions_df

        pending_transactions = transactions_df[transactions_df["status"] == "pending"]

        if num_duplicates > 0 and not pending_transactions.empty:
            sample_size = min(num_duplicates, len(pending_transactions))
            duplicates_to_add = pending_transactions.sample(
                n=sample_size, replace=True, random_state=self._np_state
            ).copy()
            duplicates_to_add["status"] = [
                self._rng.choice(["completed", "denied"]) for _ in range(len(duplicates_to_add))
            ]
            transactions_df = pd.concat([transactions_df, duplicates_to_add], ignore_index=True)

        logging.info("Duplicates added successfully.")
        return transactions_df



In [None]:
# Run the transaction pipeline in order: generate the base transactions, blank
# out some amount/currency/channel values (those rows become "pending"), then
# append re-statused copies of pending rows.
data_gen = TransactionGenerator()
transactions_df = data_gen.generate_transactions(customers_df, branches_df)
transactions_df = data_gen.add_null_values(transactions_df)
transactions_df = data_gen.add_duplicates(transactions_df)

2024-07-30 14:04:38,771 - INFO - Transactions generated successfully.
2024-07-30 14:04:38,778 - INFO - Null values added successfully.
2024-07-30 14:04:38,778 - INFO - Duplicates added successfully.


In [None]:
# Show the final transaction table (including injected nulls and duplicates)
# as this cell's output.
transactions_df

Unnamed: 0,transaction_id,customer_id,branch_id,channel,transaction_type,amount,currency,timestamp,status
0,T5000,C1087,B0012,ATM,withdrawal,11.13,AUD,2018-01-01 01:23:00,completed
1,T5001,C1056,B0003,ATM,withdrawal,9.13,USD,2018-01-01 02:24:00,completed
2,T5002,C1007,B0001,ATM,deposit,8.64,EUR,2018-01-01 03:28:00,completed
3,T5003,C1072,B0004,ATM,deposit,14.74,USD,2018-01-01 04:44:00,completed
4,T5004,C1120,B0014,web,withdrawal,193.38,EUR,2018-01-01 05:36:00,completed
...,...,...,...,...,...,...,...,...,...
10022,T14152,C1113,B0011,mobile,withdrawal,229.75,USD,2019-01-20 20:59:00,denied
10023,T9421,C1150,B0006,web,withdrawal,24.41,AUD,2018-07-05 04:39:00,denied
10024,T8261,C1117,B0002,,withdrawal,62.45,GBP,2018-05-18 04:21:00,completed
10025,T13133,C1266,B0008,ATM,deposit,20.10,AUD,2018-12-09 07:15:00,completed
