Creating Synthetic DataSet

In [1]:
import numpy as np
import pandas as pd

Defining number of data points required for analysis

In [2]:
#Seed for reproducibility
np.random.seed(42)

#Defining Constant for Data generation
n_customers=10000

In [3]:
#Generating synthetic customer data
Customer_Id=np.arange(1,n_customers+1)
Age=np.random.randint(20,65,n_customers)
Income=np.random.randint(20000,200000,n_customers)
Credit_Score=np.random.randint(300,850,n_customers)
Region=np.random.choice(['Urban','Rural','Semi-urban'],n_customers)
Gender=np.random.choice(['Male','Female'],n_customers)
Loan_Amount=np.random.randint(5000,500000,n_customers)
Loan_Tenure=np.random.randint(1,12,n_customers)
Default=np.random.choice([0,1],n_customers,p=[0.9,0.1])

In [4]:
#Creating Dataframe
Df_Customers=pd.DataFrame({'Customer_Id':Customer_Id,'Age':Age,'Income':Income,'Credit_Score':Credit_Score,
                           'Region':Region,'Gender':Gender,'Loan_Amount':Loan_Amount,'Loan_Tenure':Loan_Tenure,'Default':Default})

In [5]:
#Displaying Sample Data
Df_Customers.head()

Unnamed: 0,Customer_Id,Age,Income,Credit_Score,Region,Gender,Loan_Amount,Loan_Tenure,Default
0,1,58,78182,351,Semi-urban,Female,259581,8,0
1,2,48,153873,411,Urban,Male,226689,7,0
2,3,34,190821,594,Urban,Female,94217,11,1
3,4,62,75654,488,Urban,Female,414806,7,0
4,5,27,75381,364,Urban,Male,49391,7,0


In [6]:
#Adding new columns in existing data for further analysis
Df_Customers['Repayment_time']=np.random.randint(1,12,n_customers)
Df_Customers['Loan_purpose']=np.random.choice(['Personal','Business','Education','Medical'],n_customers)

#Adding Gold Price Fluctuation data
Df_Customers['Gold_price_fluctuation']=np.round(np.random.uniform(-10,10,n_customers),2)
Df_Customers['Gold_purity']=np.round(np.random.uniform(18,24,n_customers),2)

#Adding operational efficiency-related data
Df_Customers['Branch_location']=np.random.choice(['Urban','Rural','Semi_urban'],n_customers)
Df_Customers['Processing_time']=np.random.randint(10,30,n_customers) #in minutes
Df_Customers['Responded_to_promotion']=np.random.choice([0,1],n_customers)
Df_Customers['Branch_size']=np.random.randint(3,10,n_customers) #number of employees
Df_Customers['Turnaround_time']=np.random.randint(1,48,n_customers) #in hours

In [7]:
Df_Customers.head(10)

Unnamed: 0,Customer_Id,Age,Income,Credit_Score,Region,Gender,Loan_Amount,Loan_Tenure,Default,Repayment_time,Loan_purpose,Gold_price_fluctuation,Gold_purity,Branch_location,Processing_time,Responded_to_promotion,Branch_size,Turnaround_time
0,1,58,78182,351,Semi-urban,Female,259581,8,0,9,Medical,-1.09,23.25,Semi_urban,26,0,4,46
1,2,48,153873,411,Urban,Male,226689,7,0,7,Education,-0.17,18.29,Rural,19,1,3,13
2,3,34,190821,594,Urban,Female,94217,11,1,1,Medical,6.83,20.13,Semi_urban,22,0,4,21
3,4,62,75654,488,Urban,Female,414806,7,0,9,Medical,6.38,23.16,Semi_urban,20,1,8,45
4,5,27,75381,364,Urban,Male,49391,7,0,6,Education,-9.95,21.73,Rural,10,1,3,43
5,6,40,78601,764,Urban,Female,434114,5,0,9,Business,-1.72,22.63,Urban,16,0,5,21
6,7,58,166154,557,Urban,Male,85991,8,0,2,Education,3.12,19.94,Rural,14,0,9,30
7,8,38,165720,630,Rural,Female,23238,4,0,11,Business,7.05,20.22,Rural,14,0,8,27
8,9,42,20364,800,Semi-urban,Male,452946,5,0,7,Business,-9.02,23.33,Urban,10,0,6,6
9,10,30,185640,509,Rural,Male,174104,3,0,8,Education,2.56,22.5,Rural,11,0,6,30


In [8]:
Df_Customers.shape

(10000, 18)

In [9]:
Df_Customers.columns

Index(['Customer_Id', 'Age', 'Income', 'Credit_Score', 'Region', 'Gender',
       'Loan_Amount', 'Loan_Tenure', 'Default', 'Repayment_time',
       'Loan_purpose', 'Gold_price_fluctuation', 'Gold_purity',
       'Branch_location', 'Processing_time', 'Responded_to_promotion',
       'Branch_size', 'Turnaround_time'],
      dtype='object')

In [10]:
Df_Customers.isnull().sum()

Customer_Id               0
Age                       0
Income                    0
Credit_Score              0
Region                    0
Gender                    0
Loan_Amount               0
Loan_Tenure               0
Default                   0
Repayment_time            0
Loan_purpose              0
Gold_price_fluctuation    0
Gold_purity               0
Branch_location           0
Processing_time           0
Responded_to_promotion    0
Branch_size               0
Turnaround_time           0
dtype: int64

In [11]:
Df_Customers.duplicated().sum()

0

In [12]:
# Saving Df_Customers DataFrame to CSV
Df_Customers.to_csv('Df_Customers.csv', index=False)
print("Df_Customers DataFrame saved to 'Df_Customers.csv'")


Df_Customers DataFrame saved to 'Df_Customers.csv'
