# **Install the necessary libraries**

Make sure the Python libraries we need to create and manipulate the data are installed. We'll use scikit-learn, pandas, and numpy.

In [None]:
pip install pandas numpy scikit-learn




# **Import Libraries**

First, let's set up our Python environment by importing the necessary libraries.

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


# Synthetic Customer Data Generation

Now, we will generate basic synthetic data for 5000 customer records. The features will include:

1. **CustomerID**: A unique identifier for each customer.
2. **Age**: A random age between 18 and 70.
3. **Gender**: A binary categorical feature ('Male', 'Female').
4. **ContractType**: Three types ('Month-to-month', 'One year', 'Two year').
5. **MonthlyCharges** and **TotalCharges**: MonthlyCharges is randomly generated between $20 and $120; TotalCharges is computed as MonthlyCharges × Tenure.
6. **TechSupport**: Binary feature ('Yes', 'No').
7. **InternetService**: Categorical feature ('DSL', 'Fiber optic', 'No').
8. **Tenure**: Randomly generated between 0 and 72 months.
9. **PaperlessBilling**: Binary feature ('Yes', 'No').
10. **PaymentMethod**: Four types ('Electronic check', 'Mailed check', 'Bank transfer', 'Credit card').
11. **Churn**: Binary target variable ('Yes', 'No') with approximately 20% churn rate.

In [None]:
# Build a synthetic customer-churn dataset as a pandas DataFrame named `df`.
# Every column except TotalCharges is sampled on its own below; TotalCharges
# is derived from MonthlyCharges and Tenure.
num_customers = 5000

# Fixed seed so that re-running the notebook regenerates the same dataset.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Unique identifiers CUST_1 ... CUST_<num_customers>
customer_ids = [f'CUST_{i}' for i in range(1, num_customers + 1)]

# Age between 18 and 70 (the upper bound of `integers` is exclusive)
ages = rng.integers(18, 71, num_customers)

# Gender, both values equally likely
genders = rng.choice(['Male', 'Female'], num_customers)

# ContractType, weighted towards month-to-month contracts
contract_types = rng.choice(
    ['Month-to-month', 'One year', 'Two year'],
    num_customers,
    p=[0.6, 0.2, 0.2],
)

# MonthlyCharges between $20 and $120, rounded to cents
monthly_charges = np.round(rng.uniform(20, 120, num_customers), 2)

# Tenure between 0 and 72 months (the upper bound is exclusive)
tenure = rng.integers(0, 73, num_customers)

# TotalCharges = MonthlyCharges * Tenure, rounded to cents.
# A tenure of 0 months therefore gives a TotalCharges of 0.
total_charges = np.round(monthly_charges * tenure, 2)

# TechSupport: 30% 'Yes', 70% 'No'
tech_support = rng.choice(['Yes', 'No'], num_customers, p=[0.3, 0.7])

# InternetService: 40% DSL, 40% Fiber optic, 20% no internet service
internet_service = rng.choice(
    ['DSL', 'Fiber optic', 'No'],
    num_customers,
    p=[0.4, 0.4, 0.2],
)

# PaperlessBilling: 70% 'Yes', 30% 'No'
paperless_billing = rng.choice(['Yes', 'No'], num_customers, p=[0.7, 0.3])

# PaymentMethod, all four values equally likely
payment_methods = rng.choice(
    ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'],
    num_customers,
)

# Churn target with an expected churn rate of about 20%
churn = rng.choice(['Yes', 'No'], num_customers, p=[0.2, 0.8])

df = pd.DataFrame({
    'CustomerID': customer_ids,
    'Age': ages,
    'Gender': genders,
    'ContractType': contract_types,
    'MonthlyCharges': monthly_charges,
    'TotalCharges': total_charges,
    'TechSupport': tech_support,
    'InternetService': internet_service,
    'Tenure': tenure,
    'PaperlessBilling': paperless_billing,
    'PaymentMethod': payment_methods,
    'Churn': churn,
})

# Shuffle the rows so CustomerIDs are no longer in ascending order, then
# rebuild a clean 0..n-1 index. Seeded so the row order is reproducible too.
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

df.head()


Unnamed: 0,CustomerID,Age,Gender,ContractType,MonthlyCharges,TotalCharges,TechSupport,InternetService,Tenure,PaperlessBilling,PaymentMethod,Churn
0,CUST_249,58,Female,Month-to-month,50.95,3617.45,No,No,71,Yes,Bank transfer,No
1,CUST_1890,22,Female,One year,47.01,235.05,Yes,Fiber optic,5,Yes,Electronic check,Yes
2,CUST_1614,62,Female,Month-to-month,22.58,0.0,Yes,Fiber optic,0,Yes,Bank transfer,No
3,CUST_2869,42,Male,Month-to-month,55.08,110.16,No,Fiber optic,2,No,Bank transfer,No
4,CUST_3770,66,Male,One year,43.12,474.32,Yes,DSL,11,Yes,Bank transfer,Yes


# **Save and Download the Dataset**

In [None]:
# Write the dataset to a CSV file in the current working directory,
# without the DataFrame index column.
output_path = 'customer_churn_dataset.csv'
df.to_csv(output_path, index=False)

# Download the file
# google.colab can only be imported inside Google Colab. Anywhere else the
# CSV is already on disk, so report its location instead of failing.
try:
    from google.colab import files
except ImportError:
    print(f"Saved dataset to {output_path} (not running in Colab, skipping download).")
else:
    files.download(output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>