## Synthetic Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

In [3]:
# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic dataset with 20 features

n_samples = 1000
n_features = 20

X,y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=2,
                          n_clusters_per_class=1, n_informative=15,
                          random_state=123, n_redundant=5)

# Create pandas Dataframe

cols = ['feat_' + str(i) for i in range(n_features)]

df = pd.DataFrame(X, columns=cols)
df['target'] = y

In [4]:
# Add outlier values

num_outliers = 50
outlier_indices = np.random.choice(n_samples, size = num_outliers, replace =False)

for i in outlier_indices:
    df.iloc[i] = np.random.randint(0,10, size=len(df.columns))
    
# Add missing values

num_missing = 80
missing_indices = np.random.choice(n_samples, size=num_missing, replace=False)
df.iloc[missing_indices] = np.nan

# Shuffle rows

df = df.sample(frac=1).reset_index(drop=True)

print(df.head())

     feat_0    feat_1    feat_2    feat_3    feat_4    feat_5    feat_6  \
0  6.595236  1.485321  2.105253  0.146850  2.186080  0.465362  1.012486   
1 -1.128619  0.023400  3.150784  2.738828  0.891263  0.807017 -0.036696   
2       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
3  9.000000  2.000000  5.000000  9.000000  9.000000  8.000000  0.000000   
4  0.245776  1.237750  0.434717 -6.314769  3.512425 -2.768056  0.334099   

     feat_7    feat_8    feat_9  ...   feat_11   feat_12   feat_13   feat_14  \
0  0.239239  4.174100  2.641864  ...  0.972253  5.718481 -0.842036 -0.724342   
1  4.038607  0.248517  4.131011  ...  0.934878 -2.435948 -2.831492  1.920735   
2       NaN       NaN       NaN  ...       NaN       NaN       NaN       NaN   
3  4.000000  3.000000  7.000000  ...  8.000000  5.000000  3.000000  8.000000   
4 -0.409632  2.676267 -1.091282  ...  0.118257  4.478975  6.368008  0.410060   

    feat_15   feat_16   feat_17   feat_18   feat_19  target  
0 -2.4

In [5]:
file_name = 'churn_data.csv'
df.to_csv(file_name, index=False)

  values = values.astype(str)


## Dummy Dataset

In [7]:
# Generate dummy data
np.random.seed(123) 
n_customers = 5000
data = pd.DataFrame({'CustomerID': range(n_customers)})

# Add dummy features
data['Gender'] = np.random.choice(['Male','Female'], size=n_customers) 
data['Age'] = np.random.randint(18, 65, size=n_customers)
data['Tenure'] = np.random.randint(0, 5, size=n_customers)
data['Balance'] = np.random.normal(5000, 2000, n_customers)
 
# Add target column
data['Target'] = np.random.choice([0,1], size=n_customers, p=[0.7,0.3])

# Add outliers
data.loc[data.sample(50).index, 'Balance'] = np.random.randint(10000, 100000)

# Add missing values
data.loc[data.sample(100).index, 'Age'] = np.nan

# Print first few records
print(data.head())

   CustomerID  Gender   Age  Tenure      Balance  Target
0           0    Male  43.0       4  5739.740763       0
1           1  Female  44.0       3  5117.625168       1
2           2    Male  58.0       0  5781.114931       0
3           3    Male  19.0       4  6298.895016       0
4           4    Male  40.0       1  5972.512224       0


In [8]:
filename = 'churn_dataset.csv'

data.to_csv(filename, index=False)

  values = values.astype(str)
