In [1]:
# %pip --quiet install pyarrow pandas ydata-profiling scikit-learn

# Step 1: Dataset Selection

In [2]:
import pandas as pd

# Load the customer churn dataset from the CSV file shipped alongside the notebook.
try:
    df = pd.read_csv("dataset/Customer Churn.csv")
except FileNotFoundError as e:
    # Report the missing file, then stop here: every later cell needs `df`,
    # so continuing would only fail further down with a confusing NameError.
    print(e)
    raise

# Preview ten randomly chosen rows.
display(df.sample(10))

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
df.info()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
2644,5,0,20,1,1400,22,14,2,2,1,1,25,126.99,0
1544,1,0,16,1,1990,30,42,27,2,1,1,25,279.9,0
2117,7,0,40,0,1015,20,20,10,3,1,2,30,121.4,0
1344,2,0,22,0,1650,30,36,18,2,1,1,25,237.6,0
1105,9,0,38,1,3530,79,30,28,3,1,1,30,264.36,0
2587,15,0,38,3,3685,69,203,40,2,1,1,25,1082.43,0
105,8,0,36,0,3715,79,27,26,3,1,1,30,259.76,0
287,11,0,42,1,3040,63,190,29,2,1,1,25,994.635,0
1526,8,0,33,0,3285,45,23,29,2,1,2,25,253.35,1
1283,19,0,25,1,15690,230,0,74,2,1,1,25,716.4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Call Failure             3150 non-null   int64  
 1   Complains                3150 non-null   int64  
 2   Subscription Length      3150 non-null   int64  
 3   Charge Amount            3150 non-null   int64  
 4   Seconds of Use           3150 non-null   int64  
 5   Frequency of use         3150 non-null   int64  
 6   Frequency of SMS         3150 non-null   int64  
 7   Distinct Called Numbers  3150 non-null   int64  
 8   Age Group                3150 non-null   int64  
 9   Tariff Plan              3150 non-null   int64  
 10  Status                   3150 non-null   int64  
 11  Age                      3150 non-null   int64  
 12  Customer Value           3150 non-null   float64
 13  Churn                    3150 non-null   int64  
dtypes: float64(1), int64(13)

None

# Step 2: Dataset Schema and Storage

In [3]:
import pyarrow as pa
import pyarrow.parquet as pq

# Arrow schema for the churn data, using the dataset's exact column names.
# Each entry is a (column name, Arrow type) pair.
schema = pa.schema(
    [
        ("Call Failure", pa.int64()),
        ("Complains", pa.int64()),
        ("Subscription Length", pa.int64()),
        ("Charge Amount", pa.int64()),
        ("Seconds of Use", pa.int64()),
        ("Frequency of use", pa.int64()),
        ("Frequency of SMS", pa.int64()),
        ("Distinct Called Numbers", pa.int64()),
        ("Age Group", pa.int64()),
        ("Tariff Plan", pa.int64()),
        ("Status", pa.int64()),
        ("Age", pa.int64()),
        ("Customer Value", pa.float64()),
        ("Churn", pa.int64()),
    ]
)

# Build an Arrow table from the DataFrame, typed according to the schema above.
table = pa.Table.from_pandas(df, schema=schema)

# Destination of the Parquet copy of the dataset (reused by later cells).
PARQUET_PATH = "dataset/customer_churn.parquet"

# Persist the typed table to disk in Parquet format.
pq.write_table(table, where=PARQUET_PATH)

print(f"Dataset schema defined and stored in: {PARQUET_PATH}")


Dataset schema defined and stored in: dataset/customer_churn.parquet


In [4]:
# Load the stored Parquet file back into a Pandas DataFrame.
df_parquet = pd.read_parquet(path=PARQUET_PATH)

# Show ten randomly sampled rows of the reloaded data.
display(df_parquet.sample(n=10))

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
1542,9,0,23,1,1520,36,33,26,2,1,1,25,218.52,0
2021,3,0,35,0,6413,69,101,35,2,1,1,25,746.19,0
1666,0,0,34,0,455,2,0,5,2,1,2,25,20.565,0
2444,5,0,14,1,1740,22,20,10,2,1,1,25,169.29,0
1486,0,0,30,0,1180,30,20,13,3,1,2,30,128.4,0
1074,8,1,34,0,5513,92,19,22,2,1,1,25,337.725,1
70,10,0,39,2,11525,157,16,49,4,2,1,45,332.05,0
297,13,0,22,4,3113,55,46,25,3,1,1,30,310.72,0
516,0,0,37,0,275,4,0,1,2,1,2,25,12.555,0
1442,7,0,26,0,1100,34,23,16,2,1,1,25,154.53,0


# Step 3: Profiling the Dataset

In [5]:
# %pip --quiet install setuptools

In [6]:
import pandas as pd
from ydata_profiling import ProfileReport

# Build the profile report for the DataFrame reloaded from Parquet.
# `explorative=True` is forwarded to ProfileReport as a configuration flag.
profile = ProfileReport(df_parquet, explorative=True)

# Save the report as an HTML file (path is relative to the notebook's working directory).
REPORT_PATH = "customer_churn_profile_report.html"
profile.to_file(REPORT_PATH)

# Use print() like the other status messages in this notebook: display() on a
# plain string renders its quoted repr ('...') instead of the message itself.
print(f"Profile report generated and saved at: {REPORT_PATH}")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

'Profile report generated and saved at: customer_churn_profile_report.html'

In [7]:
# %pip --quiet install ipywidgets

In [8]:
# Display the profile report inside the notebook output.
# Requires `profile` from the report-generation cell to exist in the kernel.
profile.to_notebook_iframe()

# Step 4: Train-Test Split

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Ensure reproducibility: the same seed drives both splits below.
RANDOM_SEED = 42

# Split fractions, named so the 60/20/20 intent is explicit and easy to tune.
HOLDOUT_FRACTION = 0.4  # share of all rows kept out of the training set
PROD_SHARE_OF_HOLDOUT = 0.5  # share of the held-out rows that become the production set

# Split dataset into Training (60%), Remaining (40%)
# NOTE(review): the split is not stratified on "Churn"; consider passing
# stratify=df_parquet["Churn"] if class balance across the splits matters.
train_df, temp_df = train_test_split(
    df_parquet, test_size=HOLDOUT_FRACTION, random_state=RANDOM_SEED
)

# Split Remaining dataset into Test (20%) and Production (20%)
test_df, prod_df = train_test_split(
    temp_df, test_size=PROD_SHARE_OF_HOLDOUT, random_state=RANDOM_SEED
)

# Sanity checks before anything is written: the three splits must account for
# every row of the source frame, and none of them may be empty.
assert len(train_df) + len(test_df) + len(prod_df) == len(df_parquet), (
    "train/test/production splits do not add up to the full dataset"
)
assert min(len(train_df), len(test_df), len(prod_df)) > 0, "a split is empty"

# Save the splits as Parquet files (index=False: the shuffled row index is not stored)
train_path = "dataset/customer_churn_train.parquet"
test_path = "dataset/customer_churn_test.parquet"
prod_path = "dataset/customer_churn_prod.parquet"

train_df.to_parquet(train_path, index=False)
test_df.to_parquet(test_path, index=False)
prod_df.to_parquet(prod_path, index=False)

print(
    f"Train, Test, and Production datasets saved successfully:\n"
    f"Training Set: {train_path}\n"
    f"Test Set: {test_path}\n"
    f"Production Set: {prod_path}"
)


Train, Test, and Production datasets saved successfully:
Training Set: dataset/customer_churn_train.parquet
Test Set: dataset/customer_churn_test.parquet
Production Set: dataset/customer_churn_prod.parquet
