Step 1 & Step 2


In [3]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Load the customer churn dataset from CSV.
try:
    df = pd.read_csv('/content/Customer Churn.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): terminating the interpreter from inside
    # a notebook cell takes the whole kernel session down, whereas an exception
    # stops only this cell and still surfaces the message below.
    raise FileNotFoundError(
        "Error: 'Customer Churn.csv' not found. Please upload the file or provide the correct path."
    ) from err

# Define the schema using the exact dataset column names.
# Note: 'Call  Failure', 'Subscription  Length' and 'Charge  Amount' contain
# two consecutive spaces; the names are kept verbatim so they match the CSV header.
schema = pa.schema([
    pa.field('Call  Failure', pa.int64()),
    pa.field('Complains', pa.int64()),
    pa.field('Subscription  Length', pa.int64()),
    pa.field('Charge  Amount', pa.int64()),
    pa.field('Seconds of Use', pa.int64()),
    pa.field('Frequency of use', pa.int64()),
    pa.field('Frequency of SMS', pa.int64()),
    pa.field('Distinct Called Numbers', pa.int64()),
    pa.field('Age Group', pa.int64()),
    pa.field('Tariff Plan', pa.int64()),
    pa.field('Status', pa.int64()),
    pa.field('Age', pa.int64()),
    pa.field('Customer Value', pa.float64()),
    pa.field('Churn', pa.int64())
])

# Convert the Pandas DataFrame to a PyArrow Table, enforcing the schema above
table = pa.Table.from_pandas(df, schema=schema)

# Store the table in Parquet format; the bare filename is resolved against
# the current working directory
parquet_path = "customer_churn.parquet"
pq.write_table(table, parquet_path)

print(f"Dataset schema defined and stored in: {parquet_path}")



Dataset schema defined and stored in: customer_churn.parquet


In [5]:

# Location of the Parquet file written earlier in the notebook
parquet_path = "customer_churn.parquet"

# Load the stored Parquet data back into a Pandas DataFrame
df = pd.read_parquet(path=parquet_path)

# Print the leading rows (five, which is head()'s default)
print(df.head(n=5))

   Call  Failure  Complains  Subscription  Length  Charge  Amount  \
0              8          0                    38               0   
1              0          0                    39               0   
2             10          0                    37               0   
3             10          0                    38               0   
4              3          0                    38               0   

   Seconds of Use  Frequency of use  Frequency of SMS  \
0            4370                71                 5   
1             318                 5                 7   
2            2453                60               359   
3            4198                66                 1   
4            2393                58                 2   

   Distinct Called Numbers  Age Group  Tariff Plan  Status  Age  \
0                       17          3            1       1   30   
1                        4          2            1       2   25   
2                       24          3    

Step 3


In [7]:
!pip install ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.14.0-py2.py3-none-any.whl.metadata (22 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Downloading dacite-1.9.2-py3-none-any.whl.metadata (17 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata-profiling)
  Downloading pywavelets-1.

In [8]:
import pandas as pd
from ydata_profiling import ProfileReport

# Load the dataset from the Parquet file. The path is relative on purpose:
# the file is written with a bare filename (i.e. into the working directory),
# so reading it back from a hard-coded "/content/" prefix only works when the
# working directory happens to be /content.
parquet_path = "customer_churn.parquet"
df = pd.read_parquet(parquet_path)

# Generate the profile report (explorative=True enables the extended analyses)
profile = ProfileReport(df, explorative=True)

# Save the report as an HTML file in the working directory
report_path = "customer_churn_profile_report.html"
profile.to_file(report_path)

print(f"Profile report generated and saved at: {report_path}")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profile report generated and saved at: customer_churn_profile_report.html


In [9]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is a deprecated location.
from IPython.display import display, HTML

# Load the saved report and render it inline. The relative path matches where
# the report is written (a bare filename in the working directory) instead of
# assuming the working directory is /content.
report_path = "customer_churn_profile_report.html"
display(HTML(filename=report_path))


0,1
Number of variables,14
Number of observations,3150
Missing cells,0
Missing cells (%),0.0%
Duplicate rows,165
Duplicate rows (%),5.2%
Total size in memory,344.7 KiB
Average record size in memory,112.0 B

0,1
Numeric,8
Categorical,6

0,1
Dataset has 165 (5.2%) duplicate rows,Duplicates
Age is highly overall correlated with Age Group,High correlation
Age Group is highly overall correlated with Age,High correlation
Call Failure is highly overall correlated with Charge Amount and 2 other fields,High correlation
Charge Amount is highly overall correlated with Call Failure,High correlation
Churn is highly overall correlated with Complains,High correlation
Complains is highly overall correlated with Churn,High correlation
Customer Value is highly overall correlated with Distinct Called Numbers and 3 other fields,High correlation
Distinct Called Numbers is highly overall correlated with Call Failure and 3 other fields,High correlation
Frequency of SMS is highly overall correlated with Customer Value,High correlation

0,1
Analysis started,2025-03-15 09:17:05.973319
Analysis finished,2025-03-15 09:17:20.843519
Duration,14.87 seconds
Software version,ydata-profiling vv4.14.0
Download configuration,config.json

0,1
Distinct,37
Distinct (%),1.2%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,7.6279365

0,1
Minimum,0
Maximum,36
Zeros,702
Zeros (%),22.3%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0
5-th percentile,0
Q1,1
median,6
Q3,12
95-th percentile,22
Maximum,36
Range,36
Interquartile range (IQR),11

0,1
Standard deviation,7.2638856
Coefficient of variation (CV),0.95227399
Kurtosis,0.90682067
Mean,7.6279365
Median Absolute Deviation (MAD),5
Skewness,1.0897518
Sum,24028
Variance,52.764034
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,702,22.3%
5,244,7.7%
7,166,5.3%
6,161,5.1%
8,156,5.0%
9,149,4.7%
3,141,4.5%
2,137,4.3%
4,133,4.2%
11,125,4.0%

Value,Count,Frequency (%)
0,702,22.3%
1,121,3.8%
2,137,4.3%
3,141,4.5%
4,133,4.2%
5,244,7.7%
6,161,5.1%
7,166,5.3%
8,156,5.0%
9,149,4.7%

Value,Count,Frequency (%)
36,2,0.1%
35,2,0.1%
34,3,0.1%
33,3,0.1%
32,8,0.3%
31,6,0.2%
30,16,0.5%
29,7,0.2%
28,17,0.5%
27,13,0.4%

0,1
Distinct,2
Distinct (%),0.1%
Missing,0
Missing (%),0.0%
Memory size,178.5 KiB

0,1
0,2909
1,241

0,1
Max length,1
Median length,1
Mean length,1
Min length,1

0,1
Total characters,3150
Distinct characters,2
Distinct categories,1 ?
Distinct scripts,1 ?
Distinct blocks,1 ?

0,1
Unique,0 ?
Unique (%),0.0%

0,1
1st row,0
2nd row,0
3rd row,0
4th row,0
5th row,0

Value,Count,Frequency (%)
0,2909,92.3%
1,241,7.7%

Value,Count,Frequency (%)
0,2909,92.3%
1,241,7.7%

Value,Count,Frequency (%)
0,2909,92.3%
1,241,7.7%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
0,2909,92.3%
1,241,7.7%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
0,2909,92.3%
1,241,7.7%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
0,2909,92.3%
1,241,7.7%

0,1
Distinct,45
Distinct (%),1.4%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,32.541905

0,1
Minimum,3
Maximum,47
Zeros,0
Zeros (%),0.0%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,3
5-th percentile,13
Q1,30
median,35
Q3,38
95-th percentile,42
Maximum,47
Range,44
Interquartile range (IQR),8

0,1
Standard deviation,8.5734821
Coefficient of variation (CV),0.26345975
Kurtosis,1.2158424
Mean,32.541905
Median Absolute Deviation (MAD),4
Skewness,-1.300015
Sum,102507
Variance,73.504595
Monotonicity,Not monotonic

Value,Count,Frequency (%)
36,276,8.8%
38,258,8.2%
37,229,7.3%
35,228,7.2%
34,201,6.4%
39,201,6.4%
40,186,5.9%
33,152,4.8%
32,121,3.8%
41,110,3.5%

Value,Count,Frequency (%)
3,8,0.3%
4,4,0.1%
5,6,0.2%
6,8,0.3%
7,19,0.6%
8,12,0.4%
9,22,0.7%
10,16,0.5%
11,26,0.8%
12,19,0.6%

Value,Count,Frequency (%)
47,1,< 0.1%
46,13,0.4%
45,23,0.7%
44,44,1.4%
43,56,1.8%
42,80,2.5%
41,110,3.5%
40,186,5.9%
39,201,6.4%
38,258,8.2%

0,1
Distinct,11
Distinct (%),0.3%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,0.94285714

0,1
Minimum,0
Maximum,10
Zeros,1768
Zeros (%),56.1%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0
5-th percentile,0
Q1,0
median,0
Q3,1
95-th percentile,4
Maximum,10
Range,10
Interquartile range (IQR),1

0,1
Standard deviation,1.5210719
Coefficient of variation (CV),1.6132581
Kurtosis,8.8543583
Mean,0.94285714
Median Absolute Deviation (MAD),0
Skewness,2.5848682
Sum,2970
Variance,2.3136597
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,1768,56.1%
1,617,19.6%
2,395,12.5%
3,199,6.3%
4,76,2.4%
5,30,1.0%
8,19,0.6%
9,14,0.4%
7,14,0.4%
6,11,0.3%

Value,Count,Frequency (%)
0,1768,56.1%
1,617,19.6%
2,395,12.5%
3,199,6.3%
4,76,2.4%
5,30,1.0%
6,11,0.3%
7,14,0.4%
8,19,0.6%
9,14,0.4%

Value,Count,Frequency (%)
10,7,0.2%
9,14,0.4%
8,19,0.6%
7,14,0.4%
6,11,0.3%
5,30,1.0%
4,76,2.4%
3,199,6.3%
2,395,12.5%
1,617,19.6%

0,1
Distinct,1756
Distinct (%),55.7%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,4472.4597

0,1
Minimum,0
Maximum,17090
Zeros,154
Zeros (%),4.9%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0.0
5-th percentile,54.5
Q1,1391.25
median,2990.0
Q3,6478.25
95-th percentile,15020.5
Maximum,17090.0
Range,17090.0
Interquartile range (IQR),5087.0

0,1
Standard deviation,4197.9087
Coefficient of variation (CV),0.93861297
Kurtosis,0.99367573
Mean,4472.4597
Median Absolute Deviation (MAD),1996
Skewness,1.3219429
Sum,14088248
Variance,17622437
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,154,4.9%
305,37,1.2%
710,9,0.3%
1015,9,0.3%
1973,9,0.3%
2088,9,0.3%
1360,8,0.3%
825,8,0.3%
955,8,0.3%
1180,8,0.3%

Value,Count,Frequency (%)
0,154,4.9%
8,1,< 0.1%
13,1,< 0.1%
33,1,< 0.1%
50,1,< 0.1%
60,1,< 0.1%
73,1,< 0.1%
80,1,< 0.1%
88,1,< 0.1%
93,2,0.1%

Value,Count,Frequency (%)
17090,1,< 0.1%
16980,1,< 0.1%
16785,1,< 0.1%
16675,1,< 0.1%
16640,1,< 0.1%
16570,1,< 0.1%
16560,1,< 0.1%
16500,1,< 0.1%
16495,1,< 0.1%
16480,1,< 0.1%

0,1
Distinct,242
Distinct (%),7.7%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,69.460635

0,1
Minimum,0
Maximum,255
Zeros,154
Zeros (%),4.9%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0.0
5-th percentile,1.0
Q1,27.0
median,54.0
Q3,95.0
95-th percentile,184.55
Maximum,255.0
Range,255.0
Interquartile range (IQR),68.0

0,1
Standard deviation,57.413308
Coefficient of variation (CV),0.82655893
Kurtosis,0.82012484
Mean,69.460635
Median Absolute Deviation (MAD),33
Skewness,1.1441664
Sum,218801
Variance,3296.2879
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,154,4.9%
6,49,1.6%
44,38,1.2%
39,37,1.2%
41,35,1.1%
33,33,1.0%
36,33,1.0%
25,32,1.0%
47,32,1.0%
45,32,1.0%

Value,Count,Frequency (%)
0,154,4.9%
1,9,0.3%
2,15,0.5%
3,4,0.1%
4,23,0.7%
5,15,0.5%
6,49,1.6%
7,19,0.6%
8,25,0.8%
9,16,0.5%

Value,Count,Frequency (%)
255,1,< 0.1%
254,2,0.1%
252,1,< 0.1%
250,2,0.1%
249,1,< 0.1%
248,2,0.1%
247,1,< 0.1%
246,2,0.1%
245,1,< 0.1%
244,5,0.2%

0,1
Distinct,405
Distinct (%),12.9%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,73.174921

0,1
Minimum,0
Maximum,522
Zeros,603
Zeros (%),19.1%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0.0
5-th percentile,0.0
Q1,6.0
median,21.0
Q3,87.0
95-th percentile,356.55
Maximum,522.0
Range,522.0
Interquartile range (IQR),81.0

0,1
Standard deviation,112.23756
Coefficient of variation (CV),1.5338255
Kurtosis,3.2585401
Mean,73.174921
Median Absolute Deviation (MAD),21
Skewness,1.9741418
Sum,230501
Variance,12597.27
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,603,19.1%
7,194,6.2%
9,54,1.7%
15,54,1.7%
8,54,1.7%
16,51,1.6%
17,47,1.5%
10,44,1.4%
12,42,1.3%
1,41,1.3%

Value,Count,Frequency (%)
0,603,19.1%
1,41,1.3%
2,39,1.2%
3,32,1.0%
4,30,1.0%
5,29,0.9%
6,26,0.8%
7,194,6.2%
8,54,1.7%
9,54,1.7%

Value,Count,Frequency (%)
522,1,< 0.1%
515,1,< 0.1%
511,1,< 0.1%
508,1,< 0.1%
505,1,< 0.1%
504,1,< 0.1%
501,1,< 0.1%
500,1,< 0.1%
499,1,< 0.1%
498,3,0.1%

0,1
Distinct,92
Distinct (%),2.9%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,23.509841

0,1
Minimum,0
Maximum,97
Zeros,154
Zeros (%),4.9%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0
5-th percentile,1
Q1,10
median,21
Q3,34
95-th percentile,51
Maximum,97
Range,97
Interquartile range (IQR),24

0,1
Standard deviation,17.217337
Coefficient of variation (CV),0.73234597
Kurtosis,1.3599904
Mean,23.509841
Median Absolute Deviation (MAD),11
Skewness,1.0294021
Sum,74056
Variance,296.43671
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,154,4.9%
2,88,2.8%
10,78,2.5%
15,77,2.4%
6,76,2.4%
17,76,2.4%
20,75,2.4%
19,75,2.4%
8,75,2.4%
16,74,2.3%

Value,Count,Frequency (%)
0,154,4.9%
1,31,1.0%
2,88,2.8%
3,44,1.4%
4,63,2.0%
5,60,1.9%
6,76,2.4%
7,61,1.9%
8,75,2.4%
9,73,2.3%

Value,Count,Frequency (%)
97,1,< 0.1%
95,1,< 0.1%
93,1,< 0.1%
88,1,< 0.1%
87,1,< 0.1%
86,3,0.1%
85,3,0.1%
84,4,0.1%
83,4,0.1%
82,8,0.3%

0,1
Distinct,5
Distinct (%),0.2%
Missing,0
Missing (%),0.0%
Memory size,178.5 KiB

0,1
3,1425
2,1037
4,395
5,170
1,123

0,1
Max length,1
Median length,1
Mean length,1
Min length,1

0,1
Total characters,3150
Distinct characters,5
Distinct categories,1 ?
Distinct scripts,1 ?
Distinct blocks,1 ?

0,1
Unique,0 ?
Unique (%),0.0%

0,1
1st row,3
2nd row,2
3rd row,3
4th row,1
5th row,1

Value,Count,Frequency (%)
3,1425,45.2%
2,1037,32.9%
4,395,12.5%
5,170,5.4%
1,123,3.9%

Value,Count,Frequency (%)
3,1425,45.2%
2,1037,32.9%
4,395,12.5%
5,170,5.4%
1,123,3.9%

Value,Count,Frequency (%)
3,1425,45.2%
2,1037,32.9%
4,395,12.5%
5,170,5.4%
1,123,3.9%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
3,1425,45.2%
2,1037,32.9%
4,395,12.5%
5,170,5.4%
1,123,3.9%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
3,1425,45.2%
2,1037,32.9%
4,395,12.5%
5,170,5.4%
1,123,3.9%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
3,1425,45.2%
2,1037,32.9%
4,395,12.5%
5,170,5.4%
1,123,3.9%

0,1
Distinct,2
Distinct (%),0.1%
Missing,0
Missing (%),0.0%
Memory size,178.5 KiB

0,1
1,2905
2,245

0,1
Max length,1
Median length,1
Mean length,1
Min length,1

0,1
Total characters,3150
Distinct characters,2
Distinct categories,1 ?
Distinct scripts,1 ?
Distinct blocks,1 ?

0,1
Unique,0 ?
Unique (%),0.0%

0,1
1st row,1
2nd row,1
3rd row,1
4th row,1
5th row,1

Value,Count,Frequency (%)
1,2905,92.2%
2,245,7.8%

Value,Count,Frequency (%)
1,2905,92.2%
2,245,7.8%

Value,Count,Frequency (%)
1,2905,92.2%
2,245,7.8%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
1,2905,92.2%
2,245,7.8%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
1,2905,92.2%
2,245,7.8%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
1,2905,92.2%
2,245,7.8%

0,1
Distinct,2
Distinct (%),0.1%
Missing,0
Missing (%),0.0%
Memory size,178.5 KiB

0,1
1,2368
2,782

0,1
Max length,1
Median length,1
Mean length,1
Min length,1

0,1
Total characters,3150
Distinct characters,2
Distinct categories,1 ?
Distinct scripts,1 ?
Distinct blocks,1 ?

0,1
Unique,0 ?
Unique (%),0.0%

0,1
1st row,1
2nd row,2
3rd row,1
4th row,1
5th row,1

Value,Count,Frequency (%)
1,2368,75.2%
2,782,24.8%

Value,Count,Frequency (%)
1,2368,75.2%
2,782,24.8%

Value,Count,Frequency (%)
1,2368,75.2%
2,782,24.8%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
1,2368,75.2%
2,782,24.8%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
1,2368,75.2%
2,782,24.8%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
1,2368,75.2%
2,782,24.8%

0,1
Distinct,5
Distinct (%),0.2%
Missing,0
Missing (%),0.0%
Memory size,181.6 KiB

0,1
30,1425
25,1037
45,395
55,170
15,123

0,1
Max length,2
Median length,2
Mean length,2
Min length,2

0,1
Total characters,6300
Distinct characters,6
Distinct categories,1 ?
Distinct scripts,1 ?
Distinct blocks,1 ?

0,1
Unique,0 ?
Unique (%),0.0%

0,1
1st row,30
2nd row,25
3rd row,30
4th row,15
5th row,15

Value,Count,Frequency (%)
30,1425,45.2%
25,1037,32.9%
45,395,12.5%
55,170,5.4%
15,123,3.9%

Value,Count,Frequency (%)
30,1425,45.2%
25,1037,32.9%
45,395,12.5%
55,170,5.4%
15,123,3.9%

Value,Count,Frequency (%)
5,1895,30.1%
3,1425,22.6%
0,1425,22.6%
2,1037,16.5%
4,395,6.3%
1,123,2.0%

Value,Count,Frequency (%)
(unknown),6300,100.0%

Value,Count,Frequency (%)
5,1895,30.1%
3,1425,22.6%
0,1425,22.6%
2,1037,16.5%
4,395,6.3%
1,123,2.0%

Value,Count,Frequency (%)
(unknown),6300,100.0%

Value,Count,Frequency (%)
5,1895,30.1%
3,1425,22.6%
0,1425,22.6%
2,1037,16.5%
4,395,6.3%
1,123,2.0%

Value,Count,Frequency (%)
(unknown),6300,100.0%

Value,Count,Frequency (%)
5,1895,30.1%
3,1425,22.6%
0,1425,22.6%
2,1037,16.5%
4,395,6.3%
1,123,2.0%

0,1
Distinct,2654
Distinct (%),84.3%
Missing,0
Missing (%),0.0%
Infinite,0
Infinite (%),0.0%
Mean,470.97292

0,1
Minimum,0
Maximum,2165.28
Zeros,132
Zeros (%),4.2%
Negative,0
Negative (%),0.0%
Memory size,24.7 KiB

0,1
Minimum,0.0
5-th percentile,10.335
Q1,113.80125
median,228.48
Q3,788.38875
95-th percentile,1587.68
Maximum,2165.28
Range,2165.28
Interquartile range (IQR),674.5875

0,1
Standard deviation,517.01543
Coefficient of variation (CV),1.0977604
Kurtosis,1.2244965
Mean,470.97292
Median Absolute Deviation (MAD),160.5825
Skewness,1.4272916
Sum,1483564.7
Variance,267304.96
Monotonicity,Not monotonic

Value,Count,Frequency (%)
0,132,4.2%
45.495,11,0.3%
40.44,10,0.3%
15.165,6,0.2%
25.275,5,0.2%
121.4,4,0.1%
180,4,0.1%
1538.145,4,0.1%
131.4,4,0.1%
197.64,3,0.1%

Value,Count,Frequency (%)
0.0,132,4.2%
2.34,1,< 0.1%
4.0,1,< 0.1%
4.41,1,< 0.1%
4.5,2,0.1%
5.13,1,< 0.1%
5.175,1,< 0.1%
5.4,3,0.1%
5.625,1,< 0.1%
5.94,1,< 0.1%

Value,Count,Frequency (%)
2165.28,1,< 0.1%
2149.28,1,< 0.1%
2148.84,1,< 0.1%
2148.03,1,< 0.1%
2140.96,1,< 0.1%
2129.535,1,< 0.1%
2127.68,1,< 0.1%
2124.84,1,< 0.1%
2120.67,1,< 0.1%
2117.72,1,< 0.1%

0,1
Distinct,2
Distinct (%),0.1%
Missing,0
Missing (%),0.0%
Memory size,178.5 KiB

0,1
0,2655
1,495

0,1
Max length,1
Median length,1
Mean length,1
Min length,1

0,1
Total characters,3150
Distinct characters,2
Distinct categories,1 ?
Distinct scripts,1 ?
Distinct blocks,1 ?

0,1
Unique,0 ?
Unique (%),0.0%

0,1
1st row,0
2nd row,0
3rd row,0
4th row,0
5th row,0

Value,Count,Frequency (%)
0,2655,84.3%
1,495,15.7%

Value,Count,Frequency (%)
0,2655,84.3%
1,495,15.7%

Value,Count,Frequency (%)
0,2655,84.3%
1,495,15.7%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
0,2655,84.3%
1,495,15.7%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
0,2655,84.3%
1,495,15.7%

Value,Count,Frequency (%)
(unknown),3150,100.0%

Value,Count,Frequency (%)
0,2655,84.3%
1,495,15.7%

Unnamed: 0,Age,Age Group,Call Failure,Charge Amount,Churn,Complains,Customer Value,Distinct Called Numbers,Frequency of SMS,Frequency of use,Seconds of Use,Status,Subscription Length,Tariff Plan
Age,1.0,1.0,0.103,0.236,0.132,0.069,0.214,0.241,0.173,0.235,0.297,0.197,0.162,0.192
Age Group,1.0,1.0,0.103,0.236,0.132,0.069,0.214,0.241,0.173,0.235,0.297,0.197,0.162,0.192
Call Failure,0.103,0.103,1.0,0.572,0.035,0.17,0.346,0.514,0.27,0.55,0.466,0.121,0.249,0.224
Charge Amount,0.236,0.236,0.572,1.0,0.176,0.063,0.393,0.437,0.319,0.447,0.49,0.321,0.15,0.367
Churn,0.132,0.132,0.035,0.176,1.0,0.53,0.318,0.296,0.253,0.338,0.353,0.498,0.217,0.103
Complains,0.069,0.069,0.17,0.063,0.53,1.0,0.143,0.078,0.125,0.143,0.158,0.269,0.137,0.0
Customer Value,0.214,0.214,0.346,0.393,0.318,0.143,1.0,0.563,0.78,0.673,0.714,0.498,0.145,0.434
Distinct Called Numbers,0.241,0.241,0.514,0.437,0.296,0.078,0.563,1.0,0.321,0.824,0.763,0.447,0.157,0.219
Frequency of SMS,0.173,0.173,0.27,0.319,0.253,0.125,0.78,0.321,1.0,0.306,0.308,0.344,0.104,0.452
Frequency of use,0.235,0.235,0.55,0.447,0.338,0.143,0.673,0.824,0.306,1.0,0.937,0.536,0.174,0.422

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0
5,11,0,38,1,3775,82,32,28,3,1,1,30,282.28,0
6,4,0,38,0,2360,39,285,18,3,1,1,30,1235.96,0
7,13,0,37,2,9115,121,144,43,3,1,1,30,945.44,0
8,7,0,38,0,13773,169,0,44,3,1,1,30,557.68,0
9,7,0,38,1,4515,83,2,25,3,1,1,30,191.92,0

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
3140,16,0,29,0,1005,31,17,9,3,1,2,30,109.44,0
3141,5,0,28,0,1130,16,28,5,4,1,2,45,98.65,0
3142,15,0,27,1,1530,38,26,15,2,1,1,25,187.56,0
3143,7,0,27,1,3530,67,15,25,3,1,1,30,203.88,0
3144,7,0,20,1,2000,32,35,16,3,1,1,30,221.28,0
3145,21,0,19,2,6697,147,92,44,2,2,1,25,721.98,0
3146,17,0,17,1,9237,177,80,42,5,1,1,55,261.21,0
3147,13,0,18,4,3157,51,38,21,3,1,1,30,280.32,0
3148,7,0,11,2,4695,46,222,12,3,1,1,30,1077.64,0
3149,8,1,11,2,1792,25,7,9,3,1,1,30,100.68,1

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn,# duplicates
29,0,0,35,0,0,0,0,0,2,1,2,25,0.0,1,6
42,0,0,37,0,0,0,0,0,2,1,2,25,0.0,0,6
24,0,0,34,0,0,0,0,0,5,1,1,55,0.0,0,5
43,0,0,37,0,0,0,0,0,2,1,2,25,0.0,1,5
22,0,0,34,0,0,0,0,0,2,1,2,25,0.0,1,4
28,0,0,35,0,0,0,0,0,2,1,2,25,0.0,0,4
33,0,0,36,0,0,0,0,0,2,1,2,25,0.0,0,4
85,5,0,39,0,305,6,7,2,2,1,2,25,45.495,1,4
1,0,0,7,0,4085,34,208,8,3,1,1,30,996.76,0,3
2,0,0,16,0,1390,20,21,12,2,1,1,25,157.95,0,3


Step 4


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset from Parquet. Relative path: the file is written with a bare
# filename into the working directory, so a hard-coded "/content/" prefix
# would only resolve when the working directory is /content.
parquet_path = "customer_churn.parquet"
df = pd.read_parquet(parquet_path)

# Fixed seed so both splits are reproducible
random_seed = 42

# NOTE(review): the splits are purely random — they are not stratified on
# 'Churn', and the profiling output reports duplicate rows, which may end up
# in different splits. Confirm whether either matters for the downstream use.

# Split dataset into Training (60%), Remaining (40%)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=random_seed)

# Split Remaining dataset evenly into Test (20% of total) and Production (20% of total)
test_df, prod_df = train_test_split(temp_df, test_size=0.5, random_state=random_seed)

# Sanity check: every row must land in exactly one of the three splits
assert len(train_df) + len(test_df) + len(prod_df) == len(df), "split sizes do not add up"

# Save the splits as Parquet files (index dropped; it only reflects the shuffle order)
train_path = "customer_churn_train.parquet"
test_path = "customer_churn_test.parquet"
prod_path = "customer_churn_prod.parquet"

train_df.to_parquet(train_path, index=False)
test_df.to_parquet(test_path, index=False)
prod_df.to_parquet(prod_path, index=False)

print(f"Train, Test, and Production datasets saved successfully:\n"
      f"Training Set: {train_path}\n"
      f"Test Set: {test_path}\n"
      f"Production Set: {prod_path}")


Train, Test, and Production datasets saved successfully:
Training Set: customer_churn_train.parquet
Test Set: customer_churn_test.parquet
Production Set: customer_churn_prod.parquet


Step 5
