In [1]:
pip list

Package                           Version
--------------------------------- -------------------
aiobotocore                       2.19.0
aiohappyeyeballs                  2.4.4
aiohttp                           3.11.10
aioitertools                      0.7.1
aiosignal                         1.2.0
alabaster                         0.7.16
altair                            5.5.0
anaconda-anon-usage               0.7.1
anaconda-auth                     0.8.6
anaconda-catalogs                 0.2.0
anaconda-cli-base                 0.5.2
anaconda-client                   1.13.0
anaconda-navigator                2.6.6
anaconda-project                  0.11.1
annotated-types                   0.6.0
anyio                             4.7.0
appdirs                           1.4.4
archspec                          0.2.3
argon2-cffi                       21.3.0
argon2-cffi-bindings              21.2.0
arrow                             1.3.0
astroid                           3.3.8
astropy         

In [2]:
import pandas as pd
data = pd.read_csv("telecom_data.csv")
print(data.head(10))
data.isnull().sum()

   Age  Gender  PlanType  MonthlyUsage Churn
0   21  Female   Regular            15    No
1   45  Female   Economy            41    No
2   44  Female   Economy            40    No
3   31  Female   Regular            23   Yes
4   33  Female   Regular            12    No
5   42  Female   Regular            52    No
6   20  Female     Ultra            57   Yes
7   26    Male     Ultra            23    No
8   37  Female  Advanced            31    No
9   26    Male   Economy            23    No


Age             0
Gender          0
PlanType        0
MonthlyUsage    0
Churn           0
dtype: int64

In [3]:
# Inspect columns and basic info
print('Columns:', data.columns.tolist())

#1.Dataset Basic Info
print('\nDataset Info:')
print(data.info())


#2.Completeness: Check for missing values:
#Checks if any column has missing/null values.
print('\nDataset Completeness:')
print(data.isnull().sum())


#3.Dataset Consistency: All numeric columns should be numeric.
print('\nDataset Consistency:')
print(data.dtypes)           # Check datatype consistency


Columns: ['Age', 'Gender', 'PlanType', 'MonthlyUsage', 'Churn']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           150 non-null    int64 
 1   Gender        150 non-null    object
 2   PlanType      150 non-null    object
 3   MonthlyUsage  150 non-null    int64 
 4   Churn         150 non-null    object
dtypes: int64(2), object(3)
memory usage: 6.0+ KB
None

Dataset Completeness:
Age             0
Gender          0
PlanType        0
MonthlyUsage    0
Churn           0
dtype: int64

Dataset Consistency:
Age              int64
Gender          object
PlanType        object
MonthlyUsage     int64
Churn           object
dtype: object


In [4]:
print('\nDataset Describe:')
data.describe()


Dataset Describe:


Unnamed: 0,Age,MonthlyUsage
count,150.0,150.0
mean,35.193333,33.693333
std,10.841566,15.923031
min,19.0,3.0
25%,25.0,23.0
50%,35.0,35.0
75%,44.0,50.0
max,54.0,59.0


In [5]:

#6.BIAS Check — Is data skewed toward 1 user group?
print('\nDataset Bias')
print(data['Gender'].value_counts(normalize=True))
data['Churn'].value_counts(normalize=True)
data['Age'].value_counts(normalize=True)


Dataset Bias
Gender
Female    0.793333
Male      0.206667
Name: proportion, dtype: float64


Age
45    0.106667
21    0.093333
44    0.093333
42    0.080000
20    0.080000
26    0.073333
37    0.060000
31    0.053333
54    0.046667
53    0.046667
24    0.040000
34    0.033333
35    0.033333
25    0.033333
43    0.026667
33    0.026667
38    0.020000
32    0.020000
19    0.020000
52    0.013333
Name: proportion, dtype: float64

In [None]:
## 33.4.2  Dataset Specification Sheet

| **Field**           | **Details** |
|---------------------|-------------|
| **Dataset Name**    | `telecom_data.csv` (demo dataset) |
| **Description**     | Contains student study behavior and academic performance data used to demonstrate Data Quality checks including Completeness, Consistency, Accuracy, Bias analysis, and basic feature engineering. |
| **Features**        | `Age`, `Gender`, `PlanType`, `MonthlyUsage` |
| **Target Variable** | *Churn* (Yes = churned, No = No churn) |
| **Number of Rows**  | 150 |
| **Null Handling**   | No missing values detected across any feature |
| **Transformations** | Label encoding applied to `class` → `class_encoded`; datatype validation performed; value distribution and proportion analysis conducted for `gender`, `grades`, and `class` |
| **Owner**           | Student / Notebook Author |
| **Last Updated**    | 2025-12-19 |

In [6]:
#separate features and targets , drop target column 'churn' from the main dataframe
fdata = data.drop(columns=['Churn'])
X = fdata
#encode target variable to numeric (yes=1 , No=0)
y=data["Churn"].map({'Yes':1,'No':0})

categorical = fdata.select_dtypes(include='object').columns
numerical = fdata.select_dtypes(exclude='object').columns

print("categorial features:", list(categorical))
print("numerial features:", list(numerical))

categorial features: ['Gender', 'PlanType']
numerial features: ['Age', 'MonthlyUsage']


In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#feature Encoding
#using pd.get_dummies()

X_encoded_gd = pd.get_dummies(
    X,
    columns=categorical,
    drop_first=True
)
#this print is using pandas
print("gd Encoded Columns: ",X_encoded_gd.columns.tolist())

#Using one hot encoding 
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False
)

encoded_array = ohe.fit_transform(X[categorical])

#convert back to Dataframe
encoded_df=pd.DataFrame(
    encoded_array,
    columns=ohe.get_feature_names_out(categorical)
)

x_encoded_ohe = pd.concat(
    [X[numerical].reset_index(drop=True), encoded_df.reset_index(drop=True)],
    axis=1
)
#this print is using onehotencoder
print("one hot column:", x_encoded_ohe.columns.tolist())
print(encoded_array)  #1. standard , 2. economy , 3. regular , 4. Advanced , 5. Ultra

scaler = StandardScaler()
X_scaled_gd = X_encoded_gd.copy()
X_scaled_gd[numerical]=scaler.fit_transform(X_scaled_gd[numerical])
#feature scaling
#using onehotencoder()
X_scaled_ohe=x_encoded_ohe.copy()
X_scaled_ohe[numerical]=scaler.fit_transform(X_scaled_ohe[numerical])
df=pd.DataFrame({
    'Age_ohe':X_scaled_ohe[numerical]['Age'],
    'Age_gd':X_scaled_ohe[numerical]['Age'],
    'MonthlyUsage_ohe':X_scaled_ohe[numerical]['MonthlyUsage'],
    'MonthlyUsage_gd':X_scaled_ohe[numerical]['MonthlyUsage']
})
print("comparison_Table:\n",df)


gd Encoded Columns:  ['Age', 'MonthlyUsage', 'Gender_Male', 'PlanType_Economy', 'PlanType_Regular', 'PlanType_Standard', 'PlanType_Ultra']
one hot column: ['Age', 'MonthlyUsage', 'Gender_Male', 'PlanType_Economy', 'PlanType_Regular', 'PlanType_Standard', 'PlanType_Ultra']
[[0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0.

In [13]:
print(x_encoded_gd.Gender_Male)

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Name: Gender_Male, Length: 150, dtype: bool


In [16]:
print(x_encoded_gd.PlanType_Economy)

0      False
1       True
2       True
3      False
4      False
       ...  
145    False
146    False
147    False
148     True
149    False
Name: PlanType_Economy, Length: 150, dtype: bool


In [8]:
scaler = StandardScaler()
X_scaled_gd = X_encoded_gd.copy()
X_scaled_gd[numerical]=scaler.fit_transform(X_scaled_gd[numerical])
#feature scaling
#using onehotencoder()
X_scaled_ohe=x_encoded_ohe.copy()
X_scaled_ohe[numerical]=scaler.fit_transform(X_scaled_ohe[numerical])
df=pd.DataFrame({
    'Age_ohe':X_scaled_ohe[numerical]['Age'],
    'Age_gd':X_scaled_ohe[numerical]['Age'],
    'MonthlyUsage_ohe':X_scaled_ohe[numerical]['MonthlyUsage'],
    'MonthlyUsage_gd':X_scaled_ohe[numerical]['MonthlyUsage']
})
print("comparison_Table:\n",df)

comparison_Table:
       Age_ohe    Age_gd  MonthlyUsage_ohe  MonthlyUsage_gd
0   -1.313545 -1.313545         -1.177914        -1.177914
1    0.907574  0.907574          0.460411         0.460411
2    0.815027  0.815027          0.397399         0.397399
3   -0.388079 -0.388079         -0.673814        -0.673814
4   -0.202986 -0.202986         -1.366951        -1.366951
..        ...       ...               ...              ...
145 -1.498638 -1.498638         -1.934064        -1.934064
146  0.629934  0.629934          1.153549         1.153549
147  1.740493  1.740493         -1.682014        -1.682014
148 -1.406091 -1.406091          1.468612         1.468612
149 -0.388079 -0.388079         -0.673814        -0.673814

[150 rows x 4 columns]


In [11]:
#Train validation test split for model developement
from sklearn.model_selection import train_test_split

#split data into training set 80% and test set 20%
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_ohe, #Preprocessed feature matrix
    y,            # Target variable (churn)
    test_size=0.20, # 20% hold-out test data
    random_state=42, # reproducibility
    stratify=y # preserve churn distribution(equal division)
)

pd.DataFrame({
    "Train": y_train.value_counts(normalize=True),
    "Test": y_test.value_counts(normalize=True)
})

Unnamed: 0_level_0,Train,Test
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.891667,0.9
1,0.108333,0.1
