In [16]:
import pandas as pd
import numpy as np

In [17]:
df = pd.read_csv("raw_dataset.csv")

In [18]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [19]:
print("Dataset shape:", df.shape)
df.info()
df.describe(include='all')

Dataset shape: (10002, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10002.0,10002.0,10002,10002.0,10001,10002,10001.0,10002.0,10002.0,10002.0,10001.0,10001.0,10002.0,10002.0
unique,,,2932,,3,2,,,,,,,,
top,,,Smith,,France,Male,,,,,,,,
freq,,,32,,5014,5458,,,,,,,,
mean,5001.4996,15690930.0,,650.555089,,,38.922311,5.012498,76491.112875,1.530194,0.705529,0.514949,100083.331145,0.203759
std,2887.472338,71931.77,,96.661615,,,10.4872,2.891973,62393.474144,0.581639,0.455827,0.499801,57508.117802,0.402812
min,1.0,15565700.0,,350.0,,,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2501.25,15628520.0,,584.0,,,32.0,3.0,0.0,1.0,0.0,0.0,50983.75,0.0
50%,5001.5,15690730.0,,652.0,,,37.0,5.0,97198.54,1.0,1.0,1.0,100185.24,0.0
75%,7501.75,15753230.0,,718.0,,,44.0,7.0,127647.84,2.0,1.0,1.0,149383.6525,0.0


## Missing Values

In [20]:
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64


## Duplicate Rows

In [30]:
duplicates = df.duplicated().sum()
print("Number of duplicate rows:", duplicates)


Number of duplicate rows: 2


## Value Error

In [31]:
print(df.dtypes)
numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Exited']
for col in numeric_cols:
    invalid_entries = pd.to_numeric(df[col], errors='coerce').isnull().sum()
    print(f"Invalid entries in {col}: {invalid_entries}")

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object
Invalid entries in CreditScore: 0
Invalid entries in Age: 1
Invalid entries in Tenure: 0
Invalid entries in Balance: 0
Invalid entries in NumOfProducts: 0
Invalid entries in EstimatedSalary: 0
Invalid entries in Exited: 0


## Categorical Errors

In [32]:
categorical_cols = ['Geography', 'Gender']

for col in categorical_cols:
    print(f"\nUnique values in {col}:")
    print(df[col].value_counts())

df['Geography'] = df['Geography'].str.strip().str.title()
df['Gender'] = df['Gender'].str.strip().str.title()



Unique values in Geography:
Geography
France     5014
Germany    2510
Spain      2477
Name: count, dtype: int64

Unique values in Gender:
Gender
Male      5458
Female    4544
Name: count, dtype: int64


## Duplicate Id's

In [33]:
duplicate_ids = df['CustomerId'].duplicated().sum()
print("Duplicate CustomerId count:", duplicate_ids)

if not all(df['RowNumber'] == range(1, len(df)+1)):
    print("RowNumber sequence error detected.")


Duplicate CustomerId count: 2
RowNumber sequence error detected.


## Format Error

In [34]:
text_cols = ['Surname', 'Geography', 'Gender']
for col in text_cols:
    if df[col].str.contains(r'^\s|\s$', regex=True).any():
        print(f"Formatting issue found in column: {col}")

for col in text_cols:
    df[col] = df[col].str.strip()

## Invalid/out of range errors

In [35]:
negative_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
for col in negative_cols:
    neg_values = df[df[col] < 0]
    if not neg_values.empty:
        print(f"Negative values found in '{col}': {neg_values.shape[0]} rows")

invalid_credit = df[(df['CreditScore'] < 300) | (df['CreditScore'] > 850)]
print("Invalid CreditScore values:", invalid_credit.shape[0])

invalid_age = df[(df['Age'] <= 0) | (df['Age'] > 120)]
print("Invalid Age values:", invalid_age.shape[0])

invalid_balance = df[df['Balance'] < 0]
print("Invalid Balance entries:", invalid_balance.shape[0])

invalid_products = df[df['NumOfProducts'] < 0]
print("Invalid NumOfProducts entries:", invalid_products.shape[0])

invalid_tenure = df[(df['Tenure'] < 0) | (df['Tenure'] > 10)]
print("Invalid Tenure values:", invalid_tenure.shape[0])


Invalid CreditScore values: 0
Invalid Age values: 0
Invalid Balance entries: 0
Invalid NumOfProducts entries: 0
Invalid Tenure values: 0
