In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from a CSV file (the path is relative to the working directory).
csv_path = "quality_dataset_100.csv"
df = pd.read_csv(csv_path)

In [None]:
# Show the loaded DataFrame; Jupyter renders the value of the cell's last expression.
df

Unnamed: 0,Record_ID,Value_1,Value_2,Duplicate_Column,Error_Column
0,1,51.1,1.0,1,10.5
1,2,52.2,2.0,2,11.0
2,3,53.3,3.0,1,11.5
3,4,54.4,4.0,2,12.0
4,5,55.5,5.0,1,12.5
...,...,...,...,...,...
95,96,155.6,96.0,2,58.0
96,97,156.7,97.0,1,58.5
97,98,157.8,98.0,2,59.0
98,99,158.9,99.0,1,59.5


In [None]:
# Print each column's dtype and non-null count to spot missing data and mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Record_ID         100 non-null    int64  
 1   Value_1           90 non-null     float64
 2   Value_2           94 non-null     float64
 3   Duplicate_Column  100 non-null    int64  
 4   Error_Column      100 non-null    object 
dtypes: float64(2), int64(2), object(1)
memory usage: 4.0+ KB


In [None]:
# Count the missing (NaN) entries in every column before any cleaning.
missing_per_column = df.isna().sum()
print(missing_per_column)

Record_ID            0
Value_1             10
Value_2              6
Duplicate_Column     0
Error_Column         0
dtype: int64


In [None]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated().sum()

0

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Record_ID,Value_1,Value_2,Duplicate_Column
count,100.0,90.0,94.0,100.0
mean,50.5,105.0,50.37234,1.5
std,29.011492,31.900213,29.212232,0.502519
min,1.0,51.1,1.0,1.0
25%,25.75,77.775,25.25,1.0
50%,50.5,105.0,50.5,1.5
75%,75.25,132.225,75.5,2.0
max,100.0,158.9,100.0,2.0


Handling missing values:

In [None]:
# Impute missing values in the numeric columns with that column's median.
# Non-numeric columns are skipped by numeric_only=True and are left untouched here.
column_medians = df.median(numeric_only=True)
df.fillna(value=column_medians, inplace=True)

After handling the missing values, there are no missing values left in the dataset.

In [None]:
# Re-count missing entries per column to confirm the imputation took effect.
df.isna().sum()

Unnamed: 0,0
Record_ID,0
Value_1,0
Value_2,0
Duplicate_Column,0
Error_Column,0


In [None]:
# Clean Error_Column: entries that cannot be parsed as numbers (e.g. "ERROR")
# are replaced with the mean of the valid numeric values.
# Step 1: coerce to numeric; unparseable entries become NaN.
df["Error_Column"] = pd.to_numeric(df["Error_Column"], errors="coerce")
# Step 2: fill the NaNs with the column mean (Series.mean skips NaN by default).
# The result is assigned back to the column rather than calling
# df["Error_Column"].fillna(..., inplace=True): that chained in-place form acts on
# an intermediate object, raises a FutureWarning, and stops updating df in pandas 3.0.
df["Error_Column"] = df["Error_Column"].fillna(df["Error_Column"].mean())



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Error_Column"].fillna(df["Error_Column"].mean(), inplace=True)  # Replace NaN with mean


In [None]:
# Spot-check the first rows of Error_Column after the correction step.
error_preview = df["Error_Column"].head()
print("Error Column After Correction:\n", error_preview)

Error Column After Correction:
 0    10.5
1    11.0
2    11.5
3    12.0
4    12.5
Name: Error_Column, dtype: float64


In [None]:
# Completeness: share of cells in the frame that are not missing, as a percentage.
# NOTE(review): this runs on the current state of df, so after the imputation
# steps above it will report 100% — confirm that is the intended measurement point.
missing_cells = df.isna().sum().sum()
missing_fraction = missing_cells / df.size
completeness = (1 - missing_fraction) * 100
print(f"Data Completeness: {completeness:.2f}%")

# Uniqueness: distinct non-null values per column relative to the row count.
row_count = len(df)
for col in df.columns:
    distinct_values = df[col].nunique()
    unique_percentage = distinct_values / row_count * 100
    print(f"Column {col}: {unique_percentage:.2f}% unique values")

Data Completeness: 100.00%
Column Record_ID: 100.00% unique values
Column Value_1: 91.00% unique values
Column Value_2: 95.00% unique values
Column Duplicate_Column: 2.00% unique values
Column Error_Column: 97.00% unique values


In [None]:
# Collect the rows that repeat an earlier row and report how many there are.
is_repeat = df.duplicated()
duplicate_rows = df.loc[is_repeat]
print(f"Number of duplicate rows: {len(duplicate_rows)}")


Number of duplicate rows: 0


In [None]:
# Build a de-duplicated copy of the frame, keeping the first occurrence of each
# repeated row; the original df is left unchanged.
df_cleaned = df.drop_duplicates(keep="first")
print("Duplicates removed.")



Duplicates removed.
