In [28]:
import pandas as pd
import numpy as np

np.random.seed(123)

# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

# Clean 'Height_cm' and 'Rating' columns
df_untidy['Height_cm'] = df_untidy['Height_cm'].astype(str).str.replace('cm', '', regex=False)
df_untidy['Height_cm'] = pd.to_numeric(df_untidy['Height_cm'], errors='coerce')

df_untidy['Rating'] = df_untidy['Rating'].astype(str).str.replace('Rating: ', '', regex=False)
df_untidy['Rating'] = pd.to_numeric(df_untidy['Rating'], errors='coerce')


df_untidy.head()

 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083,,banana,C,No


In [42]:
print("1.Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).")
print("Missing values before imputation:")
print(df_untidy.isnull().sum())
print("\n")
df_untidy['Score'] = df_untidy['Score'].fillna(df_untidy['Score'].mean())
df_untidy['Fruit'] = df_untidy['Fruit'].fillna(df_untidy['Fruit'].mode()[0])
df_untidy['IsActive'] = df_untidy['IsActive'].fillna(df_untidy['IsActive'].mode()[0])
print("Missing values after initial imputation:")
print(df_untidy.isnull().sum())

1.Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).
Missing values before imputation:
Score            0
Height_cm        0
Rating           0
Fruit            0
Group            0
IsActive         0
Group_encoded    0
dtype: int64


Missing values after initial imputation:
Score            0
Height_cm        0
Rating           0
Fruit            0
Group            0
IsActive         0
Group_encoded    0
dtype: int64


In [45]:
print("2.Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.")
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_untidy['Group_encoded'] = le.fit_transform(df_untidy['Group'])
df = pd.get_dummies(df_untidy, columns=['Fruit', 'IsActive'], drop_first=True)
print("DataFrame after encoding:")
print(df.head())

2.Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.
DataFrame after encoding:
      Score  Height_cm    Rating Group  Group_encoded  Fruit_banana  \
0 -1.085631  29.620189  2.000000     D              3          True   
1  0.997345  16.480034  5.000000     A              0         False   
2  0.282978  49.244711  3.022727     B              1          True   
3 -1.506295  29.620189  3.000000     D              3         False   
4 -0.578600  31.599083  3.022727     C              2          True   

   Fruit_grape  Fruit_nan  IsActive_Yes  IsActive_nan  
0        False      False          True         False  
1        False      False         False         False  
2        False      False         False         False  
3         True      False         False          True  
4        False      False         False         False  


In [51]:
print("3.Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.")
df_untidy['Height_cm'] = df_untidy['Height_cm'].astype(str).str.replace('cm', '', regex=False)
df_untidy['Height_cm'] = pd.to_numeric(df_untidy['Height_cm'], errors='coerce')
df_untidy['Rating'] = df_untidy['Rating'].astype(str).str.replace('Rating: ', '', regex=False)
df_untidy['Rating'] = pd.to_numeric(df_untidy['Rating'], errors='coerce')
df_untidy['Height_cm'] = df_untidy['Height_cm'].fillna(df_untidy['Height_cm'].mean())
df_untidy['Rating'] = df_untidy['Rating'].fillna(df_untidy['Rating'].mean())

print(df_untidy.head())

3.Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.
      Score  Height_cm    Rating   Fruit Group IsActive  Group_encoded
0 -1.085631  29.620189  2.000000  banana     D      Yes              3
1  0.997345  16.480034  5.000000   apple     A       No              0
2  0.282978  49.244711  3.022727  banana     B       No              1
3 -1.506295  29.620189  3.000000   grape     D      nan              3
4 -0.578600  31.599083  3.022727  banana     C       No              2


In [54]:
print("4.Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.")
from sklearn.preprocessing import MinMaxScaler, StandardScaler
numerical_cols = ['Score', 'Height_cm', 'Rating']
df_to_scale = df_untidy[numerical_cols].copy()
scaler_minmax = MinMaxScaler()
df_untidy['Score_minmax_scaled'] = scaler_minmax.fit_transform(df_to_scale[['Score']])
scaler_standard = StandardScaler()
df_untidy[['Height_cm_standard_scaled', 'Rating_standard_scaled']] = scaler_standard.fit_transform(df_to_scale[['Height_cm', 'Rating']])
print("DataFrame after scaling (showing scaled columns):")
print(df_untidy[['Score_minmax_scaled', 'Height_cm_standard_scaled', 'Rating_standard_scaled']].head())

4.Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.
DataFrame after scaling (showing scaled columns):
   Score_minmax_scaled  Height_cm_standard_scaled  Rating_standard_scaled
0             0.346613               3.265462e-16               -0.785426
1             0.683137              -1.207772e+00                1.518490
2             0.567725               1.803780e+00                0.000000
3             0.278651               3.265462e-16               -0.017454
4             0.428529               1.818893e-01                0.000000


In [56]:
def validate_data_quality(df_untidy):
    print("5.Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.")
    missing_values = df_untidy.isnull().sum()
    print("Remaining missing values per column:")
    print(missing_values[missing_values > 0])
    print("\n")
    print("Checking for mixed data types in numerical columns:")
    mixed_types = {}
    for col in df_untidy.columns:
        if pd.api.types.is_numeric_dtype(df_untidy[col]):
            if (df_untidy[col].dropna().apply(type).nunique() > 1):
                mixed_types[col] = df_untidy[col].apply(type).unique()
    if mixed_types:
        print("Mixed data types found in the following columns:", mixed_types)
    else:
        print("No mixed data types found in numerical columns.")
    print("\n")
    print("Checking for out-of-range values (e.g., negative height or rating < 0):")
    out_of_range = {}
    if 'Height_cm' in df_untidy.columns and (df_untidy['Height_cm'] < 0).any():
        out_of_range['Height_cm'] = 'Contains negative values.'
    if 'Rating' in df_untidy.columns and (df_untidy['Rating'] < 1).any():
        out_of_range['Rating'] = 'Contains values less than 1.'
    if out_of_range:
        print("Out-of-range values found:", out_of_range)
    else:
        print("No out-of-range values found.")
    print("\n")
    print("Data validation complete. The dataset is now clean and ready for analysis." if not missing_values.any() and not mixed_types and not out_of_range else "Inconsistencies remain.")
validate_data_quality(df_untidy)

5.Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.
Remaining missing values per column:
Series([], dtype: int64)


Checking for mixed data types in numerical columns:
No mixed data types found in numerical columns.


Checking for out-of-range values (e.g., negative height or rating < 0):
No out-of-range values found.


Data validation complete. The dataset is now clean and ready for analysis.
