# Practise

### Dataset for practise

In [1]:
import pandas as pd
import numpy as np

np.random.seed(123)

# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

df_untidy.head()


 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,Rating: 5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083317303908cm,,banana,C,No


- Q1. Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).

In [2]:
import pandas as pd
import numpy as np

# Convert messy numerics to proper numeric
df_untidy['Height_cm'] = pd.to_numeric(df_untidy['Height_cm'].astype(str).str.replace('cm','', regex=True), errors='coerce')
df_untidy['Rating'] = pd.to_numeric(df_untidy['Rating'].astype(str).str.replace('Rating: ','', regex=True), errors='coerce')

# Fill missing values: mean for numeric, mode for categorical
for col in df_untidy.columns:
    if df_untidy[col].dtype in [np.float64, np.int64]:
        df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
    else:
        df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)

# Verify
print(df_untidy.isnull().sum())


Score        0
Height_cm    0
Rating       0
Fruit        0
Group        0
IsActive     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)


- Q2.  Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.

In [3]:
import pandas as pd
import numpy as np

# Convert messy numerics to proper numeric
df_untidy['Height_cm'] = pd.to_numeric(df_untidy['Height_cm'].astype(str).str.replace('cm','', regex=True), errors='coerce')
df_untidy['Rating'] = pd.to_numeric(df_untidy['Rating'].astype(str).str.replace('Rating: ','', regex=True), errors='coerce')

# Fill missing values: mean for numeric, mode for categorical
for col in df_untidy.columns:
    if df_untidy[col].dtype in [np.float64, np.int64]:
        df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
    else:
        df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)

# Identify categorical columns
categorical_cols = df_untidy.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical columns
df_final = pd.get_dummies(df_untidy, columns=categorical_cols, drop_first=True)

# Verify
print(df_final.head())
print(df_final.isnull().sum().sum(), "missing values remaining")


      Score  Height_cm    Rating  Fruit_banana  Fruit_grape  Fruit_nan  \
0 -1.085631  29.620189  2.000000          True        False      False   
1  0.997345  16.480034  5.000000         False        False      False   
2  0.282978  49.244711  3.022727          True        False      False   
3 -1.506295  29.620189  3.000000         False         True      False   
4 -0.578600  31.599083  3.022727          True        False      False   

   Group_B  Group_C  Group_D  IsActive_Yes  IsActive_nan  
0    False    False     True          True         False  
1    False    False    False         False         False  
2     True    False    False         False         False  
3    False    False     True         False          True  
4    False     True    False         False         False  
0 missing values remaining


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)


- Q3. Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.

In [4]:
import pandas as pd
import numpy as np

# Detect mixed-type columns
def detect_mixed_types(df):
    mixed_cols = []
    for col in df.columns:
        types = df[col].map(type).unique()
        if len(types) > 1:
            mixed_cols.append(col)
    return mixed_cols

mixed_type_cols = detect_mixed_types(df_untidy)
print("Columns with mixed types:", mixed_type_cols)

# Clean and convert mixed-type columns to numeric
for col in mixed_type_cols:
    df_untidy[col] = pd.to_numeric(
        df_untidy[col].astype(str).str.replace('cm','', regex=True)
                              .str.replace('Rating: ','', regex=True),
        errors='coerce'
    )

# Fill missing values: mean for numeric, mode for categorical
for col in df_untidy.columns:
    if df_untidy[col].dtype in [np.float64, np.int64]:
        df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
    else:
        df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)

# One-hot encode categorical columns
categorical_cols = df_untidy.select_dtypes(include=['object', 'category']).columns
df_final = pd.get_dummies(df_untidy, columns=categorical_cols, drop_first=True)

# Verify
print(df_final.head())
print("Missing values remaining:", df_final.isnull().sum().sum())


Columns with mixed types: []
      Score  Height_cm    Rating  Fruit_banana  Fruit_grape  Fruit_nan  \
0 -1.085631  29.620189  2.000000          True        False      False   
1  0.997345  16.480034  5.000000         False        False      False   
2  0.282978  49.244711  3.022727          True        False      False   
3 -1.506295  29.620189  3.000000         False         True      False   
4 -0.578600  31.599083  3.022727          True        False      False   

   Group_B  Group_C  Group_D  IsActive_Yes  IsActive_nan  
0    False    False     True          True         False  
1    False    False    False         False         False  
2     True    False    False         False         False  
3    False    False     True         False          True  
4    False     True    False         False         False  
Missing values remaining: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)


- Q4. Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# 1. Detect mixed-type columns
def detect_mixed_types(df):
    mixed_cols = []
    for col in df.columns:
        types = df[col].map(type).unique()
        if len(types) > 1:
            mixed_cols.append(col)
    return mixed_cols

mixed_type_cols = detect_mixed_types(df_untidy)
print("Columns with mixed types:", mixed_type_cols)

# 2. Clean and convert mixed-type columns to numeric
for col in mixed_type_cols:
    df_untidy[col] = pd.to_numeric(
        df_untidy[col].astype(str).str.replace('cm','', regex=True)
                              .str.replace('Rating: ','', regex=True),
        errors='coerce'
    )

# 3. Fill missing values: mean for numeric, mode for categorical
for col in df_untidy.columns:
    if df_untidy[col].dtype in [np.float64, np.int64]:
        df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
    else:
        df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)

# 4. One-hot encode categorical columns
categorical_cols = df_untidy.select_dtypes(include=['object', 'category']).columns
df_encoded = pd.get_dummies(df_untidy, columns=categorical_cols, drop_first=True)

# 5. Apply Min-Max Scaling to numeric columns
numeric_cols = df_encoded.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

# 6. Verify
print(df_encoded.head())
print("Missing values remaining:", df_encoded.isnull().sum().sum())


Columns with mixed types: []
      Score  Height_cm    Rating  Fruit_banana  Fruit_grape  Fruit_nan  \
0  0.346613   0.491200  0.250000          True        False      False   
1  0.683137   0.162150  1.000000         False        False      False   
2  0.567725   0.982630  0.505682          True        False      False   
3  0.278651   0.491200  0.500000         False         True      False   
4  0.428529   0.540755  0.505682          True        False      False   

   Group_B  Group_C  Group_D  IsActive_Yes  IsActive_nan  
0    False    False     True          True         False  
1    False    False    False         False         False  
2     True    False    False         False         False  
3    False    False     True         False          True  
4    False     True    False         False         False  
Missing values remaining: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_untidy[col].fillna(df_untidy[col].mode()[0], inplace=True)


- Q5. Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.

In [6]:
import pandas as pd
import numpy as np

def validate_dataframe(df, numeric_range=None):
    """
    Checks for inconsistencies in a DataFrame:
    - Missing values
    - Mixed data types
    - Out-of-range numeric values (optional)
    
    Parameters:
        df (pd.DataFrame): DataFrame to validate
        numeric_range (dict): Optional dict with column names as keys and (min,max) tuples as values
    
    Returns:
        None, prints a summary of issues
    """
    print("=== DataFrame Validation ===\n")
    
    # 1. Check for missing values
    missing = df.isnull().sum()
    if missing.sum() == 0:
        print("✅ No missing values found.")
    else:
        print("⚠ Missing values detected:")
        print(missing[missing > 0])
    
    # 2. Check for mixed types
    mixed_cols = []
    for col in df.columns:
        types = df[col].map(type).unique()
        if len(types) > 1:
            mixed_cols.append(col)
    if not mixed_cols:
        print("\n✅ No mixed-type columns found.")
    else:
        print("\n⚠ Mixed-type columns detected:", mixed_cols)
    
    # 3. Check for out-of-range values if numeric_range is provided
    if numeric_range:
        print("\nChecking numeric ranges...")
        for col, (min_val, max_val) in numeric_range.items():
            if col in df.columns:
                below_min = (df[col] < min_val).sum()
                above_max = (df[col] > max_val).sum()
                if below_min > 0 or above_max > 0:
                    print(f"⚠ {col}: {below_min} values below {min_val}, {above_max} values above {max_val}")
                else:
                    print(f"✅ {col} within specified range.")
    print("\nValidation complete.")

# Example usage:
# Assume df_encoded is the cleaned, numeric ML-ready DataFrame from previous steps
validate_dataframe(df_encoded, numeric_range={'Score': (0,1), 'Height_cm': (0,1), 'Rating': (0,1)})


=== DataFrame Validation ===

✅ No missing values found.

✅ No mixed-type columns found.

Checking numeric ranges...
✅ Score within specified range.
⚠ Height_cm: 0 values below 0, 1 values above 1
✅ Rating within specified range.

Validation complete.
