# Practise

### Dataset for practise

In [5]:
import pandas as pd
import numpy as np

np.random.seed(123)

# Generate numerical columns with NaN values
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')
num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

# Generate messy categorical/text columns
cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

# Combine into DataFrame and add untidiness
df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Add untidy issues:
df_untidy.loc[df_untidy.sample(frac=0.15, random_state=1).index, 'Height_cm'] = \
    df_untidy['Height_cm'].dropna().astype(str) + 'cm'   # Mix data type in Height_cm

df_untidy.loc[df_untidy.sample(frac=0.15, random_state=2).index, 'Rating'] = \
    'Rating: ' + df_untidy['Rating'].dropna().astype(str) # Prefix string for some ratings

df_untidy.head()


 '47.7445897977863cm' '12.199264419087633cm' '28.255294989080216cm'
 '49.46319556401813cm' '24.62082473109778cm' '14.069842039264948cm'
 '30.047591231707173cm' '39.596908878071915cm' '47.903273405497885cm'
 '18.383678364591226cm' '20.573119172482315cm' '26.918929348570554cm'
 '49.60956000856316cm' '42.872544601068384cm' '47.28354972175643cm' nan
 nan '36.95711914375434cm' nan '25.124837252276944cm'
 '41.69467488795425cm' '28.67398840853555cm' '31.546329651552767cm'
 '14.629781437963878cm' '10.509502547356245cm' '18.213990989134608cm'
 '24.690622519316705cm' '23.881006122675878cm' '16.30134384410546cm' nan
 '29.66207244015815cm' nan '45.91010976816148cm' nan
 '31.599083317303908cm' '25.60501244185592cm' '43.06120649429532cm' nan
 '12.847775069640711cm' '12.664116985961948cm' '47.658679896221cm'
 '33.2449203552472cm' '39.08764020242252cm' '28.33330050691903cm'
 '22.962576174780793cm' '27.894375469792145cm' '32.038131009330215cm'
 '14.575393291450958cm' nan nan '11.712756967808463cm' nan


Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,Rating: 5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,3.0,grape,D,
4,-0.5786,31.599083317303908cm,,banana,C,No


- Q1. Identify columns with missing values and demonstrate at least two methods for imputing or filling these missing values (e.g., mean for numerics, mode for categoricals).

In [6]:
print("Missing values in each column:")
print(df_untidy.isnull().sum())
df_num_imputed = df_untidy.copy()
df_num_imputed['Score'] = df_num_imputed['Score'].fillna(df_num_imputed['Score'].mean())
df_cat_imputed = df_untidy.copy()
df_cat_imputed['Fruit'] = df_cat_imputed['Fruit'].fillna(df_cat_imputed['Fruit'].mode()[0])
print("\nAfter numeric imputation (mean for Score):")
print(df_num_imputed[['Score']].head())

print("\nAfter categorical imputation (mode for Fruit):")
print(df_cat_imputed[['Fruit']].head())


Missing values in each column:
Score        60
Height_cm    60
Rating       60
Fruit         0
Group         0
IsActive      0
dtype: int64

After numeric imputation (mean for Score):
      Score
0 -1.085631
1  0.997345
2  0.282978
3 -1.506295
4 -0.578600

After categorical imputation (mode for Fruit):
    Fruit
0  banana
1   apple
2  banana
3   grape
4  banana


- Q2.  Identify columns with non-numeric (categorical) data and convert them into a numeric format using encoding techniques such as one-hot encoding or label encoding.

In [7]:
from sklearn.preprocessing import LabelEncoder
categorical_cols = df_untidy.select_dtypes(include=['object']).columns
print("Categorical columns:", list(categorical_cols))
df_label = df_untidy.copy()
le = LabelEncoder()
for col in categorical_cols:
    df_label[col] = le.fit_transform(df_label[col].astype(str))

print("\nAfter Label Encoding:")
print(df_label.head())
df_onehot = pd.get_dummies(df_untidy, columns=categorical_cols, drop_first=True)

print("\nAfter One-Hot Encoding:")
print(df_onehot.head())


Categorical columns: ['Height_cm', 'Rating', 'Fruit', 'Group', 'IsActive']

After Label Encoding:
      Score  Height_cm  Rating  Fruit  Group  IsActive
0 -1.085631        440       1      1      3         1
1  0.997345         68       9      0      0         0
2  0.282978        428      10      1      1         0
3 -1.506295        440       2      2      3         2
4 -0.578600        249      10      1      2         0

After One-Hot Encoding:
      Score  Height_cm_10.083676047329098  Height_cm_10.166739544303041  \
0 -1.085631                         False                         False   
1  0.997345                         False                         False   
2  0.282978                         False                         False   
3 -1.506295                         False                         False   
4 -0.578600                         False                         False   

   Height_cm_10.44643539536477  Height_cm_10.56575222388172  \
0                        False   

- Q3. Detect any columns in the DataFrame that contain mixed data types (such as numbers stored as strings or strings with prefixes). Write code to clean and convert these columns to appropriate, consistent types.

In [8]:
mixed_cols = [col for col in df_untidy.columns if df_untidy[col].apply(type).nunique() > 1]
print("Columns with mixed data types:", mixed_cols)
df_clean = df_untidy.copy()
df_clean['Height_cm'] = df_clean['Height_cm'].astype(str).str.replace("cm", "", regex=False)
df_clean['Height_cm'] = pd.to_numeric(df_clean['Height_cm'], errors='coerce')
df_clean['Rating'] = df_clean['Rating'].astype(str).str.replace("Rating: ", "", regex=False)
df_clean['Rating'] = pd.to_numeric(df_clean['Rating'], errors='coerce')

print("\nAfter cleaning data types:")
print(df_clean.dtypes)
print(df_clean.head())


Columns with mixed data types: ['Height_cm', 'Rating']

After cleaning data types:
Score        float64
Height_cm    float64
Rating       float64
Fruit         object
Group         object
IsActive      object
dtype: object
      Score  Height_cm  Rating   Fruit Group IsActive
0 -1.085631        NaN     2.0  banana     D      Yes
1  0.997345  16.480034     5.0   apple     A       No
2  0.282978  49.244711     NaN  banana     B       No
3 -1.506295        NaN     3.0   grape     D      nan
4 -0.578600  31.599083     NaN  banana     C       No


- Q4. Apply scaling and/or normalization techniques (such as Min-Max Scaling and Standardization) to the numerical columns to prepare them for downstream machine learning tasks.

In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
num_cols = ['Score', 'Height_cm', 'Rating']
df_scaled = df_clean.copy()
minmax = MinMaxScaler()
df_scaled[num_cols] = minmax.fit_transform(df_scaled[num_cols])

print("\nAfter Min-Max Scaling:")
print(df_scaled[num_cols].head())
standard = StandardScaler()
df_standard = df_clean.copy()
df_standard[num_cols] = standard.fit_transform(df_standard[num_cols])

print("\nAfter Standardization:")
print(df_standard[num_cols].head())



After Min-Max Scaling:
      Score  Height_cm  Rating
0  0.346613        NaN    0.25
1  0.683137   0.162150    1.00
2  0.567725   0.982630     NaN
3  0.278651        NaN    0.50
4  0.428529   0.540755     NaN

After Standardization:
      Score  Height_cm    Rating
0 -1.074490        NaN -0.736795
1  1.021067  -1.132990  1.424470
2  0.302385   1.692095       NaN
3 -1.497695        NaN -0.016373
4 -0.564397   0.170627       NaN


- Q5. Write a function to check for and report any remaining inconsistencies (missing values, mixed types, out-of-range values) in the cleaned DataFrame. Validate that the preprocessing steps have successfully prepared the data for analysis.

def validate_dataframe(df):
    report = {}

    # missing values
    missing = df.isnull().sum()
    report['Missing Values'] = missing[missing > 0].to_dict()

    #Check mixed data types
    mixed_types = [col for col in df.columns if df[col].apply(type).nunique() > 1]
    report['Mixed Data Types'] = mixed_types

    #Check out-of-range values
    out_of_range = {}
    if 'Height_cm' in df.columns:
        out_of_range['Height_cm'] = df[df['Height_cm'] <= 0]['Height_cm'].count()
    if 'Rating' in df.columns:
        out_of_range['Rating'] = df[(df['Rating'] < 1) | (df['Rating'] > 5)]['Rating'].count()
    if 'Score' in df.columns:
        out_of_range['Score'] = df[(df['Score'] < -4) | (df['Score'] > 4)]['Score'].count()
    report['Out-of-Range Values'] = out_of_range
    if not any(report.values()):
        print("DataFrame passed validation. Ready for analysis!")
    else:
        print("Issues found in DataFrame:")
        for key, value in report.items():
            print(f"{key}: {value}")

    return report
report = validate_dataframe(df_clean)


In [10]:
def validate_dataframe(df):
    report = {}

    #missing values
    missing = df.isnull().sum()
    report['Missing Values'] = missing[missing > 0].to_dict()

    #mixed data types
    mixed_types = [col for col in df.columns if df[col].apply(type).nunique() > 1]
    report['Mixed Data Types'] = mixed_types

    #out-of-range values
    out_of_range = {}
    if 'Height_cm' in df.columns:
        out_of_range['Height_cm'] = df[df['Height_cm'] <= 0]['Height_cm'].count()
    if 'Rating' in df.columns:
        out_of_range['Rating'] = df[(df['Rating'] < 1) | (df['Rating'] > 5)]['Rating'].count()
    if 'Score' in df.columns:
        out_of_range['Score'] = df[(df['Score'] < -4) | (df['Score'] > 4)]['Score'].count()
    report['Out-of-Range Values'] = out_of_range
    if not any(report.values()):
        print("DataFrame passed validation. Ready for analysis!")
    else:
        print("Issues found in DataFrame:")
        for key, value in report.items():
            print(f"{key}: {value}")

    return report
report = validate_dataframe(df_clean)


Issues found in DataFrame:
Missing Values: {'Score': 60, 'Height_cm': 60, 'Rating': 60}
Mixed Data Types: []
Out-of-Range Values: {'Height_cm': np.int64(0), 'Rating': np.int64(0), 'Score': np.int64(0)}
