# Data Ingestion

In [159]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
import seaborn as sns 

df = pd.read_csv("possum.csv")
df

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


# Preliminary Data Analysis

In [160]:
# Print the first 5 rows 
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [161]:
# Print last five rows
df.tail()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0
103,104,7,other,f,3.0,93.6,59.9,89.0,40.0,67.6,46.0,14.8,28.5,33.5


In [162]:
# Print information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [163]:
# Print the dimension of the dataframe
df.shape

(104, 14)

In [164]:
# Check for missing values in the dataframe

df.isna()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,False,False,False,False,False,False,False,False,False,False,False,False,False,False
100,False,False,False,False,False,False,False,False,False,False,False,False,False,False
101,False,False,False,False,False,False,False,False,False,False,False,False,False,False
102,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [165]:
# Check the statistical summary of the records
df.describe()

Unnamed: 0,case,site,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
count,104.0,104.0,102.0,104.0,104.0,104.0,104.0,103.0,104.0,104.0,104.0,104.0
mean,52.5,3.625,3.833333,92.602885,56.883654,87.088462,37.009615,68.459223,48.130769,15.046154,27.0,32.586538
std,30.166206,2.349086,1.909244,3.573349,3.113426,4.310549,1.959518,4.395306,4.10938,1.050374,2.045597,2.761949
min,1.0,1.0,1.0,82.5,50.0,75.0,32.0,60.3,40.3,12.8,22.0,25.0
25%,26.75,1.0,2.25,90.675,54.975,84.0,35.875,64.6,44.8,14.4,25.5,31.0
50%,52.5,3.0,3.0,92.8,56.35,88.0,37.0,68.0,46.8,14.9,27.0,32.5
75%,78.25,6.0,5.0,94.725,58.1,90.0,38.0,72.5,52.0,15.725,28.0,34.125
max,104.0,7.0,9.0,103.1,68.6,96.5,43.0,77.9,56.2,17.8,32.0,40.0


In [166]:
# Check the features in the dataframe

df.columns

Index(['case', 'site', 'Pop', 'sex', 'age', 'hdlngth', 'skullw', 'totlngth',
       'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly'],
      dtype='object')

In [167]:
# Check total number of missing values
df.isna().sum()

case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [168]:
# Drop missing values
df_age = df['age'].median()

df_footlgth = df['footlgth'].median()

In [169]:
# Fill empty value

df['age'] = df['age'].fillna(df_age)
df["footlgth"] = df['footlgth'].fillna(df_footlgth)


In [170]:
df.isna().sum()

case        0
site        0
Pop         0
sex         0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [171]:
# Check for duplicates in the data
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
99     False
100    False
101    False
102    False
103    False
Length: 104, dtype: bool

# Statistical Analysis

#### Measure of Center

In [172]:
# List of numerical features
numerical_features = ['age', 'hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']
# Define units for numerical features
units = {
    'age': 'years',
    'hdlngth': 'mm',
    'skullw': 'mm',
    'totlngth': 'mm',
    'taill': 'mm',
    'footlgth': 'mm',
    'earconch': 'mm',
    'eye': 'mm',
    'chest': 'mm',
    'belly': 'mm'
}
# Compute mean, median, and mode for each numerical feature using a for loop
for col in numerical_features:
    mean_val = df[col].mean()
    median_val = df[col].median()
    mode_val = df[col].mode().iloc[0]
    unit = units[col]
    print(f"\nThe measure of Center for {col} in {unit} is:")
    print(" ")
    print(f"  Mean : {mean_val:.2f} {unit}")
    print(" ")
    print(f"  Median : {median_val:.2f} {unit}")
    print(" ")
    print(f"  Mode : {mode_val:.2f} {unit}")
print('\n')
# List of categorical feature
categorical_features = ['site', 'Pop', 'sex']
print("\nFrequency analysis for categorical features:")
# Compute mean, median, and mode for each numerical feature using a for loop
for col in categorical_features:
    print(f"\nThe unique value for {col} is:\n", df[col].unique())
    print(f"\nThe frequency distribution for {col} is:\n", df[col].value_counts())



The measure of Center for age in years is:
 
  Mean : 3.82 years
 
  Median : 3.00 years
 
  Mode : 3.00 years

The measure of Center for hdlngth in mm is:
 
  Mean : 92.60 mm
 
  Median : 92.80 mm
 
  Mode : 93.30 mm

The measure of Center for skullw in mm is:
 
  Mean : 56.88 mm
 
  Median : 56.35 mm
 
  Mode : 57.60 mm

The measure of Center for totlngth in mm is:
 
  Mean : 87.09 mm
 
  Median : 88.00 mm
 
  Mode : 89.00 mm

The measure of Center for taill in mm is:
 
  Mean : 37.01 mm
 
  Median : 37.00 mm
 
  Mode : 38.00 mm

The measure of Center for footlgth in mm is:
 
  Mean : 68.45 mm
 
  Median : 68.00 mm
 
  Mode : 73.20 mm

The measure of Center for earconch in mm is:
 
  Mean : 48.13 mm
 
  Median : 46.80 mm
 
  Mode : 44.90 mm

The measure of Center for eye in mm is:
 
  Mean : 15.05 mm
 
  Median : 14.90 mm
 
  Mode : 14.40 mm

The measure of Center for chest in mm is:
 
  Mean : 27.00 mm
 
  Median : 27.00 mm
 
  Mode : 28.00 mm

The measure of Center for belly in mm

#### Measure of Spread


In [173]:
# Compute variance, standard deviation, Range for each numerical feature using a for loop
numerical_features = ['age', 'hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']
for col in numerical_features:
    min_val = df[col].min()
    max_val = df[col].max()
    range_value  = df[col].max()- df[col].min()
    variance_val = df[col].var()
    std_val = df[col].std()
    q25 = df[col].quantile(0.25)
    q50 = df[col].quantile(0.50)
    q75 = df[col].quantile(0.75)
    iqr_val = q75 - q25
    skewness_val = skew(df[col])
    kurtosis_val = kurtosis(df[col])
    unit = units[col]
    print(f"\nThe measure of spread for {col} in {unit} is:")
    print(" ")
    print(f"  Minimum : {min_val:.2f} {unit}")
    print(" ")
    print(f"  Maximum: {max_val:.2f} {unit}")
    print(" ")
    print(f"  Range: {range_value:.2f} {unit}")
    print(" ")
    print(f"  Variance: {variance_val:.2f} {unit}")
    print(" ")
    print(f"  standard deviation: {variance_val:.2f} {unit}")
    print(" ")
    print(f"  25th Percentile (Q1): {q25:.2f} {unit}")
    print(" ")
    print(f"  50th Percentile (Median/Q2): {q50:.2f} {unit}")
    print(" ")
    print(f"  75th Percentile (Q3): {q75:.2f} {unit}")
    print(" ")
    print(f"  IQR: {iqr_val:.2f} {unit}")
    print(" ")
    print(f"  skewness: {skewness_val:.2f} {unit}")
    # print(" ")
    print(f"  Kurtosis: {kurtosis_val:.2f} {unit}")
   


The measure of spread for age in years is:
 
  Minimum : 1.00 years
 
  Maximum: 9.00 years
 
  Range: 8.00 years
 
  Variance: 3.59 years
 
  standard deviation: 3.59 years
 
  25th Percentile (Q1): 2.75 years
 
  50th Percentile (Median/Q2): 3.00 years
 
  75th Percentile (Q3): 5.00 years
 
  IQR: 2.25 years
 
  skewness: 0.56 years
  Kurtosis: -0.27 years

The measure of spread for hdlngth in mm is:
 
  Minimum : 82.50 mm
 
  Maximum: 103.10 mm
 
  Range: 20.60 mm
 
  Variance: 12.77 mm
 
  standard deviation: 12.77 mm
 
  25th Percentile (Q1): 90.67 mm
 
  50th Percentile (Median/Q2): 92.80 mm
 
  75th Percentile (Q3): 94.72 mm
 
  IQR: 4.05 mm
 
  skewness: -0.06 mm
  Kurtosis: 0.79 mm

The measure of spread for skullw in mm is:
 
  Minimum : 50.00 mm
 
  Maximum: 68.60 mm
 
  Range: 18.60 mm
 
  Variance: 9.69 mm
 
  standard deviation: 9.69 mm
 
  25th Percentile (Q1): 54.98 mm
 
  50th Percentile (Median/Q2): 56.35 mm
 
  75th Percentile (Q3): 58.10 mm
 
  IQR: 3.12 mm
 
  ske

#### Outlier Analysis Using IQR

In [183]:
#Detect and handle outliers in numeric features using IQR
def detect_outliers_iqr(x):
    Q1 = x.quantile(0.25)
    Q3 = x.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = x[(x < lower_bound) | (x > upper_bound)]
    return {
        "outliers": outliers,
        "lower_bound": lower_bound,
        "upper_bound": upper_bound
    }
# Detect and handle outliers in numeric features using Normalization / Standardization
def detect_outliers_zscore(x, threshold=3):
    z_scores = (x - x.mean()) / x.std()
    outliers = x[abs(z_scores) > threshold]
    return {
        "outliers": outliers,
        "z_scores": z_scores,
        "threshold": threshold
    }
# Detect outliers
for col in numerical_features:
    print(f"\nIQR Outlier Detection for {col}")
    iqr_result = detect_outliers_iqr(df[col].dropna())
    print("Lower Bound:", iqr_result["lower_bound"])
    print("Upper Bound:", iqr_result["upper_bound"])
    print("Outliers:\n", iqr_result["outliers"].values)
    print(f"\nZ-Score Outlier Detection for {col}")
    zscore_result = detect_outliers_zscore(df[col].dropna())
    print("Threshold:", zscore_result["threshold"])
    print("Outliers:\n", zscore_result["outliers"].values)









IQR Outlier Detection for age
Lower Bound: -0.625
Upper Bound: 8.375
Outliers:
 [9. 9.]

Z-Score Outlier Detection for age
Threshold: 3
Outliers:
 []

IQR Outlier Detection for hdlngth
Lower Bound: 84.6
Upper Bound: 100.79999999999998
Outliers:
 [103.1 102.5  82.5]

Z-Score Outlier Detection for hdlngth
Threshold: 3
Outliers:
 []

IQR Outlier Detection for skullw
Lower Bound: 50.2875
Upper Bound: 62.7875
Outliers:
 [67.7 63.2 63.  63.2 64.2 62.8 50.  68.6]

Z-Score Outlier Detection for skullw
Threshold: 3
Outliers:
 [67.7 68.6]

IQR Outlier Detection for totlngth
Lower Bound: 75.0
Upper Bound: 99.0
Outliers:
 []

Z-Score Outlier Detection for totlngth
Threshold: 3
Outliers:
 []

IQR Outlier Detection for taill
Lower Bound: 32.6875
Upper Bound: 41.1875
Outliers:
 [32.  32.  43.  41.5]

Z-Score Outlier Detection for taill
Threshold: 3
Outliers:
 [43.]

IQR Outlier Detection for footlgth
Lower Bound: 52.875000000000014
Upper Bound: 84.27499999999999
Outliers:
 []

Z-Score Outlier Detect