# **Possum Dataset Task**

#### **Data Ingestion**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from pathlib import Path

# Fetch the possum dataset through kagglehub; the value it returns is used below
# as the location that contains the CSV file.
dataset = kagglehub.dataset_download("abrambeyer/openintro-possum")

# File name of the CSV inside the downloaded location
possum = Path('possum.csv')

# Build the full path to the CSV and read it into a DataFrame
dataset_path = Path(dataset) / possum
possum_data = pd.read_csv(filepath_or_buffer=dataset_path)

  from .autonotebook import tqdm as notebook_tqdm


#### **Preliminary Data Analysis (PDA)**

In [2]:
# Preview the first rows of the raw data to check the column names and the
# kind of values each column holds.
possum_data.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [3]:
# Preview the last rows of the raw data as a counterpart to the head() preview.
possum_data.tail()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0
103,104,7,other,f,3.0,93.6,59.9,89.0,40.0,67.6,46.0,14.8,28.5,33.5


In [4]:
# Print the column dtypes and non-null counts; columns whose non-null count is
# below the number of entries contain missing values.
possum_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
possum_data.describe()

Unnamed: 0,case,site,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
count,104.0,104.0,102.0,104.0,104.0,104.0,104.0,103.0,104.0,104.0,104.0,104.0
mean,52.5,3.625,3.833333,92.602885,56.883654,87.088462,37.009615,68.459223,48.130769,15.046154,27.0,32.586538
std,30.166206,2.349086,1.909244,3.573349,3.113426,4.310549,1.959518,4.395306,4.10938,1.050374,2.045597,2.761949
min,1.0,1.0,1.0,82.5,50.0,75.0,32.0,60.3,40.3,12.8,22.0,25.0
25%,26.75,1.0,2.25,90.675,54.975,84.0,35.875,64.6,44.8,14.4,25.5,31.0
50%,52.5,3.0,3.0,92.8,56.35,88.0,37.0,68.0,46.8,14.9,27.0,32.5
75%,78.25,6.0,5.0,94.725,58.1,90.0,38.0,72.5,52.0,15.725,28.0,34.125
max,104.0,7.0,9.0,103.1,68.6,96.5,43.0,77.9,56.2,17.8,32.0,40.0


In [6]:
# Head-count of all possums in the raw data (one row per animal)
total_case = len(possum_data["case"])
print(f"The total Animals(Possums) before cleaning: {total_case}")

# Boolean mask of the female rows; summing the mask counts its True entries
total_female = possum_data["sex"].eq('f')
print(f"The total number of female possum: {total_female.sum()}")

# Boolean mask of the male rows, counted the same way
total_male = possum_data["sex"].eq('m')
print(f"The total number of male possum: {total_male.sum()}")


The total Animals(Possums) before cleaning: 104
The total number of female possum: 43
The total number of male possum: 61


#### **Data Cleaning**

In [7]:
# Count the missing values in each column of the raw data
possum_data.isna().sum()


case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [8]:
# "age" and "footlgth" contain missing values, so drop every row that has at
# least one null and renumber the index from 0.
# dropna() is called without inplace=True on purpose: the in-place form returns
# None (so its result must not be assigned) and would overwrite the raw frame,
# making the "before cleaning" cells report cleaned numbers when re-run.
# possum_data is therefore left untouched; use clean_data from here on.
clean_data = possum_data.dropna().reset_index(drop=True)

# Sanity check: the cleaned frame must not contain any missing value
assert not clean_data.isna().any().any(), "clean_data still contains missing values"


In [9]:
# Re-count the missing values per column on the cleaned data to confirm that
# the rows with nulls have been dropped.
clean_data.isna().sum()

case        0
site        0
Pop         0
sex         0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [10]:
# Head-count of the possums that remain after cleaning (one row per animal)
total_case = len(clean_data["case"])
print(f"The total Animals(Possums) after cleaning: {total_case}")

# Boolean mask of the female rows; summing the mask counts its True entries
total_female = clean_data["sex"].eq('f')
print(f"The total number of female possum: {total_female.sum()}")

# Boolean mask of the male rows, counted the same way
total_male = clean_data["sex"].eq('m')
print(f"The total number of male possum: {total_male.sum()}")

The total Animals(Possums) after cleaning: 101
The total number of female possum: 42
The total number of male possum: 59


#### **Descriptive Analysis**

In [11]:
# Report how many rows the cleaned dataset holds
entry_count = clean_data.shape[0]
print(f"Total Entry of Datasets: {entry_count}")


Total Entry of Datasets: 101


In [12]:
# Preview the first rows of the cleaned data
clean_data.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [13]:
# Summary statistics of "age", computed separately for each sex
age_by_sex = clean_data.groupby('sex')["age"]
print(age_by_sex.describe())

     count      mean       std  min  25%  50%  75%  max
sex                                                    
f     42.0  3.952381  1.962482  1.0  3.0  3.5  5.0  9.0
m     59.0  3.728814  1.892185  1.0  2.0  3.0  5.0  8.0


In [14]:
# Summary statistics of "skullw", computed separately for each sex
skullw_by_sex = clean_data.groupby('sex')["skullw"]
print(skullw_by_sex.describe())

     count       mean       std   min    25%    50%     75%   max
sex                                                              
f     42.0  56.578571  2.599112  51.5  55.10  56.35  57.675  67.7
m     59.0  57.232203  3.411969  50.0  55.05  56.60  59.000  68.6


In [15]:
# Summary statistics of "footlgth", computed separately for each sex
footlgth_by_sex = clean_data.groupby('sex')["footlgth"]
print(footlgth_by_sex.describe())

     count       mean       std   min    25%    50%   75%   max
sex                                                            
f     42.0  69.111905  4.911321  60.3  64.85  70.45  72.8  77.9
m     59.0  67.889831  3.987683  62.0  64.50  66.50  71.5  75.0


In [16]:
# Summary statistics of "belly", computed separately for each sex
belly_by_sex = clean_data.groupby('sex')["belly"]
print(belly_by_sex.describe())

     count       mean       std   min    25%   50%   75%   max
sex                                                           
f     42.0  32.940476  2.940886  25.0  31.50  33.0  34.0  40.0
m     59.0  32.423729  2.569405  27.0  30.75  32.0  35.0  38.0


###### **Measures of Center and Spread**

In [17]:
# Plain-text summary per sex: mean/median/standard deviation of age and skull
# width, and mean/median/range of foot length.
# NOTE(review): foot length is labelled "cm" while skull width is labelled "mm"
# although both columns hold values of similar magnitude — confirm the units
# against the dataset documentation.
print('\n~~~~ VISUAL PRESENTATION ~~~~')
for gender in ["f", "m"]:
    # Keep only the rows of the current sex
    gender_data = clean_data.loc[clean_data["sex"] == gender]

    # Pull out the measured columns once so every statistic reads from the same series
    age_values = gender_data['age']
    skull_values = gender_data['skullw']
    foot_values = gender_data['footlgth']

    print(f"\nGender({gender}) (n={len(gender_data)}):")

    print("  Age:")
    print(f"    Mean: {age_values.mean():.1f} years")
    print(f"    Median: {age_values.median():.1f} years")
    print(f"    Standard Deviation: {age_values.std():.1f} years")

    print("  Skull-Width:")
    print(f"    Mean: {skull_values.mean():.1f} mm")
    print(f"    Median: {skull_values.median():.1f} mm")
    print(f"    Standard Deviation: {skull_values.std():.1f} mm")

    print("  Foot-Length:")
    print(f"    Mean: {foot_values.mean():.1f} cm")
    print(f"    Median: {foot_values.median():.1f} cm")
    # The range is printed with the raw min/max values (no rounding applied)
    print(f"    Range: {foot_values.min()} - {foot_values.max()} cm")


~~~~ VISUAL PRESENTATION ~~~~

Gender(f) (n=42):
  Age:
    Mean: 4.0 years
    Median: 3.5 years
    Standard Deviation: 2.0 years
  Skull-Width:
    Mean: 56.6 mm
    Median: 56.3 mm
    Standard Deviation: 2.6 mm
  Foot-Length:
    Mean: 69.1 cm
    Median: 70.4 cm
    Range: 60.3 - 77.9 cm

Gender(m) (n=59):
  Age:
    Mean: 3.7 years
    Median: 3.0 years
    Standard Deviation: 1.9 years
  Skull-Width:
    Mean: 57.2 mm
    Median: 56.6 mm
    Standard Deviation: 3.4 mm
  Foot-Length:
    Mean: 67.9 cm
    Median: 66.5 cm
    Range: 62.0 - 75.0 cm
