# **Possum Dataset Task**

#### **Data Injection**

In [45]:
# Import neccessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from pathlib import Path

# Loading dataset
dataset = kagglehub.dataset_download("abrambeyer/openintro-possum")
possum = Path('possum.csv')

# Full path
dataset_path = dataset / possum

possum_data = pd.read_csv(dataset_path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/abrambeyer/openintro-possum?dataset_version_number=1...


100%|██████████| 2.15k/2.15k [00:00<00:00, 2.64MB/s]

Extracting files...





#### **Preliminarly Data Analysis (PDA)**

In [48]:
# Checking the overview of the data
possum_data.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [4]:
possum_data.tail()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0
103,104,7,other,f,3.0,93.6,59.9,89.0,40.0,67.6,46.0,14.8,28.5,33.5


In [14]:
possum_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [17]:
# Determine the total number of Animals(Possums)
total_case = possum_data["case"].size

# Total number of female possums
total_female = possum_data["sex"] == 'f'
total_female.sum()

# Total number of female possums
total_male = possum_data["sex"] == 'm'
total_male.sum()
# print(f"The total Animals(Possums) before cleaning: {total_case}")


np.int64(61)

#### **Data Cleaning**

In [20]:
# Checking if there are mising values
possum_data.isna().sum()


case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [21]:
# Since there are missing values in both column "Age" and "Footlength, I need to clean it by dropping the null-values"
clean_data = possum_data.dropna(inplace=True)

# For resetting the index
clean_data = possum_data.reset_index(drop=True)


In [22]:
# Rechecking again to see if it has been dropped
clean_data.isna().sum()

case        0
site        0
Pop         0
sex         0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64

The total Animals(Possums): 101


#### **Descriptive Analysis**

In [23]:
# Taking a snapshot of the data
print(f"Total Entry of Datasets: {len(clean_data)}")


Total Entry of Datasets: 101


In [26]:
sample_data = clean_data[["sex", "age"]]
sample_data

Unnamed: 0,sex,age
0,m,8.0
1,f,6.0
2,f,6.0
3,f,6.0
4,f,2.0
...,...,...
96,m,1.0
97,m,1.0
98,f,6.0
99,m,4.0


In [44]:
f_skull_width = clean_data[clean_data["sex"] == "f"]["skullw"]
f_skull_width


1      57.6
2      60.0
3      57.1
4      56.3
5      54.8
7      57.6
8      56.3
9      58.0
10     57.2
11     55.6
16     67.7
18     55.4
19     56.3
20     58.1
22     56.1
26     54.5
28     56.0
29     54.4
31     56.7
36     54.8
38     51.5
39     55.0
41     55.5
46     56.4
47     59.6
49     58.1
53     59.4
56     57.7
58     58.0
59     56.4
60     56.5
61     57.4
62     55.8
66     56.4
70     52.0
78     54.0
79     53.8
82     53.2
84     58.0
95     56.2
98     55.0
100    59.9
Name: skullw, dtype: float64

In [None]:
f_skull_width.describe()

count    42.000000
mean     56.578571
std       2.599112
min      51.500000
25%      55.100000
50%      56.350000
75%      57.675000
max      67.700000
Name: skullw, dtype: float64