```
Hello @channel
Your task is to perform statistical analysis using this dataset.
https://www.kaggle.com/datasets/abrambeyer/openintro-possum

You are to document in detail, the interpretation of your results from all the measures of centre, all measures of spread,  and outlier detection mechanisms.
Do not do anything on Probability distribution.
Please be intuitive about the statistical insight and information you can get from your data.
You are to submit the assignment as a link to your github repository (put the .ipynb on your github) here
https://docs.google.com/spreadsheets/d/1m7bYdoATWgj2ZWUZZkxbaGe5Mmp84KjONoaDdOsYpgw/edit?usp=sharing under project 1. (edited)
```

In [188]:
import pandas as pd
from pathlib import Path

# Notebook-wide pandas display settings: floats rendered with four decimals,
# no cap on the number of columns shown, and at most 25 rows per frame.
for option_name, option_value in (
    ('display.float_format', '{:.4f}'.format),
    ('display.max_columns', None),
    ('display.max_rows', 25),
):
    pd.set_option(option_name, option_value)

# Load the possum dataset from the working directory and preview the first rows.
df = pd.read_csv('possum.csv')
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


# Measures of Center and Spread

## Center: Mean, Median, and Mode(s), if there is more than one
## Spread: Range, Variance, Standard Deviation, Q1, Q3, and IQR
## Shape: Skewness and Kurtosis

### Columns of Interest: All Numerical Columns


In [189]:
# Names of every column in `df` whose dtype is numeric.
df.select_dtypes(include='number').columns.to_list()

['case',
 'site',
 'age',
 'hdlngth',
 'skullw',
 'totlngth',
 'taill',
 'footlgth',
 'earconch',
 'eye',
 'chest',
 'belly']

In [190]:
import numpy as np

# Column names analysed by the summary and outlier cells: every numeric column.
# NOTE(review): 'case' looks like a row identifier and 'site' like a site code
# rather than measurements -- confirm whether their statistics are meaningful.
numeric_columns = list(df.select_dtypes('number').columns)

In [191]:
def _describe_series(values):
    """Return centre, spread, and shape statistics for one numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations; the caller is expected to have removed
        missing values already.

    Returns
    -------
    dict
        Keys, in order: mean, median, mode (a list, since there may be
        several), min, max, range, variance, standard_deviation, q1, q2,
        q3, q4, iqr, skewness, kurtosis. All entries except ``mode`` are
        plain Python floats.

    Notes
    -----
    ``var`` and ``std`` use the pandas defaults (sample statistics, ddof=1),
    and ``kurtosis`` is pandas' excess (Fisher) kurtosis, where a normal
    distribution scores 0.
    """
    lowest = values.min()
    highest = values.max()
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    return {
        'mean': float(values.mean()),
        'median': float(values.median()),
        'mode': values.mode().tolist(),
        'min': float(lowest),
        'max': float(highest),
        'range': float(highest - lowest),
        'variance': float(values.var()),
        'standard_deviation': float(values.std()),
        'q1': float(first_quartile),
        'q2': float(values.quantile(0.50)),
        'q3': float(third_quartile),
        'q4': float(values.quantile(1.00)),
        'iqr': float(third_quartile - first_quartile),
        'skewness': float(values.skew()),
        'kurtosis': float(values.kurtosis()),
    }


# One entry per numeric column; missing values are dropped column by column
# so a gap in one measurement does not discard the rest of that row.
summary = {
    name: _describe_series(df[name].dropna())
    for name in numeric_columns
}

pd.DataFrame(summary)

Unnamed: 0,case,site,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
mean,52.5000,3.6250,3.8333,92.6029,56.8837,87.0885,37.0096,68.4592,48.1308,15.0462,27.0000,32.5865
median,52.5000,3.0000,3.0000,92.8000,56.3500,88.0000,37.0000,68.0000,46.8000,14.9000,27.0000,32.5000
mode,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",[1],[3.0],[93.3],[57.6],[89.0],[38.0],[73.2],"[44.9, 46.0, 46.8, 52.0]","[14.4, 14.5]",[28.0],"[32.0, 33.0]"
min,1.0000,1.0000,1.0000,82.5000,50.0000,75.0000,32.0000,60.3000,40.3000,12.8000,22.0000,25.0000
max,104.0000,7.0000,9.0000,103.1000,68.6000,96.5000,43.0000,77.9000,56.2000,17.8000,32.0000,40.0000
range,103.0000,6.0000,8.0000,20.6000,18.6000,21.5000,11.0000,17.6000,15.9000,5.0000,10.0000,15.0000
variance,910.0000,5.5182,3.6452,12.7688,9.6934,18.5808,3.8397,19.3187,16.8870,1.1033,4.1845,7.6284
standard_deviation,30.1662,2.3491,1.9092,3.5733,3.1134,4.3105,1.9595,4.3953,4.1094,1.0504,2.0456,2.7619
q1,26.7500,1.0000,2.2500,90.6750,54.9750,84.0000,35.8750,64.6000,44.8000,14.4000,25.5000,31.0000
q2,52.5000,3.0000,3.0000,92.8000,56.3500,88.0000,37.0000,68.0000,46.8000,14.9000,27.0000,32.5000


# IQR Outlier Detection

**IQR method:** a value is flagged as an outlier if it lies below Q1 - 1.5×IQR or above Q3 + 1.5×IQR, where IQR = Q3 - Q1.

In [192]:

def _iqr_outlier_report(values):
    """Flag the observations of one numeric Series that fall outside the IQR fences.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations; the caller is expected to have removed
        missing values already.

    Returns
    -------
    dict
        ``iqr_outliers_count``: number of values strictly below
        Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR.
        ``outlier_values``: those values as a comma-separated string in
        their original order, or ``'NONE'`` when nothing is flagged.
    """
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    # Strict comparisons: a value sitting exactly on a fence is not an outlier.
    outside = values.lt(lower_fence) | values.gt(upper_fence)
    flagged = values[outside].astype('str').tolist()

    return {
        'iqr_outliers_count': int(outside.sum()),
        'outlier_values': ', '.join(flagged) if flagged else 'NONE',
    }


# One report per numeric column, with missing values dropped column by column.
outlier_summary = {
    name: _iqr_outlier_report(df[name].dropna())
    for name in numeric_columns
}

pd.DataFrame(outlier_summary).T

Unnamed: 0,iqr_outliers_count,outlier_values
case,0,NONE
site,0,NONE
age,0,NONE
hdlngth,3,"103.1, 102.5, 82.5"
skullw,8,"67.7, 63.2, 63.0, 63.2, 64.2, 62.8, 50.0, 68.6"
totlngth,0,NONE
taill,4,"32.0, 32.0, 43.0, 41.5"
footlgth,0,NONE
earconch,0,NONE
eye,1,17.8
