## Fish regression analysis



In [1]:
# imports

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.model_selection import train_test_split


In [2]:
# import dataset from gustavos github

username = "datagus"
repository = "ASDA2025"
directory = "datasets/Fish.csv"
github_url = f"https://raw.githubusercontent.com/{username}/{repository}/main/{directory}"
df = pd.read_csv(github_url)

## Descriptive Analysis and

In [3]:
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [4]:

print("=== Dataset Overview ===")
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
print("\nColumn names:", list(df.columns))
print("\nDataFrame info:")
df.info()

=== Dataset Overview ===
Rows: 159
Columns: 7

Column names: ['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [7]:

for col in df.columns:
    structure.append({
        "feature": col,
        "dtype": str(df[col].dtype),
        "description": "—",   # you fill manually in the report
        "n_unique": df[col].nunique(),
        "example_values": df[col].unique()[:3]
    })

import pandas as pd
structure_df = pd.DataFrame(structure)
structure_df

Unnamed: 0,feature,dtype,description,n_unique,example_values
0,Species,object,—,7,"[Bream, Roach, Whitefish]"
1,Weight,float64,—,101,"[242.0, 290.0, 340.0]"
2,Length1,float64,—,116,"[23.2, 24.0, 23.9]"
3,Length2,float64,—,93,"[25.4, 26.3, 26.5]"
4,Length3,float64,—,124,"[30.0, 31.2, 31.1]"
5,Height,float64,—,154,"[11.52, 12.48, 12.3778]"
6,Width,float64,—,152,"[4.02, 4.3056, 4.6961]"
7,Species,object,—,7,"[Bream, Roach, Whitefish]"
8,Weight,float64,—,101,"[242.0, 290.0, 340.0]"
9,Length1,float64,—,116,"[23.2, 24.0, 23.9]"


In [9]:
# Missing values
print("\nMissing values per column:")
print(df.isna().sum())

# Duplicates
print("\nNumber of duplicated rows:", df.duplicated().sum())



Missing values per column:
Species    0
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

Number of duplicated rows: 0


In [10]:
# Descriptive statistics for numeric columns

print("\n=== Descriptive Statistics — Numeric Columns ===")
numeric_stats = df.describe().T
numeric_stats



=== Descriptive Statistics — Numeric Columns ===


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Weight,159.0,398.326415,357.978317,0.0,120.0,273.0,650.0,1650.0
Length1,159.0,26.24717,9.996441,7.5,19.05,25.2,32.7,59.0
Length2,159.0,28.415723,10.716328,8.4,21.0,27.3,35.5,63.4
Length3,159.0,31.227044,11.610246,8.8,23.15,29.4,39.65,68.0
Height,159.0,8.970994,4.286208,1.7284,5.9448,7.786,12.3659,18.957
Width,159.0,4.417486,1.685804,1.0476,3.38565,4.2485,5.5845,8.142


In [11]:
# Descriptive statistics for categorical columns

print("\n=== Descriptive Statistics — Categorical Columns ===")

cat_stats = []
for col in categorical_cols:
    value_counts = df[col].value_counts()
    cat_stats.append({
        "column": col,
        "count": len(df[col]),
        "n_unique": df[col].nunique(),
        "most_frequent": value_counts.idxmax(),
        "most_frequent_count": value_counts.max(),
        "least_frequent": value_counts.idxmin(),
        "least_frequent_count": value_counts.min(),
    })

cat_stats_df = pd.DataFrame(cat_stats)
cat_stats_df


=== Descriptive Statistics — Categorical Columns ===


Unnamed: 0,column,count,n_unique,most_frequent,most_frequent_count,least_frequent,least_frequent_count
0,Species,159,7,Perch,56,Whitefish,6
