In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Loading the dataset
def load_data(filepath='7.csv'):
    df = pd.read_csv(filepath)
    
    # Basic dataset information
    print("Dataset Overview:")
    print("-----------------")
    print(f"Total Samples: {len(df)}")
    print(f"Columns: {list(df.columns)}")
    print("\nMissing Values:")
    print(df.isnull().sum())
    
    # Summary statistics
    print("\nSummary Statistics:")
    print(df.describe())
    
    # Species distribution
    print("\nSpecies Distribution:")
    print(df['Species'].value_counts())
    
    return df

In [3]:
# Data preprocessing
def preprocess_data(df):
    """
    Perform data preprocessing steps
    """
    # Remove any rows with missing values
    df_cleaned = df.dropna()
    
    # Add W/L ratio column
    df_cleaned['W_L_Ratio'] = df_cleaned['Weight'] / df_cleaned['Length']
    
    return df_cleaned

In [4]:
# Length-Weight Relationship Analysis
def length_weight_regression(df):
    """
    Perform length-weight regression for each species
    """
    results = {}
    
    for species in df['Species'].unique():
        species_data = df[df['Species'] == species]
        
        # Log-transformed regression
        x = np.log(species_data['Length'])
        y = np.log(species_data['Weight'])
        
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
        
        results[species] = {
            'a': np.exp(intercept),  # Back-transform intercept
            'b': slope,
            'r_squared': r_value**2,
            'p_value': p_value
        }
    
    # Create results DataFrame
    results_df = pd.DataFrame.from_dict(results, orient='index')
    results_df.index.name = 'Species'
    results_df.reset_index(inplace=True)
    
    print("\nLength-Weight Regression Results:")
    print(results_df)
    
    return results_df

In [5]:
# Visualization functions
def create_visualizations(df):
    """
    Create comprehensive visualizations
    """
    plt.figure(figsize=(15, 10))
    
    # 1. Length-Weight Scatter Plot
    plt.subplot(2, 2, 1)
    sns.scatterplot(data=df, x='Length', y='Weight', hue='Species')
    plt.title('Length vs Weight by Species')
    
    # 2. Box Plot of Weights
    plt.subplot(2, 2, 2)
    sns.boxplot(data=df, x='Species', y='Weight')
    plt.title('Weight Distribution by Species')
    plt.xticks(rotation=45)
    
    # 3. Histogram of Lengths
    plt.subplot(2, 2, 3)
    sns.histplot(data=df, x='Length', hue='Species', multiple='stack')
    plt.title('Length Distribution by Species')
    
    # 4. W/L Ratio Comparison
    plt.subplot(2, 2, 4)
    sns.boxplot(data=df, x='Species', y='W_L_Ratio')
    plt.title('Weight/Length Ratio by Species')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.savefig('fish_analysis_visualizations.png')
    plt.close()

In [6]:
# Classification Model
def build_classification_model(df):
    """
    Build a Random Forest Classifier to predict species
    """
    # Prepare features and target
    X = df[['Length', 'Weight', 'W_L_Ratio']]
    y = df['Species']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train Random Forest Classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = rf_classifier.predict(X_test_scaled)
    
    # Model Evaluation
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Feature Importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_classifier.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance)
    
    return rf_classifier, feature_importance

In [7]:
# Main execution
def main():
    # Load and preprocess data
    df = load_data()
    df_cleaned = preprocess_data(df)
    
    # Perform analysis
    regression_results = length_weight_regression(df_cleaned)
    create_visualizations(df_cleaned)
    classification_model, feature_importance = build_classification_model(df_cleaned)

In [8]:
if __name__ == '__main__':
    main()

Dataset Overview:
-----------------
Total Samples: 4080
Columns: ['species', 'length', 'weight', 'w_l_ratio']

Missing Values:
species      0
length       0
weight       0
w_l_ratio    0
dtype: int64

Summary Statistics:
            length       weight    w_l_ratio
count  4080.000000  4080.000000  4080.000000
mean     17.353544     3.739875     0.252782
std       7.114684     1.040365     0.123046
min       6.360000     2.050000     0.080000
25%      11.327500     3.070000     0.170000
50%      17.350000     3.310000     0.190000
75%      22.585000     4.100000     0.340000
max      33.860000     6.290000     0.640000

Species Distribution:


KeyError: 'Species'