In [12]:
import numpy as np
import pandas as pd

# Load the dataset
file_path = 'Species.csv'  # Adjust to the correct file path
data = pd.read_csv(file_path)

data.head()

Unnamed: 0.1,Unnamed: 0,Species Name,Common Name,Type,Location(s),Estimated Population,Threats
0,0,Abies beshanzuensis,Baishan fir,Plant (Tree),"Baishanzu Mountain, Zhejiang, China",5 mature individuals,"Agriculture, fire"
1,1,Actinote zikani,-,Insect,"Near São Paulo, Atlantic forest, Brazil",Unknown,Habitat loss from human expansion
2,2,Aipysurus foliosquama,Leaf scaled sea-snake,Reptile,"Ashmore Reef and Hibernia Reef, Timor Sea",Unknown,Unknown—probably degradation of coral reef hab...
3,3,Amanipodagrion gilliesi,Amani flatwing,Insect,"Amani-Sigi Forest, Usambara Mountains, Tanzania",< 500 individuals,"Population pressure, water pollution"
4,4,Antisolabis seychellensis,-,Insect,"Morne Blanc, Mahé island, Seychelles",Unknown,"Invasive species, climate change"


In [13]:

# Data Cleaning and Preprocessing
def clean_population(pop):
    try:
        if "Unknown" in pop:
            return np.nan  # Treat unknown as missing
        elif "–" in pop:  # Handle ranges like "70–400"
            low, high = map(int, pop.split("–"))
            return (low + high) // 2  # Return midpoint
        elif "<" in pop:
            return int(pop.split()[1]) // 2  # Estimate midpoint for ranges like "< 500"
        elif ">" in pop:
            return int(pop.split()[1]) * 2  # Double the lower bound for ">" cases
        elif "mature individuals" in pop or "individuals" in pop:
            return int(pop.split()[0])  # Extract number from phrases like "5 mature individuals"
        else:
            return np.nan  # Handle other ambiguous cases
    except Exception:
        return np.nan  # Handle unexpected formats gracefully


In [14]:

data['Estimated Population'] = data['Estimated Population'].apply(clean_population)

# Fill missing values in 'Threats'
data['Threats'] = data['Threats'].fillna("Unknown")

# Encode 'Threats' using simple keyword presence
keywords = ['habitat loss', 'invasive species', 'pollution', 'climate change', 'fire', 'population pressure']
for keyword in keywords:
    data[keyword] = data['Threats'].apply(lambda x: 1 if keyword in x.lower() else 0)

# One-hot encode 'Type'
types = data['Type'].unique()
for t in types:
    data[t] = data['Type'].apply(lambda x: 1 if x == t else 0)


data.head()

Unnamed: 0.1,Unnamed: 0,Species Name,Common Name,Type,Location(s),Estimated Population,Threats,habitat loss,invasive species,pollution,...,Fungi,Plant (Flower),Mollusc,Mollusc (Land snail),Crustacean,Mollusc (Snail),Insect (Damselfly),Insect (Butterfly),Spider,Plant (Orchid)
0,0,Abies beshanzuensis,Baishan fir,Plant (Tree),"Baishanzu Mountain, Zhejiang, China",5.0,"Agriculture, fire",0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Actinote zikani,-,Insect,"Near São Paulo, Atlantic forest, Brazil",,Habitat loss from human expansion,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Aipysurus foliosquama,Leaf scaled sea-snake,Reptile,"Ashmore Reef and Hibernia Reef, Timor Sea",,Unknown—probably degradation of coral reef hab...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Amanipodagrion gilliesi,Amani flatwing,Insect,"Amani-Sigi Forest, Usambara Mountains, Tanzania",250.0,"Population pressure, water pollution",0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,4,Antisolabis seychellensis,-,Insect,"Morne Blanc, Mahé island, Seychelles",,"Invasive species, climate change",0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:

# Prepare features and targets
features = keywords + list(types)
X = data[features].fillna(0).values  # Features
y_regression = data['Estimated Population'].fillna(0).values  # Regression target

# Split data into training and testing sets (80-20 split)
train_size = int(0.8 * len(data))
X_train, X_test = X[:train_size], X[train_size:]
y_reg_train, y_reg_test = y_regression[:train_size], y_regression[train_size:]


print(X_train,X_test)

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0

In [16]:

# Regression Model
class LinearRegressionManual:
    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y, lr=0.01, epochs=1000):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(epochs):
            # Predict
            y_pred = np.dot(X, self.weights) + self.bias

            # Compute gradients
            dw = -(2 / n_samples) * np.dot(X.T, (y - y_pred))
            db = -(2 / n_samples) * np.sum(y - y_pred)

            # Update parameters
            self.weights -= lr * dw
            self.bias -= lr * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias



In [17]:

# Train Regression Model
reg_model = LinearRegressionManual()
reg_model.fit(X_train, y_reg_train, lr=0.01, epochs=1000)

# Predictions
y_reg_pred = reg_model.predict(X_test)

# Evaluate Regression Model (Mean Squared Error)
mse = np.mean((y_reg_pred - y_reg_test) ** 2)
print("Regression Model - Mean Squared Error:", mse)

# Display a few predictions for review
print("Sample Predictions:")
for actual, predicted in zip(y_reg_test[:10], y_reg_pred[:10]):
    print(f"Actual: {actual}, Predicted: {predicted:.2f}")


Regression Model - Mean Squared Error: 1766.9658198799566
Sample Predictions:
Actual: 0.0, Predicted: 43.12
Actual: 50.0, Predicted: 101.69
Actual: 0.0, Predicted: 5.93
Actual: 0.0, Predicted: 47.36
Actual: 0.0, Predicted: 15.22
Actual: 0.0, Predicted: 47.36
Actual: 0.0, Predicted: 45.68
Actual: 0.0, Predicted: 22.76
Actual: 3.0, Predicted: 60.19
Actual: 50.0, Predicted: 47.36
