<a href="https://colab.research.google.com/github/Vaishnavi4018/tree_species_classification.py/blob/main/tree_species_classification_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the tree measurements from disk (point this at your own CSV file).
data = pd.read_csv('tree_data.csv')

# First look at the raw table: leading rows, then summary statistics
# as reported by DataFrame.describe().
print("Dataset Head:", data.head(), sep="\n")
print("\nDataset Summary:", data.describe(), sep="\n")

# The rest of the script assumes these columns exist:
#   height, diameter, leaf_type (features) and species (target).
# Cleaning step: discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')
print("\nDataset after cleaning missing values:", data.head(), sep="\n")

# Features and target
# scikit-learn's decision trees only accept numeric inputs, so any text or
# categorical feature column is one-hot encoded here (leaf_type is presumably
# such a column -- confirm against the actual CSV). pd.get_dummies passes
# numeric columns through unchanged, so an all-numeric dataset produces the
# same feature table as selecting the columns directly.
feature_columns = ['height', 'diameter', 'leaf_type']
X = pd.get_dummies(data[feature_columns], dtype=float)
y = data['species']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the decision tree and fit it on the training portion
# (fit() returns the estimator itself, so the calls can be chained).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

# Bar chart showing how many rows belong to each species.
species_counts = data['species'].value_counts()
ax = species_counts.plot(kind='bar')
ax.set_title('Tree Species Distribution')
ax.set_xlabel('Species')
ax.set_ylabel('Count')
plt.show()

# Closing note describing what this script adds.
# The literal is split across lines for readability only; adjacent string
# literals are concatenated, so the printed text is unchanged.
print(
    "\nImprovements: Implemented data cleaning to handle missing values, "
    "added a decision tree classifier for species classification, "
    "and created a bar chart to visualize species distribution for better insight."
)