In [11]:
import urllib.request
import warnings
import zipfile

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor


In [12]:
# URL of the Avocado dataset archive (expected to contain avocado.csv)
zip_url = "https://github.com/dsrscientist/Data-Science-ML-Capstone-Projects/raw/master/avocado.csv.zip"

# Local path where the downloaded archive is stored
zip_file_path = "avocado.zip"

# Download only when no valid archive is available locally, so that
# "Restart Kernel -> Run All" does not hit the network on every run.
# zipfile.is_zipfile() returns False for a missing or truncated file,
# so an absent or interrupted download is fetched again.
if not zipfile.is_zipfile(zip_file_path):
    urllib.request.urlretrieve(zip_url, zip_file_path)

# Extract the archive contents into the current working directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall()

# Load the extracted CSV into a DataFrame
avocado_data = pd.read_csv("avocado.csv")



In [14]:
# Preview the top of the table (five rows, the default for head())
avocado_data.head(n=5)

In [15]:
# Column names, dtypes and non-null counts (info() prints directly)
avocado_data.info()

# Summary statistics. A notebook cell only renders its LAST expression, so a
# bare describe() in the middle of the cell would be computed and silently
# discarded; print it explicitly so it actually appears in the output.
print(avocado_data.describe())

# Missing values per column (last expression, so the notebook renders it)
avocado_data.isnull().sum()

In [16]:
# Example: Classification using Logistic Regression
# Predicts the avocado `type` column from price and volume columns.
# (All imports used here live in the notebook's top import cell.)

# Select features and target variable
X_classification = avocado_data[['AveragePrice', 'Total Volume', '4046', '4225', '4770']]
y_classification = avocado_data['type']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_classification, y_classification, test_size=0.2, random_state=42
)

# Standardise the features before fitting. LogisticRegression uses an iterative
# solver with a default iteration cap, and the volume columns are presumably
# orders of magnitude larger than AveragePrice (TODO confirm against describe()),
# which can make the solver stop before converging. Any such warning would be
# hidden by the global warnings filter, so the problem would go unnoticed.
# Wrapping scaler + model in one pipeline means the scaler is fitted on the
# training rows only and `classifier.predict` still accepts raw feature values.
classifier = make_pipeline(StandardScaler(), LogisticRegression())
classifier.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_result)


In [17]:
# Example: Regression using Linear Regression
# Predicts AveragePrice from the volume columns.
# train_test_split, LinearRegression and the regression metrics are already
# imported in the notebook's top import cell, so they are not re-imported here.

# Select features and target variable
X_regression = avocado_data[['Total Volume', '4046', '4225', '4770']]
y_regression = avocado_data['AveragePrice']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_regression, y_regression, test_size=0.2, random_state=42
)

# Initialize and train the regression model
regressor = LinearRegression()
regressor.fit(X_train_reg, y_train_reg)

# Make predictions on the held-out test set
y_pred_reg = regressor.predict(X_test_reg)

# Evaluate the model on the test set
mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
