In [1]:
#Avocado Project

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
url = "https://github.com/FlipRoboTechnologies/ML_-Datasets/raw/main/Avocado/avocado.csv.zip"
avocado_data = pd.read_csv(url, compression='zip')

# Display the first few rows of the dataset
print(avocado_data.head())

# Exploratory Data Analysis (EDA)
print(avocado_data.info())
print(avocado_data.describe())

# Data preprocessing
# Convert 'Date' column to datetime format
avocado_data['Date'] = pd.to_datetime(avocado_data['Date'])

# Encode categorical variable 'type'
label_encoder = LabelEncoder()
avocado_data['type_encoded'] = label_encoder.fit_transform(avocado_data['type'])

# Regression task: Predict average price
# Features for regression
X_reg = avocado_data[['Total Volume', '4046', '4225', '4770', 'type_encoded']]
y_reg = avocado_data['AveragePrice']

# Split the data into training and testing sets for regression task
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Initialize and train Linear Regression model
reg_model = LinearRegression()
reg_model.fit(X_reg_train, y_reg_train)

# Make predictions on the testing data for regression task
y_reg_pred = reg_model.predict(X_reg_test)

# Calculate evaluation metrics for regression task
reg_mae = mean_absolute_error(y_reg_test, y_reg_pred)
reg_mse = mean_squared_error(y_reg_test, y_reg_pred)
reg_rmse = np.sqrt(reg_mse)
print("Regression MAE:", reg_mae)
print("Regression RMSE:", reg_rmse)

# Classification task: Predict region
# Features for classification
X_cls = avocado_data[['Total Volume', '4046', '4225', '4770', 'AveragePrice', 'type_encoded']]
y_cls = avocado_data['Region']

# Split the data into training and testing sets for classification task
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

# Initialize and train Random Forest Classifier
cls_model = RandomForestClassifier(n_estimators=100, random_state=42)
cls_model.fit(X_cls_train, y_cls_train)

# Make predictions on the testing data for classification task
y_cls_pred = cls_model.predict(X_cls_test)

# Calculate evaluation metrics for classification task
cls_accuracy = accuracy_score(y_cls_test, y_cls_pred)
print("Classification Accuracy:", cls_accuracy)

# Print classification report and confusion matrix
print(classification_report(y_cls_test, y_cls_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_cls_test, y_cls_pred))


   Unnamed: 0        Date  AveragePrice  Total Volume     4046       4225  \
0           0  2015-12-27          1.33      64236.62  1036.74   54454.85   
1           1  2015-12-20          1.35      54876.98   674.28   44638.81   
2           2  2015-12-13          0.93     118220.22   794.70  109149.67   
3           3  2015-12-06          1.08      78992.15  1132.00   71976.41   
4           4  2015-11-29          1.28      51039.60   941.48   43838.39   

     4770  Total Bags  Small Bags  Large Bags  XLarge Bags          type  \
0   48.16     8696.87     8603.62       93.25          0.0  conventional   
1   58.33     9505.56     9408.07       97.49          0.0  conventional   
2  130.50     8145.35     8042.21      103.14          0.0  conventional   
3   72.58     5811.16     5677.40      133.76          0.0  conventional   
4   75.78     6183.95     5986.26      197.69          0.0  conventional   

   year  region  
0  2015  Albany  
1  2015  Albany  
2  2015  Albany  
3  2015 

KeyError: 'Region'