In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [46]:
# Load the dataset
url = "https://github.com/dsrscientist/Data-Science-ML-Capstone-Projects/raw/master/avocado.csv.zip"
df = pd.read_csv(url)

In [47]:
df

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,2018-02-04,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
18245,8,2018-01-28,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,9,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,10,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


In [49]:
# Classification Task: Predict Region
# Define target variable for classification
y_classif = df['region']

In [51]:
# Drop columns not needed for classification
X_classif = df.drop(columns=['region', 'Date'])

In [52]:
# Encode categorical variables for classification
X_classif = pd.get_dummies(X_classif, columns=['type', 'year'], drop_first=True)

In [53]:
# Split the data into training and testing sets for classification
X_train_classif, X_test_classif, y_train_classif, y_test_classif = train_test_split(X_classif, y_classif, test_size=0.2, random_state=42)

In [61]:
# Choose and train a classification model
clf = LogisticRegression()
clf.fit(X_train_classif, y_train_classif)

LogisticRegression()

In [55]:
# Make predictions for classification
y_pred_classif = clf.predict(X_test_classif)


In [56]:
y_pred_classif

array(['Southeast', 'Southeast', 'LosAngeles', ..., 'PhoenixTucson',
       'Seattle', 'Seattle'], dtype=object)

In [62]:
# Evaluate the classification model
accuracy_classif = accuracy_score(y_test_classif, y_pred_classif)
report_classif = classification_report(y_test_classif, y_pred_classif)

In [63]:
print("Classification Task Results:")
print(f'Classification Model Accuracy: {accuracy_classif}')
print(report_classif)

Classification Task Results:
Classification Model Accuracy: 0.12602739726027398
                     precision    recall  f1-score   support

             Albany       0.00      0.00      0.00        63
            Atlanta       1.00      0.01      0.03        73
BaltimoreWashington       0.51      0.43      0.46        61
              Boise       0.00      0.00      0.00        72
             Boston       0.06      0.06      0.06        67
   BuffaloRochester       0.00      0.00      0.00        73
         California       0.12      0.34      0.18        62
          Charlotte       0.00      0.00      0.00        76
            Chicago       0.62      0.60      0.61        72
   CincinnatiDayton       0.00      0.00      0.00        65
           Columbus       0.00      0.00      0.00        65
      DallasFtWorth       0.00      0.00      0.00        68
             Denver       0.07      0.45      0.13        65
            Detroit       0.38      0.18      0.24        66
    

# Regression Task: Predict Average Price

In [64]:
# Define target variable for regression
y_reg = df['AveragePrice']

In [65]:
# Drop columns not needed for regression
X_reg = df.drop(columns=['AveragePrice', 'Date'])

In [67]:
# Encode categorical variables for regression
X_reg = pd.get_dummies(X_reg, columns=['type', 'year', 'region'], drop_first=True)

In [68]:
# Split the data into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

In [69]:
# Choose and train a regression model
reg = LinearRegression()
reg.fit(X_train_reg, y_train_reg)

LinearRegression()

In [70]:
# Make predictions for regression
y_pred_reg = reg.predict(X_test_reg)

In [71]:
# Evaluate the regression model
mae_reg = mean_absolute_error(y_test_reg, y_pred_reg)
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
r2_reg = r2_score(y_test_reg, y_pred_reg)

In [72]:
print("\nRegression Task Results:")
print(f'Mean Absolute Error: {mae_reg}')
print(f'Mean Squared Error: {mse_reg}')
print(f'R-squared: {r2_reg}')


Regression Task Results:
Mean Absolute Error: 0.18797991755848695
Mean Squared Error: 0.06256052409477293
R-squared: 0.610625001382063
