In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [54]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset/100_Sales.csv')

In [55]:
print(df.head())
print(df.info())

                              Region                Country        Item_Type  \
0              Australia and Oceania                 Tuvalu        Baby Food   
1  Central America and the Caribbean                Grenada           Cereal   
2                             Europe                 Russia  Office Supplies   
3                 Sub_Saharan Africa  Sao Tome and Principe           Fruits   
4                 Sub_Saharan Africa                 Rwanda  Office Supplies   

  Sales_Channel Order_Priority   Ship_Date  Unit_Cost  Total_Revenue  \
0       Offline              H  27/06/2010     159.42     2533654.00   
1        Online              C  15/09/2012     117.11      576782.80   
2       Offline              L  05/08/2014     524.96     1158502.59   
3        Online              C  07/05/2014       6.92       75591.66   
4       Offline              L  02/06/2013     524.96     3296425.02   

   Total_Profit  Unnamed: 9  Unnamed: 10  
0     951410.50         NaN          NaN  


In [56]:
column = ['Region', 'Country', 'Item_Type', 'Sales_Channel', 'Order_Priority', 'Ship_Date', 'Unit_Cost', 'Total_Revenue', 'Total_Profit']
df = df[column]

In [57]:
df.head()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Ship_Date,Unit_Cost,Total_Revenue,Total_Profit
0,Australia and Oceania,Tuvalu,Baby Food,Offline,H,27/06/2010,159.42,2533654.0,951410.5
1,Central America and the Caribbean,Grenada,Cereal,Online,C,15/09/2012,117.11,576782.8,248406.36
2,Europe,Russia,Office Supplies,Offline,L,05/08/2014,524.96,1158502.59,224598.75
3,Sub_Saharan Africa,Sao Tome and Principe,Fruits,Online,C,07/05/2014,6.92,75591.66,19525.82
4,Sub_Saharan Africa,Rwanda,Office Supplies,Offline,L,02/06/2013,524.96,3296425.02,639077.5


In [58]:
df['Profit_Category'] = pd.qcut(df['Total_Profit'], q=3, labels=['Low', 'Medium', 'High'])
df

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Ship_Date,Unit_Cost,Total_Revenue,Total_Profit,Profit_Category
0,Australia and Oceania,Tuvalu,Baby Food,Offline,H,27/06/2010,159.42,2533654.00,951410.50,High
1,Central America and the Caribbean,Grenada,Cereal,Online,C,15/09/2012,117.11,576782.80,248406.36,Medium
2,Europe,Russia,Office Supplies,Offline,L,05/08/2014,524.96,1158502.59,224598.75,Medium
3,Sub_Saharan Africa,Sao Tome and Principe,Fruits,Online,C,07/05/2014,6.92,75591.66,19525.82,Low
4,Sub_Saharan Africa,Rwanda,Office Supplies,Offline,L,02/06/2013,524.96,3296425.02,639077.50,High
...,...,...,...,...,...,...,...,...,...,...
95,Sub_Saharan Africa,Mali,Clothes,Online,M,09/03/2011,35.84,97040.64,65214.72,Low
96,Asia,Malaysia,Fruits,Offline,L,28/12/2011,6.92,58471.11,15103.47,Low
97,Sub_Saharan Africa,Sierra Leone,Vegetables,Offline,C,29/06/2016,90.93,228779.10,93748.05,Low
98,North America,Mexico,Personal Care,Offline,M,08/08/2015,56.67,471336.91,144521.02,Low


In [60]:
label_encoders = {}
for column in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [61]:
X = df.drop("Profit_Category", axis=1)
y = df["Profit_Category"]

In [62]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

In [65]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [66]:
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Decision Tree Classification Report:\n", classification_report(y_test, dt_pred))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))

Decision Tree Accuracy: 0.95
Decision Tree Classification Report:
               precision    recall  f1-score   support

        High       0.90      1.00      0.95         9
         Low       1.00      1.00      1.00         5
      Medium       1.00      0.83      0.91         6

    accuracy                           0.95        20
   macro avg       0.97      0.94      0.95        20
weighted avg       0.96      0.95      0.95        20

Decision Tree Confusion Matrix:
 [[9 0 0]
 [0 5 0]
 [1 0 5]]


In [67]:
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))

Random Forest Accuracy: 0.95
Random Forest Classification Report:
               precision    recall  f1-score   support

        High       0.90      1.00      0.95         9
         Low       1.00      1.00      1.00         5
      Medium       1.00      0.83      0.91         6

    accuracy                           0.95        20
   macro avg       0.97      0.94      0.95        20
weighted avg       0.96      0.95      0.95        20

Random Forest Confusion Matrix:
 [[9 0 0]
 [0 5 0]
 [1 0 5]]


In [68]:
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [69]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

In [70]:
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)
print("Best Decision Tree Parameters:", dt_grid_search.best_params_)
print("Best Decision Tree Accuracy:", dt_grid_search.best_score_)

Best Decision Tree Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Decision Tree Accuracy: 0.95


In [71]:
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest Accuracy:", rf_grid_search.best_score_)

Best Random Forest Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Random Forest Accuracy: 0.95


540 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

In [72]:
best_dt_model = dt_grid_search.best_estimator_
best_rf_model = rf_grid_search.best_estimator_

In [73]:
best_dt_pred = best_dt_model.predict(X_test)
best_rf_pred = best_rf_model.predict(X_test)

In [74]:
print("Final Decision Tree Accuracy:", accuracy_score(y_test, best_dt_pred))
print("Final Random Forest Accuracy:", accuracy_score(y_test, best_rf_pred))

Final Decision Tree Accuracy: 0.95
Final Random Forest Accuracy: 0.9
