# Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pickle

# Reading data

In [None]:
# Load the source CSV; the path is relative to the notebook's working directory.
data_path = 'Data/hazardous_EAV2-6_32151-0002.csv'
df = pd.read_csv(data_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

### Removing leading and trailing whitespace from the 'code_new' column

In [None]:
# Trim leading/trailing whitespace from every code so that the prefix
# extracted from it later matches the mapping keys exactly.
stripped_codes = df['code_new'].str.strip()
df['code_new'] = stripped_codes
df.head()

### Creating the target variable by mapping EAV code prefixes to waste categories

In [None]:
def extract_waste_code(code_new):
    """Return the leading six characters of ``code_new``.

    For a code such as ``'EAV-200129-G'`` this yields ``'EAV-20'``, the
    prefix used as the lookup key for the waste-category mapping. Inputs
    shorter than six characters are returned unchanged.
    """
    prefix_length = 6  # 'EAV-' plus the two digits that follow it
    return code_new[0:prefix_length]

# Category mapping: six-character EAV prefix -> human-readable waste category.
category_mapping = dict([
    ('EAV-01', 'Mineral Waste'),
    ('EAV-02', 'Agricultural Waste'),
    ('EAV-03', 'Wooden Waste'),
    ('EAV-04', 'Leather & Textile Waste'),
    ('EAV-05', 'Petroleum & Gas Waste'),
    ('EAV-06', 'Inorganic Chemical Waste'),
    ('EAV-07', 'Organic Chemical Waste'),
    ('EAV-08', 'Coating & Adhesive Waste'),
    ('EAV-09', 'Photographic Industry Waste'),
    ('EAV-10', 'Thermal Waste'),
    ('EAV-11', 'Chemical Treatment Waste'),
    ('EAV-12', 'Surface Treatment of metal, plastic Waste'),
    ('EAV-13', 'Oil, liquid fuel Waste'),
    ('EAV-14', 'Organic solvents, refrigerant Waste'),
    ('EAV-15', 'Packaging, Absorbents Waste'),
    ('EAV-16', 'Other Waste'),
    ('EAV-17', 'Construction and demolition Waste'),
    ('EAV-18', 'Human & Animal Health Care Waste'),
    ('EAV-19', 'Waste Management Facilities Waste'),
    ('EAV-20', 'Municipal (Household, Similar) Waste'),
])

# Create the 'waste_category' column: take the six-character prefix of each
# code and translate it to its category name. Prefixes that are not keys of
# the mapping end up as NaN.
df['waste_category'] = df['code_new'].apply(extract_waste_code).map(category_mapping)

### Removing old 'code' column

In [None]:
# Drop the superseded 'code' column (the notebook works with 'code_new' instead).
# errors='ignore' keeps this cell re-runnable: a second execution, when the
# column is already gone, is a no-op rather than a KeyError.
df = df.drop(columns=['code'], errors='ignore')

In [11]:
# Inspect the last five rows to check the new 'waste_category' column.
df.tail(n=5)

Unnamed: 0,year,type of waste,waste producers,waste quantities,waste quantities generated by primary producers,code_new,waste_category
6795,2021,Detergents containing hazardous substances,85,1.1,0.9,EAV-200129-G,"Municipal (Household, Similar) Waste"
6796,2021,Cytotoxic and cytostatic medicines,0,0.0,0.0,EAV-200131-G,"Municipal (Household, Similar) Waste"
6797,2021,Batteries and accumulators (160601-160603),76,1.7,1.6,EAV-200133-G,"Municipal (Household, Similar) Waste"
6798,2021,"Discarded electrical equipment (ex. 200121,23)",137,27.0,8.7,EAV-200135-G,"Municipal (Household, Similar) Waste"
6799,2021,Wood containing hazardous substances,140,40.9,28.7,EAV-200137-G,"Municipal (Household, Similar) Waste"


### Splitting into train & test

In [None]:
# Feature matrix: the three columns used as model inputs.
feature_columns = [
    'waste producers',
    'waste quantities',
    'waste quantities generated by primary producers',
]
X = df[feature_columns]

# Target: waste-category names converted to integer class ids. The fitted
# encoder is kept so predictions can be decoded back to names later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['waste_category'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest Classifier

In [None]:
# Two-stage pipeline. The step names matter: the hyperparameter grid addresses
# the classifier's parameters through the 'classification__' prefix.
steps = [
    ('standardization', StandardScaler()),  # Step 1: standardize the features
    # Step 2: random forest. Seeded (same seed as the train/test split) so that
    # this fit, and the grid search that clones this pipeline, give the same
    # trees and scores on every run.
    ('classification', RandomForestClassifier(random_state=42)),
]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the baseline pipeline on the training split
pipeline.fit(X_train, y_train)

### Hyperparameter Tuning

In [None]:
# Candidate settings for the forest; the 'classification__' prefix routes each
# parameter to the pipeline step of that name.
param_grid = {
    'classification__n_estimators': [50, 75, 100],  # number of trees
    'classification__max_depth': [None, 7, 12],  # depth cap (None = no limit)
}

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the pipeline refitted with it
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Score the tuned pipeline on the held-out test split
y_test_pred = best_estimator.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test Accuracy with Best Hyperparameters: {test_accuracy}')

## Evaluating RandomForest

In [None]:
# Score the pipeline fitted in the 'Random Forest Classifier' section on the test split.
# NOTE(review): this uses `pipeline`, not the tuned `best_estimator` from the
# grid search — confirm that the baseline is the model meant to be reported here.
y_pred = pipeline.predict(X_test)

# Overall accuracy plus the per-class classification report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

### Testing on random values

In [14]:
# One hand-written sample, in training column order: waste producers,
# waste quantities, waste quantities generated by primary producers.
# It is wrapped in a DataFrame carrying the training column names because the
# pipeline was fitted on a DataFrame; a bare NumPy array makes scikit-learn warn
# that the input "does not have valid feature names".
X_new = pd.DataFrame([[72, 1.5, 1.4]], columns=X_train.columns)
pred_new = best_estimator.predict(X_new)

# Translate the predicted class id back to its category name
original_predictions = label_encoder.inverse_transform(pred_new)
print(original_predictions)

['Human & Animal Health Care Waste']




### Saving best model 

In [None]:
# Persist the tuned pipeline (scaler + forest). The context manager closes the
# file handle — and flushes the bytes to disk — as soon as the dump finishes.
# NOTE(review): the pipeline predicts integer class ids; `label_encoder` is not
# saved here, so consumers of this file need it from elsewhere to get names back.
with open('classification_model.pkl', 'wb') as model_file:
    pickle.dump(best_estimator, model_file)

# Trying other models
### Decision Tree Classifier

In [None]:
# Initialize the model
model = DecisionTreeClassifier()

# Train the model
model.fit(X_train, y_train)

### Evaluating DecisionTree

In [None]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

### XGBoost Classifier

In [None]:
# Initialize the model (XGBoost Classifier)
model3 = XGBClassifier(learning_rate=0.01, n_estimators=1000, max_depth=5, subsample=0.8, random_state=42)

# Train the model
model3.fit(X_train, y_train)

### Evaluating XGBoost

In [None]:
# Predict on the test set
y_pred3 = model3.predict(X_test)

# Evaluate the model
accuracy3 = accuracy_score(y_test, y_pred3)
report3 = classification_report(y_test, y_pred3)

print(f'Accuracy: {accuracy3}')
print(f'Classification Report:\n{report3}')