In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pickle
import numpy as np


In [2]:
Fulldf = pd.read_csv('waste_data.csv')

In [3]:
Fulldf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24108 entries, 0 to 24107
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   date               24108 non-null  int64
 1   month              24108 non-null  int64
 2   year               24108 non-null  int64
 3   Load_Type          24108 non-null  int64
 4   Total_Load_Weight  24108 non-null  int64
dtypes: int64(5)
memory usage: 941.8 KB


In [4]:
Fulldf.isnull().sum()
Fulldf = Fulldf.dropna()
Fulldf.describe(include='all')

Unnamed: 0,date,month,year,Load_Type,Total_Load_Weight
count,24108.0,24108.0,24108.0,24108.0,24108.0
mean,15.730463,6.522648,2015.000747,3.5,29.951178
std,8.800623,3.448852,3.16254,1.707861,15.789526
min,1.0,1.0,2010.0,1.0,8.0
25%,8.0,4.0,2012.0,2.0,18.0
50%,16.0,7.0,2015.0,3.5,26.0
75%,23.0,10.0,2018.0,5.0,36.0
max,31.0,12.0,2020.0,6.0,85.0


In [5]:
Fulldf.head()

Unnamed: 0,date,month,year,Load_Type,Total_Load_Weight
0,1,1,2010,1,32
1,2,1,2010,1,35
2,3,1,2010,1,34
3,4,1,2010,1,34
4,5,1,2010,1,26


In [6]:
Fulldf = pd.DataFrame(Fulldf)


In [7]:
Fulldf.corr()

Unnamed: 0,date,month,year,Load_Type,Total_Load_Weight
date,1.0,0.01039798,0.0003559882,-1.695122e-15,-0.000317
month,0.01039798,1.0,-0.0003096076,-2.920253e-15,-0.002441
year,0.0003559882,-0.0003096076,1.0,2.948351e-12,0.361733
Load_Type,-1.695122e-15,-2.920253e-15,2.948351e-12,1.0,-0.643129
Total_Load_Weight,-0.0003174006,-0.002440532,0.3617328,-0.6431294,1.0


In [8]:
X = Fulldf.drop('Total_Load_Weight', axis=1).values
Y = Fulldf['Total_Load_Weight'].values


In [9]:
x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size=0.2)


In [10]:
model=DecisionTreeClassifier(criterion='gini' 
                            , max_depth= 20
                            , max_features='auto' 
                            , min_samples_leaf= 2
                            , min_samples_split=2)
model.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=100, max_features='auto', min_samples_split=5)

In [17]:
# Writing different model files to file
with open( 'wasteModel.sav', 'wb') as f:
    pickle.dump(model,f)

In [10]:
year = int(input("Input Year: "))
date = input("Input date ")
month = input("Input Month : ")
ty = str(input("input type in block capitals: "))
prediction = model.predict([[date , month , year , ty]])
print(prediction)

[19]


  return f(*args, **kwargs)


# Hyper parameter Tuning

In [13]:
#create the data
X = Fulldf.drop('Total_Load_Weight', axis=1).values# Input features (attributes)
y = Fulldf['Total_Load_Weight'].values # Target vector
print('X shape: {}'.format(np.shape(X)))
print('y shape: {}'.format(np.shape(y)))

X shape: (24108, 4)
y shape: (24108,)


In [14]:
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10,20]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [15]:
# Create the param grid
param_grid = { 'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(param_grid)

{'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [16]:
#Instantiate and fit model
model = DecisionTreeClassifier()

In [18]:
rf_Grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = 20, verbose=2, n_jobs = 4)

In [19]:
rf_Grid.fit(x_train, y_train)

Fitting 20 folds for each of 16 candidates, totalling 320 fits




GridSearchCV(cv=20, estimator=DecisionTreeClassifier(), n_jobs=4,
             param_grid={'max_depth': [10, 20],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5]},
             verbose=2)

In [20]:
rf_Grid.best_params_

{'max_depth': 20,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2}