In [18]:
import logging
logging.basicConfig(filename="logging.log",level=logging.DEBUG, encoding='utf-8',format="%(asctime)s %(levelname)s %(message)s")

In [19]:
# sqlcommand >> csv >> save >> database(dataastra)
logging.info("Connecting Database")
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json
cloud_config= {
  'secure_connect_bundle': 'secure-connect-db-mushroom.zip'
}
with open("../db_mushroom-token.json") as f:
    secrets = json.load(f)

CLIENT_ID = secrets["clientId"]
CLIENT_SECRET = secrets["secret"]

auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect('keyspace_mushroom')
logging.info("Database Connection done...")

In [None]:
logging.info("Trying to Insert Mushroom dataset into database")
from uuid import uuid1
table_name = "mushrooms"
column_names=['class','cap_shape','cap_surface','cap_color','bruises','odor','gill_attachment','gill_spacing','gill_size','gill_color','stalk_shape','stalk_root','stalk_surface_above_ring','stalk_surface_below_ring','stalk_color_above_ring','stalk_color_below_ring','veil_type','veil_color','ring_number','ring_type','spore_print_color','population','habitat']
column_names=sorted(column_names)
column_names.insert(0,'id')
import csv
# Open and process CSV file
with open("mushrooms.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    if next(reader, None):  # Skip header row (if present)
        for row in reader:
            # Prepare data (convert types if necessary)
            row.insert(0,uuid1())
            # Build the CQL insert statement (dynamically)
            insert_query = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))})"

            # Prepare and execute the insert
            prepared_statement = session.prepare(insert_query)
            bound_statement = prepared_statement.bind(row)
            # for i, value in enumerate(row):
            #     print(i)
            #     print(value)
            #     bound_statement.set_value(i, value)
            session.execute(bound_statement)

# Close connections
session.shutdown()
logging.info("Uploading Done....")

In [22]:

# Loading the dataset
logging.info("Loading the Mushroom dataset")

import pandas as pd
try:
    df=pd.read_csv("mushrooms.csv",header=0)  # Loading this mushrooms.csv file
except Exception as e:
    logging.error(f"Catch an error while loading the Dataset {e}")
logging.info("Done Loading...")
df


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [23]:
df.iloc[1,1:]

cap-shape                   x
cap-surface                 s
cap-color                   y
bruises                     t
odor                        a
gill-attachment             f
gill-spacing                c
gill-size                   b
gill-color                  k
stalk-shape                 e
stalk-root                  c
stalk-surface-above-ring    s
stalk-surface-below-ring    s
stalk-color-above-ring      w
stalk-color-below-ring      w
veil-type                   p
veil-color                  w
ring-number                 o
ring-type                   p
spore-print-color           n
population                  n
habitat                     g
Name: 1, dtype: object

In [24]:
try:
    logging.info("Trying to map characters into numeric values")
    df['cap-shape']=df['cap-shape'].map({'b':1,'c':2,'x':3,'f':4,'k':5,'s':6})
    df['cap-surface']=df['cap-surface'].map({'f':1,'g':2,'y':3,'s':4})
    df['cap-color']=df['cap-color'].map({'n':1,'b':2,'c':3,'g':4,'r':5,'p':6,'u':7,'e':8,'w':9,'y':10})
    df['odor']=df['odor'].map({'a':1,'l':2,'c':3,'y':4,'f':5,'m':6,'n':7,'p':8,'s':9})
    df['bruises']=df['bruises'].map({'t':1,'f':2,})
    df['gill-attachment']=df['gill-attachment'].map({'a':1,'d':2,'f':3,'n':4})
    df['class']=df['class'].map({'p':1,'e':2,})
    df['gill-spacing']=df['gill-spacing'].map({'c':1,'w':2,'d':3})
    df['gill-size']=df['gill-size'].map({'b':1,'n':2,})
    df['gill-color']=df['gill-color'].map({'k':1,'n':2,'b':3,'h':4,'g':5,'r':6,'o':7,'p':8,'u':9,'e':10,'w':11,'y':12})
    df['stalk-shape']=df['stalk-shape'].map({'e':1,'t':2,})
    df['stalk-root']=df['stalk-root'].map({'b':1,'c':2,'u':3,'e':4,'z':5,'r':6,'?':7})
    df['stalk-color-above-ring']=df['stalk-color-above-ring'].map({'n':1,'b':2,'c':3,'g':4,'o':5,'p':6,'e':7,'w':8,'y':9})
    df['stalk-color-below-ring']=df['stalk-color-below-ring'].map({'n':1,'b':2,'c':3,'g':4,'o':5,'p':6,'e':7,'w':8,'y':9})
    df['veil-type']=df['veil-type'].map({'p':1,'u':2})
    df['veil-color']=df['veil-color'].map({'n':1,'o':2,'w':3,'y':4})
    df['ring-number']=df['ring-number'].map({'n':1,'o':2,'t':3})
    df['ring-type']=df['ring-type'].map({'c':1,'e':2,'f':3,'l':4,'n':5,'p':6,'s':7,'z':8})
    df['spore-print-color']=df['spore-print-color'].map({'k':1,'n':2,'b':3,'h':4,'r':5,'o':6,'u':7,'w':8,'y':9})
    df['population']=df['population'].map({'a':1,'c':2,'n':3,'s':4,'v':5,'y':6})
    df['habitat']=df['habitat'].map({'g':1,'l':2,'m':3,'p':4,'u':5,'w':6,'d':7})
    df['stalk-surface-above-ring']=df['stalk-surface-above-ring'].map({'f':1,'y':2,'k':3,'s':4})
    df['stalk-surface-below-ring']=df['stalk-surface-below-ring'].map({'f':1,'y':2,'k':3,'s':4})
    logging.info("Mapping Done...")
except Exception as e:
    logging.error(f"Catch an error {e}")


In [25]:
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,...,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0,8124.0
mean,1.517971,3.491876,2.742984,5.323486,1.584441,5.788282,2.948301,1.161497,1.309207,5.729444,...,3.424914,6.446578,6.393402,1.0,2.965534,2.069424,4.291974,4.062038,4.644018,4.221073
std,0.499708,0.901287,1.179629,3.444391,0.492848,1.983678,0.317391,0.368011,0.462195,3.342402,...,0.870347,2.1439,2.194604,0.0,0.242669,0.271064,1.801672,2.825308,1.252082,2.530692
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
25%,1.0,3.0,1.0,1.0,1.0,5.0,3.0,1.0,1.0,3.0,...,3.0,6.0,6.0,1.0,3.0,2.0,2.0,2.0,4.0,1.0
50%,2.0,3.0,3.0,4.0,2.0,7.0,3.0,1.0,1.0,5.0,...,4.0,8.0,8.0,1.0,3.0,2.0,4.0,4.0,5.0,4.0
75%,2.0,4.0,4.0,9.0,2.0,7.0,3.0,1.0,2.0,8.0,...,4.0,8.0,8.0,1.0,3.0,2.0,6.0,8.0,5.0,7.0
max,2.0,6.0,4.0,10.0,2.0,9.0,3.0,2.0,2.0,12.0,...,4.0,9.0,9.0,1.0,4.0,3.0,6.0,9.0,6.0,7.0


In [26]:

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,3,4,1,1,8,3,1,2,1,...,4,8,8,1,3,2,6,1,4,5
1,2,3,4,10,1,1,3,1,1,1,...,4,8,8,1,3,2,6,2,3,1
2,2,1,4,9,1,2,3,1,1,2,...,4,8,8,1,3,2,6,2,3,3
3,1,3,3,9,1,8,3,1,2,2,...,4,8,8,1,3,2,6,1,4,5
4,2,3,4,4,2,7,3,2,1,1,...,4,8,8,1,3,2,2,2,1,1


In [27]:
df.iloc[0,:]

class                       1
cap-shape                   3
cap-surface                 4
cap-color                   1
bruises                     1
odor                        8
gill-attachment             3
gill-spacing                1
gill-size                   2
gill-color                  1
stalk-shape                 1
stalk-root                  4
stalk-surface-above-ring    4
stalk-surface-below-ring    4
stalk-color-above-ring      8
stalk-color-below-ring      8
veil-type                   1
veil-color                  3
ring-number                 2
ring-type                   6
spore-print-color           1
population                  4
habitat                     5
Name: 0, dtype: int64

In [28]:
logging.info(f"saving column names inside variable named as feature_name")
feature_name = df.columns  # saving column names inside variable named as feature_name

In [29]:
df.min()

class                       1
cap-shape                   1
cap-surface                 1
cap-color                   1
bruises                     1
odor                        1
gill-attachment             1
gill-spacing                1
gill-size                   1
gill-color                  1
stalk-shape                 1
stalk-root                  1
stalk-surface-above-ring    1
stalk-surface-below-ring    1
stalk-color-above-ring      1
stalk-color-below-ring      1
veil-type                   1
veil-color                  1
ring-number                 1
ring-type                   2
spore-print-color           1
population                  1
habitat                     1
dtype: int64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   class                     8124 non-null   int64
 1   cap-shape                 8124 non-null   int64
 2   cap-surface               8124 non-null   int64
 3   cap-color                 8124 non-null   int64
 4   bruises                   8124 non-null   int64
 5   odor                      8124 non-null   int64
 6   gill-attachment           8124 non-null   int64
 7   gill-spacing              8124 non-null   int64
 8   gill-size                 8124 non-null   int64
 9   gill-color                8124 non-null   int64
 10  stalk-shape               8124 non-null   int64
 11  stalk-root                8124 non-null   int64
 12  stalk-surface-above-ring  8124 non-null   int64
 13  stalk-surface-below-ring  8124 non-null   int64
 14  stalk-color-above-ring    8124 non-null 

In [31]:
try:
    logging.info(f"divided features into input x and output y")
    y=df['class']
    x=df.drop(['class'],axis='columns')
except Exception as e:
    logging.error(f"error while spliting into x and y {e}")
logging.info(f"so our input size will become {x.shape} and output size will become {y.shape}")

In [32]:
import numpy as np
try:
    logging.info(f"Converting input and output into numpy array and reshaping it")
    x=np.array(x)
    y=np.array(y).reshape(-1,1)
except Exception as e:
    logging.error(f"error while converting into numpy array and reshaping it {e}")

In [33]:
from sklearn.preprocessing import MinMaxScaler
try:
    logging.info(f"Scaling the input data and output data using MinMaxScaler")
    scaler = MinMaxScaler((0,1))
    x = scaler.fit_transform(x)
    y = scaler.fit_transform(y)
except Exception as e:
    logging.error(f"Error while doing MinMaxScaler the input and output data {e}")

In [34]:
x[0]

array([0.4       , 1.        , 0.        , 0.        , 0.875     ,
       1.        , 0.        , 1.        , 0.        , 0.        ,
       0.5       , 1.        , 1.        , 0.875     , 0.875     ,
       0.        , 0.66666667, 0.5       , 1.        , 0.        ,
       0.6       , 0.66666667])

In [35]:
logging.info(f"Now x and y shape will be x = {x.shape} y = {y.shape}")

In [36]:
from sklearn.model_selection import train_test_split
try:
    logging.info(f"splitting the data into training and testing sets")
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)
    logging.info(f"training set size is {x_train.shape} and testing set size is {x_test.shape}")
except Exception as e:
    logging.error(f"error while splitting the data into training and testing sets {e}")

In [37]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
logging.info(f"Initialize different Classifications model")
model={
    'svm':{
        'model':SVC(gamma='auto'),
        'params':{
            'C':[1,10,20],
            'kernel':['rbf','linear']
        }
    },
    'decision_tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':['gini', 'entropy', 'log_loss'],
            'splitter' :['best', 'random']
        }
    },
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1,5,10]
        }
    },
    'Logistic_Regression':{
        'model':LogisticRegression(solver='liblinear',multi_class='auto'),
        'params':{
            'C':[1,5,10]
        }
    }
}

In [38]:
from sklearn.model_selection import GridSearchCV
try:
    logging.info(f"Testing different models for classification using GridSearchCV ")
    scores=[]
    for mn,mp in model.items():
        clf2=GridSearchCV(mp['model'],mp['params'],cv=10,return_train_score=True,scoring='recall')
        clf2.fit(x_train,y_train)
        scores.append({
            'model':mn,
            'best_score':clf2.best_score_,
            'best_params':clf2.best_params_
        })
    scores
except Exception as e:
    logging.error(f"Error while testing the models {e}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [39]:
logging.info(f"Select the best final model with best parameters and fit the data in model")
final_model=DecisionTreeClassifier(criterion='gini',splitter='best')
final_model.fit(x,y)

In [40]:
logging.info(f"Testing the model with test data ")
y_pred = final_model.predict(x_test)
y_train_pred = final_model.predict(x_train)

In [41]:
df.iloc[0,1:].shape

(22,)

In [42]:
import numpy as np
array = np.array(df.iloc[3,1:])

In [43]:
array.shape

(22,)

In [44]:
final_model.predict([array])

array([1.])

In [45]:
logging.info(f"Analyzing the model with confusion matrix ")
from sklearn.metrics import confusion_matrix,recall_score
print("for train data")
print(confusion_matrix(y_train,y_train_pred))
print("for testing data")
print(confusion_matrix(y_test,y_pred))

for train data
[[2747    0]
 [   0 2939]]
for testing data
[[1169    0]
 [   0 1269]]


In [46]:
logging.info(f"Checking the recall score of model ")
print("for train data")
print(recall_score(y_train,y_train_pred))

print("for test data")
print(recall_score(y_test,y_pred))


for train data
1.0
for test data
1.0


In [47]:
import pickle
try:
    logging.info(f"Export the model using pickle")
    with open("mushroom_classification.pkl",'wb') as f:
        pickle.dump(final_model,f)
        f.close()
except Exception as e:
    logging.error(f"Error while exporting the model {e}")