In [1]:
import zipfile
with zipfile.ZipFile('/content/mushroom.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
           'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
           'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
           'stalk-surface-below-ring', 'stalk-color-above-ring',
           'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
           'ring-type', 'spore-print-color', 'population', 'habitat']

df = pd.read_csv('agaricus-lepiota.data', names=columns)

In [4]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [6]:
# Any missing values?
df.isnull().sum()

Unnamed: 0,0
class,0
cap-shape,0
cap-surface,0
cap-color,0
bruises,0
odor,0
gill-attachment,0
gill-spacing,0
gill-size,0
gill-color,0


In [7]:
for col in df.select_dtypes(include='object').columns:
    print(f"\nValue counts for '{col}':")
    print(df[col].value_counts())


Value counts for 'class':
class
e    4208
p    3916
Name: count, dtype: int64

Value counts for 'cap-shape':
cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64

Value counts for 'cap-surface':
cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64

Value counts for 'cap-color':
cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64

Value counts for 'bruises':
bruises
f    4748
t    3376
Name: count, dtype: int64

Value counts for 'odor':
odor
n    3528
f    2160
s     576
y     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64

Value counts for 'gill-attachment':
gill-attachment
f    7914
a     210
Name: count, dtype: int64

Value counts for 'gill-spacing':
gill-spacing
c    6812
w    1312
Name: count, dtype: int64

Value counts for 'gill-size':
gill-size
b    5612
n    2512
Name: count, dtype: int64

Value 

In [8]:
df.shape

(8124, 23)

In [9]:
y = LabelEncoder().fit_transform(df['class'])
X = df.drop(columns=['class', 'veil-type'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
7873,k,s,e,f,s,f,c,n,b,t,...,s,k,p,w,w,o,e,w,v,d
6515,x,s,n,f,f,f,c,n,b,t,...,k,s,w,w,w,o,e,w,v,p
6141,f,y,e,f,y,f,c,n,b,t,...,s,s,p,w,w,o,e,w,v,l
2764,f,f,n,t,n,f,c,b,u,t,...,s,s,g,p,w,o,p,n,v,d
438,b,y,y,t,l,f,c,b,k,e,...,s,s,w,w,w,o,p,n,n,m


In [12]:
y_train[:10]

array([1, 1, 1, 0, 0, 1, 0, 0, 1, 1])

In [13]:
class mymlr:

  def __init__(self):
     self.m = None
     self.b = None

  def fit(self, X_train, y_train):
    X_train = np.insert(X_train, 0, 1, axis=1)

    betas = np.linalg.inv(np.dot(X_train.T, X_train)).dot(X_train.T).dot(y_train)
    self.m, self.b = betas[1:], betas[0]
    print(betas)

  def predict(self, X_test):
    y_pred = np.dot(X_test, self.m) + self.b
    return y_pred

In [14]:
class MySLRWrapper(BaseEstimator, RegressorMixin):
    def __init__(self):
        self.model = None

    def fit(self, X, y):
        X = X.reshape(-1, 1) if X.ndim == 1 else X
        self.model = mymlr()
        self.model.fit(X, y)
        return self

    def predict(self, X):
        X = X.reshape(-1, 1) if X.ndim == 1 else X
        return self.model.predict(X)

In [15]:
preprocessor = ColumnTransformer([
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'))
    ]), X.columns)
])

In [16]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('select', SelectKBest(score_func=f_regression, k=5)),
    ('model', MySLRWrapper())
])

In [17]:
pipe.fit(X, y)

[ 0.7384089   0.18268891 -0.57348601  0.10099616  0.04291677 -0.19975463]


In [18]:
# Get feature names from preprocessor
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()

# Get mask of selected features
mask = pipe.named_steps['select'].get_support()

# Apply mask to feature names
selected_features = feature_names[mask]
print("Selected features:", selected_features)

Selected features: ['cat__odor_f' 'cat__odor_n' 'cat__stalk-surface-above-ring_k'
 'cat__stalk-surface-below-ring_k' 'cat__ring-type_p']


In [19]:
y_pred = pipe.predict(X_test)



In [20]:
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
y_pred = pipe.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

MSE: 0.07018075852081793
R²: 0.7188808307484311




In [22]:
from sklearn.linear_model import LinearRegression
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('select', SelectKBest(score_func=f_regression, k=5)),
    ('model', LinearRegression())
])
pipe.fit(X_train, y_train)
# Get feature names from preprocessor
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()

# Get mask of selected features
mask = pipe.named_steps['select'].get_support()

# Apply mask to feature names
selected_features = feature_names[mask]
print("Selected features:", selected_features)
y_pred = pipe.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

Selected features: ['cat__odor_f' 'cat__odor_n' 'cat__gill-size_n'
 'cat__stalk-surface-above-ring_k' 'cat__stalk-surface-below-ring_k']
MSE: 0.051296714826241964
R²: 0.7945235964782257


In [23]:
y_pred_class = (y_pred >= 0.5).astype(int)
from sklearn.metrics import accuracy_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred_class))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_class))


Accuracy: 0.9704615384615385
Confusion matrix:
 [[816  27]
 [ 21 761]]
