In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI dataset with repository id 2 ("Adult").
adult = fetch_ucirepo(id=2)

# Keep the feature table and the target table under the names that the
# following cells read (`X` and `y`).
X, y = adult.data.features, adult.data.targets

# Print the dataset-level metadata first, then the per-variable description.
for summary in (adult.metadata, adult.variables):
    print(summary)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the Adult dataset again and take its feature table.
adult = fetch_ucirepo(id=2)
X = adult.data.features

ages = X["age"]

# Min-max scaling: the smallest age maps to 0 and the largest age maps to 1.
min_age = min(ages)
max_age = max(ages)
norm_ages = list(map(lambda age: (age - min_age) / (max_age - min_age), ages))
print("Normalized Ages (first 5):", norm_ages[:5])

Normalized Ages (first 5): [0.3013698630136986, 0.4520547945205479, 0.2876712328767123, 0.4931506849315068, 0.1506849315068493]


In [None]:
y = adult.data.targets
sex_vals = X["sex"]

# 2x2 contingency table of observed counts: table[sex][income label].
table = {sex: {"<=50K": 0, ">50K": 0} for sex in ("Male", "Female")}

for s, inc in zip(sex_vals, y["income"]):
    # Income labels may carry a trailing '.'; strip it so both spellings
    # land in the same cell of the table.
    label = inc.rstrip('.')
    table[s][label] = table[s][label] + 1

total = sum(sum(v.values()) for v in table.values())
chi_sq = 0

# Pearson chi-square: sum over all cells of (observed - expected)^2 / expected,
# where expected = row total * column total / grand total.
for s, counts in table.items():
    row_sum = sum(counts.values())
    for inc, observed in counts.items():
        col_sum = sum(row[inc] for row in table.values())
        expected = row_sum * col_sum / total
        chi_sq += ((observed - expected) ** 2) / expected

print("Chi-Square (sex vs income):", chi_sq)

Chi-Square (sex vs income): 2249.916167289077


In [None]:
# `y` is a one-column DataFrame, and iterating over a DataFrame yields its
# column labels, not its rows. Zipping over `y` directly therefore compared
# the single string "income" against the class labels and every count came
# out as 0. Take the "income" column itself instead. A trailing '.' is
# stripped from each label, matching what the chi-square cell does, so that
# both spellings of a class are counted.
actual = y["income"].str.rstrip(".")

# Baseline classifier: always predict the "<=50K" class.
predicted = ["<=50K"] * len(actual)

# Confusion-matrix cells, treating ">50K" as the positive class.
tp = sum(1 for a, p in zip(actual, predicted) if a == ">50K" and p == ">50K")
tn = sum(1 for a, p in zip(actual, predicted) if a == "<=50K" and p == "<=50K")
fp = sum(1 for a, p in zip(actual, predicted) if a == "<=50K" and p == ">50K")
fn = sum(1 for a, p in zip(actual, predicted) if a == ">50K" and p == "<=50K")

print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")


TP: 0, TN: 0, FP: 0, FN: 0


In [None]:
from ucimlrepo import fetch_ucirepo
from math import sqrt

# Fetch the Adult dataset again and take its feature table.
adult = fetch_ucirepo(id=2)
X = adult.data.features

# Split the age column into the two groups being compared.
is_male = X["sex"] == "Male"
is_female = X["sex"] == "Female"
male_ages = X["age"][is_male]
female_ages = X["age"][is_female]

n1 = len(male_ages)
n2 = len(female_ages)

mean_m = sum(male_ages) / n1
mean_f = sum(female_ages) / n2

# Sample standard deviations (n - 1 in the denominator).
sq_dev_m = sum((age - mean_m) ** 2 for age in male_ages)
sq_dev_f = sum((age - mean_f) ** 2 for age in female_ages)
std_m = sqrt(sq_dev_m / (n1 - 1))
std_f = sqrt(sq_dev_f / (n2 - 1))

# Standard error of the difference between the two group means.
se = sqrt(std_m ** 2 / n1 + std_f ** 2 / n2)

# Two-sample z statistic for (male mean age - female mean age).
z = (mean_m - mean_f) / se

print("Z-value:", z)

Z-value: 19.207107322053343


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# Keep only the int64/float64 columns of X and replace any missing entry
# with the mean of its column.
numeric_features = X.select_dtypes(include=['int64', 'float64'])
X_numerical = numeric_features.fillna(numeric_features.mean())

# Scale the columns with StandardScaler before running PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numerical)

# Project the scaled data onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

component_names = ['principal component 1', 'principal component 2']
X_pca_df = pd.DataFrame(X_pca, columns=component_names)

n_before = X_numerical.shape[1]
n_after = X_pca_df.shape[1]
print("Original number of features:", n_before)
print("Number of features after PCA:", n_after)
display(X_pca_df.head())

Original number of features: 6
Number of features after PCA: 2


Unnamed: 0,principal component 1,principal component 2
0,0.826878,-0.268252
1,-0.124083,-1.076496
2,-0.43224,0.094957
3,-0.477463,-0.458727
4,-0.084854,1.486242


In [None]:
import kagglehub

# Download the "uciml/mushroom-classification" dataset through kagglehub and
# keep the value it returns, which is printed below as the location of the
# dataset files.
path = kagglehub.dataset_download(
    "uciml/mushroom-classification",
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/mushroom-classification


In [None]:
import pandas as pd

from pathlib import Path

# Build the CSV location from `path`, the dataset directory returned by
# kagglehub.dataset_download(...) in the previous cell, instead of a
# hard-coded "/kaggle/input/..." path that only exists inside a Kaggle
# kernel. On Kaggle this resolves to the same file as before.
df = pd.read_csv(Path(path) / "mushrooms.csv")
df.describe()


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [None]:
# FIND-S: start from the most specific hypothesis ('0' in every position) and
# generalise it just enough to cover every positive (class 'e') example.
features = df.columns[1:]
hypothesis = ['0'] * len(features)

# Only positive examples influence FIND-S, so select them up front.
edible_rows = df[df['class'] == 'e'][features]

for values in edible_rows.values:
    for i, val in enumerate(values):
        current = hypothesis[i]
        if current == '0':
            # First value seen for this attribute: adopt it as the constraint.
            hypothesis[i] = val
        elif current != val:
            # A conflicting value was seen: the attribute becomes "any value".
            hypothesis[i] = '?'

print("Final Hypothesis (FIND-S):", hypothesis)

Final Hypothesis (FIND-S): ['?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', 'p', '?', '?', '?', '?', '?', '?']


In [None]:
# Reduced view of the mushroom data: the class label followed by the three
# cap attributes.
cap_columns = ['cap-shape', 'cap-surface', 'cap-color']
df_small = df[['class'] + cap_columns]
df_small.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color
0,p,x,s,n
1,e,x,s,y
2,e,b,s,w
3,p,x,y,w
4,e,x,s,g


In [None]:
import numpy as np

def _covers(general, specific):
    """Return True if `general` matches everything that `specific` matches.

    `specific` may be a training instance or another hypothesis. A position
    is satisfied when `general` holds '?' there or holds the same value as
    `specific`.
    """
    return all(g == '?' or g == s for g, s in zip(general, specific))


# Candidate-Elimination over the rows of `df_small`. Column 0 of every row is
# the class label ('e' is treated as positive, 'p' as negative); the remaining
# columns are the attribute values.
data = df_small.values
num_attributes = len(data[0]) - 1

# Values observed for each attribute. They are needed to specialise G while S
# is still the empty hypothesis (no positive example seen yet): in that case S
# gives no value to specialise towards, so every observed value other than the
# negative example's own value is a valid minimal specialisation.
domains = [sorted({row[j + 1] for row in data}) for j in range(num_attributes)]

S = ['ϕ'] * num_attributes
G = [['?'] * num_attributes]

print("Initial Specific Hypothesis (S):", S)
print("Initial General Hypotheses (G):", G)

for i, instance in enumerate(data):
    x = instance[1:]
    label = instance[0]

    if label == 'e':
        # Positive example: minimally generalise S so that it covers x.
        for j in range(num_attributes):
            if S[j] == 'ϕ':
                S[j] = x[j]
            elif S[j] != x[j]:
                S[j] = '?'

        # Keep only the general hypotheses that are still at least as general
        # as S (and therefore still cover this positive example).
        G = [g for g in G if _covers(g, S)]

    elif label == 'p':
        new_G = []

        for g in G:
            if not _covers(g, x):
                # g already rejects this negative example: keep it unchanged.
                new_G.append(g)
                continue

            # g wrongly accepts x: replace it by its minimal specialisations
            # that reject x while remaining at least as general as S.
            for j in range(num_attributes):
                if g[j] != '?' or S[j] == '?':
                    # Either position j is already constrained, or S accepts
                    # any value there, so constraining j would exclude a
                    # positive example.
                    continue
                # Never copy the 'ϕ' placeholder into G: before the first
                # positive example, fall back to the observed attribute values.
                allowed = domains[j] if S[j] == 'ϕ' else [S[j]]
                for value in allowed:
                    if value != x[j]:
                        specialised = g.copy()
                        specialised[j] = value
                        new_G.append(specialised)

        # Remove duplicates, then keep only the maximally general members.
        unique_G = []
        for h in new_G:
            if h not in unique_G:
                unique_G.append(h)
        G = [h for h in unique_G
             if not any(o != h and _covers(o, h) for o in unique_G)]

    print(f"\nAfter instance {i+1} ({label}):")
    print("S =", S)
    print("G =", G)
