<a href="https://colab.research.google.com/github/Tamoziit/Data-Mining/blob/main/Na%C3%AFve_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install numpy pandas scikit-learn



In [29]:
import pandas as pd

In [42]:
# Location of the CSV, relative to the notebook's working directory.
file_path = './dataset.csv'
# Parse the CSV into a DataFrame; the first line of the file is used as the header.
data = pd.read_csv(filepath_or_buffer=file_path)

In [43]:
# Preview the first five rows to confirm the file loaded with the expected columns.
data.head()

Unnamed: 0,Weekend,Weather,Parents,Financial condition,Decision
0,W1,Sunny,Yes,Rich,Cinema
1,W2,Sunny,No,Rich,Play Tennis
2,W3,Windy,Yes,Rich,Cinema
3,W4,Rainy,Yes,Poor,Cinema
4,W5,Rainy,No,Rich,Stay in


In [44]:
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [45]:
# Features are every column except the first (the weekend ID) and the last;
# the last column is the target label.
X, Y = data.iloc[:, 1:-1], data.iloc[:, -1]

X, Y

(  Weather Parents Financial condition
 0   Sunny     Yes                Rich
 1   Sunny      No                Rich
 2   Windy     Yes                Rich
 3   Rainy     Yes                Poor
 4   Rainy      No                Rich
 5   Rainy     Yes                Poor
 6   Windy      No                Poor
 7   Windy      No                Rich
 8   Windy     Yes                Rich
 9   Sunny      No                Rich,
 0         Cinema
 1    Play Tennis
 2         Cinema
 3         Cinema
 4        Stay in
 5         Cinema
 6         Cinema
 7       Shopping
 8         Cinema
 9    Play Tennis
 Name: Decision, dtype: object)

In [46]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Show the four pieces one after another, separated by " \n ".
print(" \n ".join(str(part) for part in (X_train, X_test, Y_train, Y_test)))

  Weather Parents Financial condition
5   Rainy     Yes                Poor
0   Sunny     Yes                Rich
7   Windy      No                Rich
2   Windy     Yes                Rich
9   Sunny      No                Rich
4   Rainy      No                Rich
3   Rainy     Yes                Poor
6   Windy      No                Poor 
   Weather Parents Financial condition
8   Windy     Yes                Rich
1   Sunny      No                Rich 
 5         Cinema
0         Cinema
7       Shopping
2         Cinema
9    Play Tennis
4        Stay in
3         Cinema
6         Cinema
Name: Decision, dtype: object 
 8         Cinema
1    Play Tennis
Name: Decision, dtype: object


In [47]:
class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical features with additive smoothing.

    After ``fit`` the instance exposes:

    * ``classes``    - sorted array of the distinct class labels seen in ``Y``.
    * ``priors``     - ``{class: P(class)}``.
    * ``cond_probs`` - ``{class: {feature: {value: P(value | class)}}}``.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing strength. ``1.0`` is Laplace smoothing. With
        ``alpha=0`` a value that never co-occurs with a class gets probability
        zero, which makes its log-score ``-inf``.
    unseen_prob : float, default 1e-6
        Probability used at prediction time for a feature value that did not
        appear anywhere in the training data.
    """

    def __init__(self, alpha=1.0, unseen_prob=1e-6):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        if not 0 < unseen_prob <= 1:
            raise ValueError("unseen_prob must be in the interval (0, 1]")
        self.alpha = alpha
        self.unseen_prob = unseen_prob

    def fit(self, X, Y):
        """Estimate class priors and smoothed conditional probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one categorical column per attribute.
        Y : pandas.Series
            Training labels. ``Y == c`` is used as a boolean row mask on ``X``,
            so ``Y`` should be aligned with the rows of ``X``.
        """
        self.classes = np.unique(Y)
        self.priors = {}
        # Plain nested dicts (not defaultdicts) so that lookups in predict()
        # can never insert new entries into the fitted model.
        self.cond_probs = {}

        total_count = len(Y)
        # Distinct values per feature over ALL training rows, so every class
        # gets a probability for every value (this is the loop-invariant part).
        distinct_values = {col: X[col].unique() for col in X.columns}

        for c in self.classes:
            X_c = X[Y == c]
            class_count = len(X_c)
            self.priors[c] = class_count / total_count  # Prior probabilities

            # Conditional probs for each attribute
            self.cond_probs[c] = {}
            for col, values in distinct_values.items():
                # Additive smoothing: (count + alpha) / (n_class + alpha * n_values)
                denominator = class_count + self.alpha * len(values)
                self.cond_probs[c][col] = {
                    val: (np.sum(X_c[col] == val) + self.alpha) / denominator
                    for val in values
                }

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Scores are accumulated in log space to avoid underflow. Columns of
        ``X`` that were not present during ``fit`` are ignored; values that
        were not seen during ``fit`` are scored with ``unseen_prob``.

        Parameters
        ----------
        X : pandas.DataFrame
            Rows to classify.

        Returns
        -------
        list
            One predicted class label per row, in row order.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if not hasattr(self, "cond_probs"):
            raise AttributeError(
                "NaiveBayesClassifier is not fitted yet; call fit() before predict()"
            )

        predictions = []
        for _, row in X.iterrows():
            class_scores = {}

            for c in self.classes:
                score = np.log(self.priors[c])
                feature_tables = self.cond_probs[c]
                for col in X.columns:
                    value_table = feature_tables.get(col)
                    if value_table is None:
                        # Column unknown to the model: it would add the same
                        # constant to every class, so it cannot change the argmax.
                        continue
                    score += np.log(value_table.get(row[col], self.unseen_prob))

                class_scores[c] = score
            # maximizing P(Cj) ∏ P(Ai|Cj)
            predictions.append(max(class_scores, key=class_scores.get))

        return predictions

In [48]:
# Create an untrained classifier instance; it is fitted in the next cell.
nb = NaiveBayesClassifier()

In [49]:
# Estimate class priors and per-feature conditional probabilities from the training split.
nb.fit(X_train, Y_train)

In [50]:
# Report what the fitted model learned about the label distribution.
for label, learned in (("Classes:", nb.classes), ("Prior Probabilities:", nb.priors)):
    print(label, learned)

Classes: ['Cinema' 'Play Tennis' 'Shopping' 'Stay in']
Prior Probabilities: {'Cinema': 0.625, 'Play Tennis': 0.125, 'Shopping': 0.125, 'Stay in': 0.125}


In [51]:
# Flatten the nested {class: {feature: {value: probability}}} mapping into
# one record per (class, feature, value), then load the records into a DataFrame.
cond_prob_table = [
    {
        "Class": class_label,
        "Feature": feature_name,
        "Value": feature_value,
        "P(Value|Class)": round(float(probability), 4),
    }
    for class_label, per_feature in nb.cond_probs.items()
    for feature_name, per_value in per_feature.items()
    for feature_value, probability in per_value.items()
]

cond_prob_df = pd.DataFrame(cond_prob_table)

print(cond_prob_df)

          Class              Feature  Value  P(Value|Class)
0        Cinema              Weather  Rainy          0.3750
1        Cinema              Weather  Sunny          0.2500
2        Cinema              Weather  Windy          0.3750
3        Cinema              Parents    Yes          0.7143
4        Cinema              Parents     No          0.2857
5        Cinema  Financial condition   Poor          0.5714
6        Cinema  Financial condition   Rich          0.4286
7   Play Tennis              Weather  Rainy          0.2500
8   Play Tennis              Weather  Sunny          0.5000
9   Play Tennis              Weather  Windy          0.2500
10  Play Tennis              Parents    Yes          0.3333
11  Play Tennis              Parents     No          0.6667
12  Play Tennis  Financial condition   Poor          0.3333
13  Play Tennis  Financial condition   Rich          0.6667
14     Shopping              Weather  Rainy          0.2500
15     Shopping              Weather  Su

In [52]:
Y_pred = nb.predict(X_test)

# Build a side-by-side view: test features plus actual and predicted labels.
# assign() returns a new frame, so X_test itself is left unchanged.
results = X_test.assign(Actual=Y_test.values, Predicted=Y_pred)

print("\nNaive Bayes Predictions on Test Data:\n")
print(results.to_string(index=False))


Naive Bayes Predictions on Test Data:

Weather Parents Financial condition      Actual   Predicted
  Windy     Yes                Rich      Cinema      Cinema
  Sunny      No                Rich Play Tennis Play Tennis


In [53]:
# Fraction of test rows whose predicted label equals the actual label.
accuracy = (Y_test == Y_pred).mean()
accuracy

np.float64(1.0)

In [54]:
!pip install pypandoc

Collecting pypandoc
  Downloading pypandoc-1.15-py3-none-any.whl.metadata (16 kB)
Downloading pypandoc-1.15-py3-none-any.whl (21 kB)
Installing collected packages: pypandoc
Successfully installed pypandoc-1.15


In [55]:
# Install the pandoc system package non-interactively (-y). This needs an
# apt-based environment; the log below shows it running on Ubuntu.
!apt-get install -y pandoc

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc-data
Suggested packages:
  texlive-latex-recommended texlive-xetex texlive-luatex pandoc-citeproc
  texlive-latex-extra context wkhtmltopdf librsvg2-bin groff ghc nodejs php
  python ruby libjs-mathjax libjs-katex citation-style-language-styles
The following NEW packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc
  pandoc-data
0 upgraded, 4 newly installed, 0 to remove and 35 not upgraded.
Need to get 20.6 MB of archives.
After this operation, 156 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [115 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm-extensions0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [25.1 kB

In [57]:
import pypandoc
# Convert this notebook file to a Word document written next to it.
# The .ipynb must already exist under this exact name in the working directory.
pypandoc.convert_file("Naïve_Bayes_Classifier.ipynb", "docx", outputfile="Naïve_Bayes_Classifier.docx")

''