In [26]:
import pandas as pd
from collections import Counter
import numpy as np

date = pd.read_excel('tema_2_date_interviu.xlsx')
date = date.fillna("empty")
date.columns

Index(['COMPANIA', 'INDUSTRIA', 'ID_CANDIDAT', 'SEX', 'LOCATIA_COMPANIEI',
       'LOCATIA_CANDIDATULUI', 'RASPUNS_LA_TELEFON', 'A_VENIT_LA_INTERVIU',
       'STARE_CIVILA'],
      dtype='object')

In [144]:
class NaiveBayesClassifier():
    
    def __init__(self, date, no_labels):
        self.data = date
        self.no_lables = no_labels
        self.prior = [1/no_labels for _ in range(no_labels)]
        self.yes_counter = 0
        self.no_counter = 0

        self.coloane_de_interes = [column for column in self.data.columns if column != 'A_VENIT_LA_INTERVIU']
        self.prob_map = {column:{} for column in self.coloane_de_interes}
        
        self.prob_map = [[] for _ in range(no_labels)]
        self.prob_map[0] = {column: {} for column in self.coloane_de_interes}
        self.prob_map[1] = {column: {} for column in self.coloane_de_interes}
        
        self.words = {'yes':{column: [] for column in self.coloane_de_interes}, 'no':{column: [] for column in self.coloane_de_interes}}
        
    def fit(self):
        self.adauga_cuvinte_in_lista()
        self.calculeaza_probabilitati(self.words["yes"], self.prob_map[1], self.yes_counter)
        self.calculeaza_probabilitati(self.words["no"], self.prob_map[0], self.yes_counter)
        
    def adauga_cuvinte_in_lista(self):
        for key, rand in self.data.iterrows():
            if rand['A_VENIT_LA_INTERVIU'] == 'Yes':
                for coloana in self.coloane_de_interes:
                    self.words['yes'][coloana].append(rand[coloana])
                self.yes_counter += 1
            else:
                for coloana in self.coloane_de_interes:
                    self.words['no'][coloana].append(rand[coloana])
                self.no_counter += 1
        
    @staticmethod
    def calculeaza_probabilitati(liste, prob_map, total_counter):
        for coloana in liste:
            counter_dict = Counter(liste[coloana])
            for cuvant, numar_aparitii in counter_dict.items():
                prob_map[coloana][cuvant] = numar_aparitii/total_counter
                
    #def itereaza_peste_rand(self, rand, lista_cuvinte):
    #    for nume_coloana in self.coloane_de_interes:
            
    def predict(self, rand_din_data_frame):
        logs = self.inference(rand_din_data_frame)
        return np.argmax(logs)
    
    def inference(self, rand_din_data_frame):
        yes_sum = 0
        no_sum = 0
        for coloana in self.coloane_de_interes:
            valoare_din_coloana = rand_din_data_frame[coloana]
            
            for cuv in valoare_din_coloana:
                if cuv not in self.prob_map[1][coloana]:
                    yes_sum += np.log(0.00001)
                else:
                    yes_sum += np.log(self.prob_map[1][coloana][valoare_din_coloana])
                if cuv not in self.prob_map[0][coloana]:
                    no_sum += np.log(0.00001)
                else:
                    no_sum += np.log(self.prob_map[0][coloana][valoare_din_coloana])
        
        yes_sum += np.log(self.prior[1])
        no_sum += np.log(self.prior[0])
        
        return [no_sum, yes_sum]

        

In [145]:
clasificatoooorul = NaiveBayesClassifier(date, 2)
clasificatoooorul.fit()

In [147]:
def evaluate_model(model, data):
    
    from sklearn.metrics import  classification_report
    from tqdm import tqdm
    
    predictions = []
    true_values = []
    for key, row in tqdm(data.iterrows()):
        predictions.append(model.predict(row)) # aici adaug in lista predictia modelului
        true_values.append(1 if row['A_VENIT_LA_INTERVIU'] == 'Yes' else 0) # aici adaug in lista 1 daca este YES sau 0 daca este NO
        
    print(classification_report(true_values, predictions))
    
evaluate_model(clasificatoooorul, date)

965it [00:00, 2068.53it/s]

              precision    recall  f1-score   support

           0       0.30      1.00      0.47       293
           1       0.00      0.00      0.00       672

    accuracy                           0.30       965
   macro avg       0.15      0.50      0.23       965
weighted avg       0.09      0.30      0.14       965




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
