In [240]:
import itertools
import pandas as pd
from collections import Counter
import numpy as np

date = pd.read_excel('tema_2_date_interviu.xlsx')
date = date.fillna("empty")
date.columns

Index(['COMPANIA', 'INDUSTRIA', 'ID_CANDIDAT', 'SEX', 'LOCATIA_COMPANIEI',
       'LOCATIA_CANDIDATULUI', 'RASPUNS_LA_TELEFON', 'A_VENIT_LA_INTERVIU',
       'STARE_CIVILA'],
      dtype='object')

In [280]:
class NaiveBayesClassifier():
    
    def __init__(self, date, no_labels, coloane_importante):
        self.data = date
        self.no_lables = no_labels
        self.prior = [1/no_labels for _ in range(no_labels)]
        self.yes_counter = 0
        self.no_counter = 0

        #self.coloane_de_interes = [column for column in self.data.columns if column not in ['A_VENIT_LA_INTERVIU', 'ID_CANDIDAT']]
        #print(coloane_importante)
        self.coloane_de_interes = [x for x in coloane_importante]
        self.prob_map = {column:{} for column in self.coloane_de_interes}
        
        self.prob_map = [[] for _ in range(no_labels)]
        self.prob_map[0] = {column: {} for column in self.coloane_de_interes}
        self.prob_map[1] = {column: {} for column in self.coloane_de_interes}
        
        self.words = {'yes':{column: [] for column in self.coloane_de_interes}, 'no':{column: [] for column in self.coloane_de_interes}}
        
    def fit(self):
        self.adauga_cuvinte_in_lista()
        self.calculeaza_probabilitati(self.words["yes"], self.prob_map[1], self.yes_counter)
        self.calculeaza_probabilitati(self.words["no"], self.prob_map[0], self.no_counter)
        
    def adauga_cuvinte_in_lista(self):
        for key, rand in self.data.iterrows():
            if rand['A_VENIT_LA_INTERVIU'] == 'Yes':
                for coloana in self.coloane_de_interes:
                    self.words['yes'][coloana].append(rand[coloana])
                self.yes_counter += 1
            else:
                for coloana in self.coloane_de_interes:
                    self.words['no'][coloana].append(rand[coloana])
                self.no_counter += 1
        
    @staticmethod
    def calculeaza_probabilitati(liste, prob_map, total_counter):
        for coloana in liste:
            counter_dict = Counter(liste[coloana])
            for cuvant, numar_aparitii in counter_dict.items():
                prob_map[coloana][cuvant] = numar_aparitii/total_counter
                
    #def itereaza_peste_rand(self, rand, lista_cuvinte):
    #    for nume_coloana in self.coloane_de_interes:
            
    def predict(self, rand_din_data_frame):
        logs = self.inference(rand_din_data_frame)
        return np.argmax(logs)
    
    def inference(self, rand_din_data_frame):
        yes_sum = 0
        no_sum = 0
        for coloana in self.coloane_de_interes:
            valoare_din_coloana = rand_din_data_frame[coloana]
            
            if valoare_din_coloana not in self.prob_map[1][coloana]:
                yes_sum += np.log(0.00001)
            else:
                yes_sum += np.log(self.prob_map[1][coloana][valoare_din_coloana])
            
            if valoare_din_coloana not in self.prob_map[0][coloana]:
                no_sum += np.log(0.00001)
            else:
                no_sum += np.log(self.prob_map[0][coloana][valoare_din_coloana])
            
            
        yes_sum += np.log(self.prior[1])
        no_sum += np.log(self.prior[0])
        
        return [no_sum, yes_sum]

        

In [328]:
from sklearn.model_selection import train_test_split
coloane_importante = [x for x in list(date.columns) if x != 'A_VENIT_LA_INTERVIU']

date_antrenament, date_test = train_test_split(date, test_size=0.2)

#testare cu toate coloanele
clasificatoooorul = NaiveBayesClassifier(date_antrenament, 2, coloane_importante)
clasificatoooorul.fit()

def evaluate_model(model, data, print_model=True):
    
    from sklearn.metrics import  classification_report
    from tqdm import tqdm
    
    predictions = []
    true_values = []
    for key, row in tqdm(data.iterrows()):
        predictions.append(model.predict(row)) # aici adaug in lista predictia modelului
        true_values.append(1 if row['A_VENIT_LA_INTERVIU'] == 'Yes' else 0) # aici adaug in lista 1 daca este YES sau 0 daca este NO
        
    if print_model:
        print(classification_report(true_values, predictions))
    return classification_report(true_values, predictions, output_dict=True, zero_division=0.0)

evaluate_model(clasificatoooorul, date_test)



193it [00:00, 13785.77it/s]

              precision    recall  f1-score   support

           0       0.32      0.61      0.42        61
           1       0.70      0.42      0.52       132

    accuracy                           0.48       193
   macro avg       0.51      0.51      0.47       193
weighted avg       0.58      0.48      0.49       193






{'0': {'precision': 0.32456140350877194,
  'recall': 0.6065573770491803,
  'f1-score': 0.4228571428571429,
  'support': 61.0},
 '1': {'precision': 0.6962025316455697,
  'recall': 0.4166666666666667,
  'f1-score': 0.5213270142180095,
  'support': 132.0},
 'accuracy': 0.47668393782383417,
 'macro avg': {'precision': 0.5103819675771708,
  'recall': 0.5116120218579235,
  'f1-score': 0.4720920785375762,
  'support': 193.0},
 'weighted avg': {'precision': 0.5787408279339392,
  'recall': 0.47668393782383417,
  'f1-score': 0.49020441238892737,
  'support': 193.0}}

In [329]:
"""
    INCERC SA GASESC SETUL DE COLOANE CARE DA ACCURACY CEL MAI MARE (scotand coloana cu a venit la interviu)
"""
from tqdm import tqdm
elemente_combinari = [i for i in range(len(coloane_importante))]

cea_mai_tare_lista = []
cel_mai_tare_scor = 0

for sz in range(1, len(elemente_combinari) + 1):
    combinari = list(itertools.combinations(elemente_combinari, sz))
    for combinare in combinari:
        combinari_selectate = []
        for index in combinare:
            combinari_selectate.append(coloane_importante[index])
        clasificator = NaiveBayesClassifier(date_antrenament, 2, combinari_selectate)
        clasificator.fit()
        performance = evaluate_model(clasificator, date_test, print_model=False)
        acuratete = performance['accuracy']
        if acuratete > cel_mai_tare_scor:
            cea_mai_tare_lista = combinari_selectate
            cel_mai_tare_scor = acuratete

print(cea_mai_tare_lista)
print(cel_mai_tare_scor)



193it [00:00, 21446.57it/s]
193it [00:00, 24124.59it/s]
193it [00:00, 24131.06it/s]
193it [00:00, 21449.98it/s]
193it [00:00, 21442.02it/s]
193it [00:00, 21442.59it/s]
193it [00:00, 21443.73it/s]
193it [00:00, 24135.38it/s]
193it [00:00, 21449.98it/s]
193it [00:00, 19294.96it/s]
193it [00:00, 19295.88it/s]
193it [00:00, 19300.02it/s]
193it [00:00, 21449.41it/s]
193it [00:00, 21443.73it/s]
193it [00:00, 19299.56it/s]
193it [00:00, 21445.43it/s]
193it [00:00, 21444.29it/s]
193it [00:00, 19300.48it/s]
193it [00:00, 19299.56it/s]
193it [00:00, 21449.98it/s]
193it [00:00, 21449.41it/s]
193it [00:00, 19304.62it/s]
193it [00:00, 21444.86it/s]
193it [00:00, 21442.59it/s]
193it [00:00, 19300.48it/s]
193it [00:00, 19303.70it/s]
193it [00:00, 21446.57it/s]
193it [00:00, 19303.24it/s]
193it [00:00, 19297.72it/s]
193it [00:00, 16086.42it/s]
193it [00:00, 19299.10it/s]
193it [00:00, 21451.11it/s]
193it [00:00, 19293.58it/s]
193it [00:00, 19305.54it/s]
193it [00:00, 21451.68it/s]
193it [00:00, 19296.

['RASPUNS_LA_TELEFON']
0.6839378238341969



