In [1]:
import pandas as pd
import numpy as np
import csv
from decimal import Decimal

In [2]:
# Gets the set of all words in the dataframe
def get_vocab(df):

    vocab = {word for para in df['abstract'] for word in para.split() }
    return vocab

# Returns the prior probabilities of all the target classes A, B, E and V
def get_prior_probability(df):
    prior_probability = df['target'].value_counts(normalize=True).to_dict()
    for key, value in prior_probability.items():
        prior_probability[key] = Decimal(str(value))
    return prior_probability 

# Returns a dictionary containing all the words 
def get_conditional_probability(df):
    conditional_probability = dict()
    
    for row in df.itertuples(index = 0): 
     
        line = row.abstract
        target = row.target
        words = [word for word in line.split()]

        for word in words :
            
            if word in conditional_probability.keys():
                if target in conditional_probability[word].keys():
                    conditional_probability[word][target] += 1
                else:
                    conditional_probability[word][target] = 1
            else:
                conditional_probability[word] = {"B": Decimal(0), "E":Decimal(0), "A":Decimal(0), "V":Decimal(0)}
                conditional_probability[word][target] += 1
    return conditional_probability

## Convert conditional probabilities dictionary to a dataframe
def get_conditional_probability_df(conditional_probability):

    data = []
    for word, targets in conditional_probability.items():
        
        row = {'word': word}
        
        for i, (target, count) in enumerate(targets.items()):
            row [target] = count
        data.append(row)
    word_orrurance_table = pd.DataFrame(data)
    word_orrurance_table.fillna(0, inplace=True)
    return word_orrurance_table

In [3]:
# Training the model
def train(df):
    vocab = get_vocab(df)
    prior_probability = get_prior_probability(df)
    conditional_probability_dict = get_conditional_probability(df)
    conditional_probability_df = get_conditional_probability_df(conditional_probability_dict)
    return vocab, prior_probability, conditional_probability_dict,conditional_probability_df
# Testing the model
def Naive_Bayes(vocab, prior_probability, conditional_probability_dict,conditional_probability_df,testing):
    predictions = []
    row_nm = 0
    for row in testing.itertuples(index = 0):
        row_nm += 1
        line = row.abstract
        prediction = predict(vocab,prior_probability,conditional_probability_dict,conditional_probability_df,line)
        predictions.append(prediction)
    return predictions

# Predicting the target class of a given line
def predict(vocab,prior_probability,conditional_probability_dict,conditional_probability_df,line):
    words = line.split()
    
    prob_B = prior_probability['B']
    prob_E = prior_probability['E']
    prob_A = prior_probability['A']
    prob_V = prior_probability['V']

    for word in words:
        if word in vocab:
            prob_B *= conditional_probability_dict[word]['B'] 
            prob_A *= conditional_probability_dict[word]['A'] 
            prob_V *= conditional_probability_dict[word]['V'] 
            prob_E *= conditional_probability_dict[word]['E'] 
    
    max_prob = max(prob_B, prob_A, prob_V, prob_E)
    if max_prob == prob_E:
        return 'E'
    elif max_prob == prob_B:
        return 'B'
    elif max_prob == prob_A:
        return 'A'
    elif max_prob == prob_V:
        return 'V'
    else:
        return 'E'



In [4]:
from sklearn.model_selection import train_test_split

# Reading the data
df = pd.read_csv("trg.csv")
df = df.rename({'class':'target'},axis=1)

# Split the data into training and testing
training, testing = train_test_split(df, test_size=0.3, random_state=42)

vocab, prior_probability, conditional_probability_dict,conditional_probability_df = train(training)
prediction_row = Naive_Bayes(vocab, prior_probability, conditional_probability_dict,conditional_probability_df, testing)

# Calculating  the validation accuracy
validation_acc = (testing['target'] == prediction_row).mean()
print(f"Validation acc: {validation_acc:.4f}")

Validation acc: 0.8158


In [5]:
import pandas as pd
import numpy as np
import csv
from decimal import Decimal
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer



In [6]:
# Data Preprocessing
# Defining the stop words and stemming them
stopWords = set(stopwords.words('english'))
porter = PorterStemmer()
stemmed_stop_words = [porter.stem(word) for word in stopWords]
stopWords.update(stemmed_stop_words)

In [7]:
# Getting filtered set of words from the training dataframe
def Improved_get_vocab(df):
  
    vocab = {word for para in df['abstract'] for word in para.split() if word not in stopWords}
    porter = PorterStemmer()

    vocab = {porter.stem(word) for para in df['abstract'] for word in para.split() if word not in stopWords}
    return vocab
# Getting the prior probabilities of all the target classes A, B, E and V given the original dataframe
def Improved_get_prior_probability(df):
    prior_probability = df['target'].value_counts(normalize=True).to_dict()
    
    for key, value in prior_probability.items():
        prior_probability[key] = Decimal(str(value))
    return prior_probability 
# Getting the conditional probabilities of all the words in vocab 
def Improved_get_conditional_probability(df,vocab):
    conditional_probability = dict()
    for row in df.itertuples(index = 0): 
        line = row.abstract
        target = row.target
        porter = PorterStemmer()
        words = [porter.stem(word) for word in line.split() if word not in stopWords]

        for word in words :
                        
            if word in conditional_probability.keys():
                if target in conditional_probability[word].keys():
                    conditional_probability[word][target] += 1
                else:
                    conditional_probability[word][target] = 1
            else:
                conditional_probability[word] = {"B": Decimal(0), "E":Decimal(0), "A":Decimal(0), "V":Decimal(0)}
                conditional_probability[word][target] += 1
    return conditional_probability
# Converint the conditional probabilities dictionary to a dataframe
def Improved_get_conditional_probability_df(conditional_probability):

    data = []
    for word, targets in conditional_probability.items():
        
        row = {'word': word}
        
        for i, (target, count) in enumerate(targets.items()):
            row [target] = count
        data.append(row)
    word_orrurance_table = pd.DataFrame(data)
    word_orrurance_table.fillna(0, inplace=True)
    return word_orrurance_table



In [8]:


# Training the model
def Improved_train(df, original_df):
    
    vocab = Improved_get_vocab(df)
    prior_probability = Improved_get_prior_probability(original_df)
    conditional_probability_dict = Improved_get_conditional_probability(df,vocab)
    conditional_probability_df = Improved_get_conditional_probability_df(conditional_probability_dict)
    return vocab, prior_probability, conditional_probability_dict,conditional_probability_df

# Testing the model
def Improved_Naive_Bayes(vocab, prior_probability, conditional_probability_dict,conditional_probability_df,testing):
    print("Predicting....") 
    predictions = []
    row_nm = 0
    for row in testing.itertuples(index = 0):
        row_nm += 1
        line = row.abstract
        prediction = Improved_predict(vocab,prior_probability,conditional_probability_dict,conditional_probability_df,line)
        predictions.append(prediction)
    return predictions
# Predicting the target class of a given line
def Improved_predict(vocab,prior_probability,conditional_probability_dict,conditional_probability_df,line):
    words = line.split()
    porter = PorterStemmer()
    words = [ porter.stem(word) for word in words if word not in stopWords]
    prob_B = prior_probability['B'].ln()
    prob_E = prior_probability['E'].ln()
    prob_A = prior_probability['A'].ln()
    prob_V = prior_probability['V'].ln()

    total_B = conditional_probability_df['B'].sum()
    total_E = conditional_probability_df['E'].sum()
    total_A = conditional_probability_df['A'].sum()
    total_V = conditional_probability_df['V'].sum()
    for word in words:
        if word in vocab:
            prob_B += (conditional_probability_dict[word]['B'] + 1).ln() - (Decimal(total_B +  len(vocab))).ln()
            prob_A += (conditional_probability_dict[word]['A'] + 1).ln() - (Decimal(total_A + len(vocab))).ln()
            prob_V += (conditional_probability_dict[word]['V'] + 1).ln() - (Decimal(total_V + len(vocab))).ln()
            prob_E += (conditional_probability_dict[word]['E'] + 1).ln() - (Decimal(total_E + len(vocab))).ln()
   
    
    max_prob = max(prob_B, prob_A, prob_V, prob_E)
    if max_prob == prob_E:
        return 'E'
    elif max_prob == prob_B:
        return 'B'
    elif max_prob == prob_A:
        return 'A'
    elif max_prob == prob_V:
        return 'V'
    else:
        return 'E'
        
   

In [9]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler   

# Reading the data
df = pd.read_csv("trg.csv")
df = df.rename({'class':'target'},axis=1)

# Splitting the data into training and testing
training, testing = train_test_split(df, test_size=0.3, random_state=42)

# Oversampling the training data
X = training.drop('target', axis=1)
y = training['target']
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)
traning_resampled = pd.concat([X_resampled, y_resampled], axis=1)
df_resampled = traning_resampled.sample(frac=1, random_state=42)

# Training the model
vocab, prior_probability, conditional_probability_dict,conditional_probability_df = Improved_train(traning_resampled,training)

# Testing the model
prediction_row = Improved_Naive_Bayes(vocab, prior_probability, conditional_probability_dict,conditional_probability_df, testing)

# Calculating  the accuracy
validation_acc = (testing['target'] == prediction_row).mean()
print(f"Validation acc: {validation_acc:.4f}")

Predicting....
Validation acc: 0.9717


In [10]:
# Reading the data
df = pd.read_csv("trg.csv")
df = df.rename({'class':'target'},axis=1)
test_df = pd.read_csv("tst.csv")
X = df.drop('target', axis=1)
y = df['target']

# Oversampling the training data
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)
df_resampled = df_resampled.sample(frac=1, random_state=42)

# Training the model
vocab, prior_probability, conditional_probability_dict,conditional_probability_df = Improved_train(df_resampled, df)

# Getting predictions
prediction_row = Improved_Naive_Bayes(vocab, prior_probability, conditional_probability_dict,conditional_probability_df, test_df)

# Writing the predictions to a csv file
with open("tst.csv","rt") as source:
    rdr = csv.reader(source)
    c = -1
    with open("raar518.csv","wt", newline='') as result:
        wtr = csv.writer( result )
        for r in rdr:
            if (c==-1): wtr.writerow( (r[0], "class") ) #add csv header
            else: wtr.writerow( (r[0], prediction_row[c]) )
            c+=1

print("Done")

Predicting....
Done
