# Importing and PreProcessing  Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

data_path = "/content/drive/MyDrive/NLP_CA2/training_1600000_processed_noemoticon.csv"

df = pd.read_csv(data_path , encoding = 'latin-1')

In [None]:
df.rename(columns={'0': 'target',
                   '1467810369': 'ids' ,
                   'Mon Apr 06 22:19:45 PDT 2009' : 'date' ,
                   'NO_QUERY' : 'flag' ,
                   '_TheSpecialOne_' : 'user' ,
                   '@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D' : 'text'

}, inplace=True)

In [None]:
# df.head().where(df[df['target'] == 4])

# df['target' == 4]
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


**split dataset into partitions**

In [None]:
import pandas as pd

positive_samples = df[df['target'] == 4].sample(n=5000, random_state = 62)
negative_samples = df[df['target'] == 0].sample(n=5000, random_state = 62)

selected_samples = pd.concat([positive_samples, negative_samples])

random_samples = selected_samples.sample(n = 2000 , random_state = 42)

**tokenization**

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def remove_punctuations(text):
  # initializing punctuations string
  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

  for ele in text:
      if ele in punc:
          text = text.replace(ele  ,  "")
  return text

def tokenize(text):
    words = text.lower().split()
    words = remove_punctuations(text)
    stems = stemmer.stem(text)
    filtered_words = [word for word in stems if word not in stopwords.words('english')]
    return set(filtered_words)

stemmer = PorterStemmer()
vocabulary = set()
vocab_positive = set()
vocab_negative = set()


for index , row in selected_samples.iterrows() :
    # print(f'original text             : {text}')
    # print(item)
    text = row['text']
    target = row['target']

    text = remove_punctuations(text)

    # print(f'text without punctuations : {text}')

    stems = stemmer.stem(text)
    # print(f'stems                     : {stems}')
    # print('------------------------------------')

    if target == 0:
      vocab_negative.update(tokenize(stems))

    else:
      vocab_positive.update(tokenize(stems))

    vocabulary.update(tokenize(stems))

print("Length of Vocabulary:", len(vocabulary))
print("Length of Negative Vocabulary:", len(vocab_negative))
print("Length of Positive Vocabulary:", len(vocab_positive))


Length of Vocabulary: 104
Length of Negative Vocabulary: 68
Length of Positive Vocabulary: 99


#**Build Document Term Matrix**

In [None]:
def build_document_term_matrix(data , vocabulary):
    matrix = []
    for index , row in data.iterrows():
        ids = row['ids']
        text = row['text']

        word_count = {word : 0 for word in vocabulary}
        tokens = tokenize(text)

        for token in tokens :
          if token in vocabulary:
            word_count[token] += 1

        matrix.append(list(word_count.values()))

    return matrix

# Build document-term matrix
document_term_negative = build_document_term_matrix(negative_samples , vocab_negative)
document_term_positive = build_document_term_matrix(positive_samples , vocab_positive)

# print(f'length of document_term_negative : {len(document_term_negative)}')
# print(f'length of document_term_positive : {len(document_term_positive)}')

**naive bayes (doucment term matrix)**

In [None]:
import numpy as np


class naive_bayes_1:
  def __init__(self , document_term_negative , document_term_positive , vocab_positive , vocab_negative):
    self.prior = {}
    self.likelihood = {}

    self.positives = document_term_positive
    self.negatives = document_term_negative

    self.vocab_positive = vocab_positive
    self.vocab_negative = vocab_negative


  def fit(self , X_train , y_train):
    classes , counts = np.unique(y_train , return_counts = True)

    total_samples = len(y_train)
    for c, count in zip(classes, counts):
        self.prior[c] = count / total_samples

    unq_words_count_positive = len(vocab_positive)
    unq_words_count_negative = len(vocab_negative)

    matrix  = np.array(self.positives)
    matrix2 = np.array(self.negatives)

    all_words_count_positive = np.sum(matrix)
    all_words_count_negative = np.sum(matrix2)

    # print(f'shape of matrix {matrix.shape}')
    # print(f'length of positives : {len(self.positives)}')

    # for i in range(0 , len(self.positives)):
    #   all_words_count_positive += np.sum(matrix[: , i])

    # for i in range(0 , len(self.negatives)):
    #   all_words_count_negative += np.sum(matrix[: , i])

    # matrix = np.array(self.positives)

    self.likelihood[4] = {}
    self.likelihood[0] = {}

    index = 0
    for word in self.vocab_positive:
      column_sum = np.sum(matrix[:, index])
      self.likelihood[4][word] = (1 + column_sum) / (all_words_count_positive + unq_words_count_positive)
      index += 1


    index = 0
    for word in self.vocab_negative:
      column_sum = np.sum(matrix2[:, index])
      self.likelihood[0][word] = (1 + column_sum) / (all_words_count_negative + unq_words_count_negative)
      index += 1


  def predict(self , X_test):
    X_test = np.array(X_test)

    df = pd.DataFrame(X_test , columns = ['ids', 'date', 'flag' , 'user' , 'text'])
    df.head()

    predicted_labels = []

    for index , row in df.iterrows():
      class_scores = {}
      class_scores[0] = np.log(self.prior[0])
      class_scores[4] = np.log(self.prior[4])

      # print(f'row.columns : {row.columns}' )
      # ids  = row['ids']
      text = row['text']
      text = tokenize(text)

      for word in text:
        if word in vocab_negative:
              class_scores[0] += np.log(self.likelihood[0][word])

        if word in vocab_positive:
              class_scores[4] += np.log(self.likelihood[4][word])

      predicted_labels.append(0 if class_scores[0] > class_scores[4] else 4)

    return predicted_labels


  def calculate_metrics(self , y_pred, true_labels):
    true_positives = sum((pred == 4 and true == 4) for pred, true in zip(y_pred, true_labels))
    false_positives = sum((pred == 4 and true == 0) for pred, true in zip(y_pred, true_labels))
    false_negatives = sum((pred == 0 and true == 4) for pred, true in zip(y_pred, true_labels))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

**Using Naive Bayes Model**

In [None]:
from sklearn.model_selection import train_test_split


random_samples = selected_samples.sample(n = 2000 , random_state = 42)

# Split data into features and labels
X = selected_samples.drop('target', axis=1).values
y = selected_samples['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


nb = naive_bayes_1(document_term_negative , document_term_positive , vocab_positive , vocab_negative)
nb.fit(X_train , y_train)

predicted_labels = nb.predict(X_test)
true_labels = y_test

precision, recall, f1_score = nb.calculate_metrics(predicted_labels , true_labels)
print(f'precision : {precision}')
print(f'recall : {recall}')
print(f'f1_score : {f1_score}')

precision : 0.6042345276872965
recall : 0.366600790513834
f1_score : 0.45633456334563344


# **calculate tf-idf**

**calculate TF-IDF**

In [None]:
import math
import numpy as np

def calculate_tfidf(matrix):
    num_documents = len(matrix)
    word_count_per_document = np.sum(np.array(matrix) > 0, axis=0)

    idf_values = {j: math.log(num_documents / count) if count != 0 else 0 for j, count in enumerate(word_count_per_document)}

    tfidf_matrix = np.zeros_like(matrix, dtype=float)
    for i, row in enumerate(matrix):
        for j, count in enumerate(row):
            if count > 0:
                tf = count / np.sum(row)
                tfidf_matrix[i, j] = tf * idf_values.get(j, 0)

    return tfidf_matrix




# Calculate TF-IDF values
tfidf_matrix_negative = calculate_tfidf(document_term_negative)
tfidf_matrix_positive = calculate_tfidf(document_term_positive)

print(f'length of tfidf_matrix_positive : {len(tfidf_matrix_positive)}')
print(f'length of tfidf_matrix_negative : {len(tfidf_matrix_negative)}')

# Print the TF-IDF matrix
# for row in tfidf_matrix:
#     print(row)

**naive bayes with TF-IDF Matrix**

In [None]:
import numpy as np


class naive_bayes_2:

  def __init__(self , document_term_negative , document_term_positive , vocab_positive , vocab_negative):
    self.prior = {}
    self.likelihood = {}

    self.positives = document_term_positive
    self.negatives = document_term_negative

    self.vocab_positive = vocab_positive
    self.vocab_negative = vocab_negative


  def fit(self , X_train , y_train):
    classes , counts = np.unique(y_train , return_counts = True)

    total_samples = len(y_train)
    for c, count in zip(classes, counts):
        self.prior[c] = count / total_samples

    unq_words_count_positive = len(vocab_positive)
    unq_words_count_negative = len(vocab_negative)


    all_words_count_positive = 0
    all_words_count_negative = 0

    matrix  = np.array(self.positives)
    matrix2 = np.array(self.negatives)

    all_words_tfidf_positive = np.sum(matrix)
    all_words_tfidf_negative = np.sum(matrix2)

    # for i in range(len(self.positives)):
    #   all_words_count_positive += np.sum(matrix[: , i])

    # for i in range(len(self.negatives)):
    #   all_words_count_negative += np.sum(matrix[: , i])

    # matrix = np.array(self.positives)

    self.likelihood[4] = {}
    self.likelihood[0] = {}

    index = 0
    for word in self.vocab_positive:
      column_sum = np.sum(matrix[:, index])
      self.likelihood[4][word] = (1 + column_sum) / (all_words_tfidf_positive + unq_words_count_positive)
      index += 1


    index = 0
    for word in self.vocab_negative:
      column_sum = np.sum(matrix2[:, index])
      self.likelihood[0][word] = (1 + column_sum) / (all_words_tfidf_negative + unq_words_count_negative)
      index += 1


  def predict(self , X_test):
    X_test = np.array(X_test)

    df = pd.DataFrame(X_test , columns = ['ids', 'date', 'flag' , 'user' , 'text'])
    df.head()

    predicted_labels = []# print(f'length of document_matrix : {len(document_term_matrix)}')
# Print the document-term matrix
# for row in document_term_matrix:
#     print(row)

    for index , row in df.iterrows():
      class_scores = {}
      class_scores[0] = np.log(self.prior[0])
      class_scores[4] = np.log(self.prior[4])

      # print(f'row.columns : {row.columns}' )
      # ids  = row['ids']
      text = row['text']
      text = tokenize(text)

      for word in text:
        if word in vocab_negative:
              class_scores[0] += np.log(self.likelihood[0][word])

        if word in vocab_positive:
              class_scores[4] += np.log(self.likelihood[4][word])

      predicted_labels.append(0 if class_scores[0] > class_scores[4] else 4)

    return predicted_labels


  def calculate_metrics(self , y_pred, true_labels):
    true_positives = sum((pred == 4 and true == 4) for pred, true in zip(y_pred, true_labels))
    false_positives = sum((pred == 4 and true == 0) for pred, true in zip(y_pred, true_labels))
    false_negatives = sum((pred == 0 and true == 4) for pred, true in zip(y_pred, true_labels))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

**Using naive bayes with TF-IDF Matrix**

In [None]:
from sklearn.model_selection import train_test_split


random_samples = selected_samples.sample(n = 2000 , random_state = 42)

# Split data into features and labels
X = selected_samples.drop('target', axis=1).values
y = selected_samples['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


nb = naive_bayes_1(tfidf_matrix_negative , tfidf_matrix_positive , vocab_positive , vocab_negative)
nb.fit(X_train , y_train)

predicted_labels = nb.predict(X_test)
true_labels = y_test

precision, recall, f1_score = nb.calculate_metrics(predicted_labels , true_labels)
print(f'precision : {precision}')
print(f'recall : {recall}')
print(f'f1_score : {f1_score}')

precision : 0.7142857142857143
recall : 0.004940711462450593
f1_score : 0.009813542688910697


# **ppmi matrix**

In [18]:
import numpy as np

def build_ppmi_matrix(df , vocabulary):
  co_occurence = {}
  vocab_size = len(vocabulary)
  vocabulary = list(vocabulary)

  co_occurrence_matrix = np.zeros((vocab_size, vocab_size))

  for index, row in df.iterrows():
    tokenized = tokenize(row['text'])

    for x in tokenized:
      for y in tokenized:
        if x in vocabulary and y in vocabulary:
          i = vocabulary.index(x)
          j = vocabulary.index(y)

        if x == y:
          if x not in co_occurence.keys():
            co_occurence[x] = 1
            co_occurrence_matrix[i][i] = 1

          else:
            co_occurence[x] = co_occurence[x] + 1
            co_occurrence_matrix[i][i] = co_occurrence_matrix[i][i] + 1

          continue

        # print(x , y)

        key  = (x , y)
        key2 = (y , x)

        if key not in co_occurence.keys():
          co_occurence[key] = 1
          co_occurence[key2] = 1
          co_occurrence_matrix[i][j] = 1
          co_occurrence_matrix[j][i] = 1


        else:
          co_occurence[key] = co_occurence[key] + 1
          co_occurence[key2] = co_occurence[key2] + 1
          co_occurrence_matrix[i][j] = co_occurrence_matrix[i][j] + 1
          co_occurrence_matrix[j][i] = co_occurrence_matrix[j][i] + 1



  co_occurrence_matrix = np.nan_to_num(co_occurrence_matrix)

  # Calculate probabilities
  word_counts = np.sum(co_occurrence_matrix, axis=1)
  total_word_count = np.sum(word_counts)
  P_A = word_counts / total_word_count
  P_B = word_counts / total_word_count
  P_A_B = co_occurrence_matrix / total_word_count

  # Calculate PMI
  PMI_matrix = np.log(P_A_B / (P_A[:, None] * P_B))

  # Calculate PPMI
  PPMI_matrix = np.maximum(PMI_matrix, 0)

  # PPMI_matrix = np.nan_to_num(PPMI_matrix)

  return PPMI_matrix

ppmi_matrix_negative = build_ppmi_matrix(negative_samples , vocab_negative)
ppmi_matrix_positive = build_ppmi_matrix(positive_samples , vocab_positive)

print(f'ppmi_matrix_negative : {ppmi_matrix_negative[0 : 100]}')

  PMI_matrix = np.log(P_A_B / (P_A[:, None] * P_B))


ppmi_matrix_negative : [[7.57613742 0.         0.         ... 0.         0.         0.        ]
 [0.         4.77498898 0.         ... 4.93082885 5.2847603  0.74427915]
 [0.         0.         0.26339893 ... 0.         0.         0.        ]
 ...
 [0.         4.93082885 0.         ... 6.3786524  0.         1.58119362]
 [0.         5.2847603  0.         ... 0.         6.28003945 0.60012401]
 [0.         0.74427915 0.         ... 1.58119362 0.60012401 0.        ]]


**Naive Bayes Model**

In [None]:
import numpy as np

class naive_bayes_3:
  def __init__(self , document_term_negative , document_term_positive , vocab_positive , vocab_negative):
    self.prior = {}
    self.likelihood = {}

    self.positives = document_term_positive
    self.negatives = document_term_negative

    self.vocab_positive = list(vocab_positive)
    self.vocab_negative = list(vocab_negative)


  def fit(self , X_train , y_train):
    classes , counts = np.unique(y_train , return_counts = True)

    total_samples = len(y_train)
    for c, count in zip(classes, counts):
        self.prior[c] = count / total_samples

    unq_words_count_positive = len(vocab_positive)
    unq_words_count_negative = len(vocab_negative)

    matrix  = np.array(self.positives)
    matrix2 = np.array(self.negatives)

    all_words_ppmi_positive = np.sum(matrix)
    all_words_ppmi_negative = np.sum(matrix2)

    # self.likelihood[4] = {}
    # self.likelihood[0] = {}

    # index = 0
    # for word in self.vocab_positive:
    #   column_sum = np.sum(matrix[:, index])
    #   # print(f'column_sum : {column_sum}')
    #   self.likelihood[4][word] = (1 + column_sum) / (all_words_ppmi_positive + unq_words_count_positive)
    #   index += 1


    # index = 0
    # for word in self.vocab_negative:
    #   column_sum = np.sum(matrix2[:, index])
    #   # print(f'column_sum : {column_sum}')
    #   self.likelihood[0][word] = (1 + column_sum) / (all_words_ppmi_negative + unq_words_count_negative)
    #   index += 1


  def predict(self , X_test):
    X_test = np.array(X_test)

    df = pd.DataFrame(X_test , columns = ['ids', 'date', 'flag' , 'user' , 'text'])
    df.head()

    predicted_labels = []

    for index , row in df.iterrows():
      class_scores = {}
      class_scores[0] = np.log(self.prior[0])
      class_scores[4] = np.log(self.prior[4])

      text = row['text']
      text = tokenize(text)

      length_of_sentence = len(text)

      desired_positive = np.zeros(len(self.vocab_positive))
      desired_negative = np.zeros(len(self.vocab_negative))

      inedexes_positive = []
      indexes_negative  = []

      for word in text:
        if word in vocab_negative:
              index = self.vocab_negative.index(word)
              indexes_negative.append(index)
              desired_negative = desired_negative + self.negatives[index]
              # class_scores[0] += np.log(self.likelihood[0][word])

        if word in vocab_positive:
              index = self.vocab_positive.index(word)
              inedexes_positive.append(index)
              desired_positive = desired_positive + self.positives[index]
              # class_scores[4] += np.log(self.likelihood[4][word])

      desired_negative = desired_negative / length_of_sentence
      desired_positive = desired_positive / length_of_sentence

      class_scores[0] += sum(np.log(x) if index in indexes_negative else 0 for index , x in enumerate(desired_negative))
      class_scores[4] += sum(np.log(x) if index in inedexes_positive else 0 for index , x in enumerate(desired_positive))


      predicted_labels.append(0 if class_scores[0] > class_scores[4] else 4)

    return predicted_labels


  def calculate_metrics(self , y_pred, true_labels):
    true_positives = sum((pred == 4 and true == 4) for pred, true in zip(y_pred, true_labels))
    false_positives = sum((pred == 4 and true == 0) for pred, true in zip(y_pred, true_labels))
    false_negatives = sum((pred == 0 and true == 4) for pred, true in zip(y_pred, true_labels))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

In [None]:
from sklearn.model_selection import train_test_split

# Split data into features and labels
X = selected_samples.drop('target', axis=1).values
y = selected_samples['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 52)


nb = naive_bayes_3(ppmi_matrix_negative , ppmi_matrix_positive , vocab_positive , vocab_negative)
nb.fit(X_train , y_train)

predicted_labels = nb.predict(X_test)
true_labels = y_test

precision, recall, f1_score = nb.calculate_metrics(predicted_labels , true_labels)
print(f'precision : {precision}')
print(f'recall : {recall}')
print(f'f1_score : {f1_score}')

  class_scores[4] += sum(np.log(x) if index in inedexes_positive else 0 for index , x in enumerate(desired_positive))
  class_scores[0] += sum(np.log(x) if index in indexes_negative else 0 for index , x in enumerate(desired_negative))


precision : 0.46464646464646464
recall : 0.046890927624872576
f1_score : 0.08518518518518518
