In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv


## Part 1.1

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word list and keep it as a set so the membership test in
# preprocess_text is a hash lookup rather than a list scan.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

# Single seed shared by the sampling and the train/test split below.
SEED = 42

# Load Sentiment140. The CSV has no header row, so column names are supplied
# explicitly; the encoding is set to avoid a potential UnicodeDecodeError.
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None,
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'])

# Balanced sample: 5000 tweets with target 0 (negative) and 5000 with target 4
# (positive). The draw is seeded so that the sample -- and every metric computed
# from it further down -- is identical on each Restart & Run All.
df_negative = df[df['target'] == 0].sample(5000, random_state=SEED)
df_positive = df[df['target'] == 4].sample(5000, random_state=SEED)
df_sampled = pd.concat([df_negative, df_positive])

# Split dataset into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(df_sampled['text'], df_sampled['target'], test_size=0.2, random_state=SEED)

# Preprocessing function
def preprocess_text(text):
    """Return a cleaned, space-separated version of ``text``.

    The text is lowercased, every non-word character is replaced by a space,
    whitespace runs are collapsed, and tokens found in the module-level
    ``STOPWORDS`` set are dropped.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    lowered = text.lower()
    no_punct = re.sub(r'\W', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', no_punct)
    kept_tokens = [token for token in single_spaced.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Apply preprocessing
# Both splits go through the same cleaning function so they share one token format.
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)

# Tokenization using CountVectorizer
# The vocabulary is learned from the training split only (fit_transform); the
# test split is then encoded against that fixed vocabulary (transform).
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train_processed)
X_test_vectors = vectorizer.transform(X_test_processed)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Part 1.2

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk


# Term-frequency preview over the FULL corpus.
#
# Everything derived from the full corpus is stored under *_full names so that
# the 10,000-tweet sample split created in Part 1.1 (X_train, X_test, y_train,
# y_test) stays intact for the cells below, which build dense matrices and
# cannot scale to 1.6M documents. The raw `df`, `STOPWORDS` and
# `preprocess_text` from Part 1.1 are reused instead of re-reading the CSV and
# re-defining the function.
text_full = df['text'].apply(preprocess_text)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    text_full, df['target'], test_size=0.2, random_state=42)

vectorizer = CountVectorizer()
X_train_tf = vectorizer.fit_transform(X_train_full)

# Show only the top-left 50 x 50 corner; the full matrix is far too large to print.
tf_slice = X_train_tf[:50, :50].toarray()
feature_names = vectorizer.get_feature_names_out()[:50]
tf_df = pd.DataFrame(tf_slice, columns=feature_names, index=np.arange(1, 51))
tf_df.index.name = 'Document Index'
print(tf_df)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                00  000  0000  00000  0000001  000001  000014  \
Document Index                                                  
1                0    0     0      0        0       0       0   
2                0    0     0      0        0       0       0   
3                0    0     0      0        0       0       0   
4                0    0     0      0        0       0       0   
5                0    0     0      0        0       0       0   
6                0    0     0      0        0       0       0   
7                0    0     0      0        0       0       0   
8                0    0     0      0        0       0       0   
9                0    0     0      0        0       0       0   
10               0    0     0      0        0       0       0   
11               0    0     0      0        0       0       0   
12               0    0     0  

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk


def preprocess_text(text):
    """Return a cleaned, space-separated version of ``text``.

    Steps: lowercase, replace every non-word character with a space, collapse
    whitespace runs, then drop English stop words.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Filter against the module-level STOPWORDS set built in Part 1.1. Calling
    # stopwords.words('english') inside the comprehension would rebuild the
    # list and the set once per token.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)


# Build the term-frequency matrix from the preprocessed training tweets.
# X_train_processed is also the corpus the later TF-IDF and PPMI cells read.
vectorizer = CountVectorizer()
X_train_processed = X_train.apply(preprocess_text)
X_train_tf = vectorizer.fit_transform(X_train_processed)


# Choose rows and columns that each hold at least one non-zero count, so the
# printed window is not made up purely of all-zero rows/columns.
non_zero_rows, non_zero_cols = X_train_tf.nonzero()
selected_rows = np.unique(non_zero_rows[:400])[:20]  # up to 20 distinct row positions among the first 400 non-zero entries
selected_cols = np.unique(non_zero_cols[:400])[:20]  # up to 20 distinct column positions among the same entries
tf_slice = X_train_tf[selected_rows, :][:, selected_cols].toarray()
selected_feature_names = np.array(vectorizer.get_feature_names_out())[selected_cols]
# The index holds positional row numbers of the matrix, not the original DataFrame labels.
tf_df = pd.DataFrame(tf_slice, columns=selected_feature_names, index=selected_rows)
tf_df.index.name = 'Document'
print(tf_df)

          16  23  50  aanddfilms  abacab1975  abbey  ached  acheing  affair  \
Document                                                                      
0          0   0   0           0           0      0      0        0       0   
1          1   0   0           0           0      0      0        0       0   
2          0   0   0           0           0      0      0        0       0   
3          0   0   0           0           0      0      0        0       0   
4          0   0   0           0           0      0      0        0       0   
5          0   0   0           0           0      0      0        0       0   
6          0   0   0           0           0      0      0        0       0   
7          0   0   0           0           0      0      0        0       0   
8          0   0   0           0           0      0      0        0       0   
9          0   0   0           0           0      1      0        0       0   
10         0   0   0           0           0      0 

## Part 1.3

In [4]:
import math
from collections import Counter

def compute_tf(text):
    """Compute the relative term frequency of each token in ``text``.

    Args:
        text: A whitespace-separated document string.

    Returns:
        A dict mapping each distinct token to ``occurrences / total tokens``.
        An empty or whitespace-only document yields an empty dict.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    frequencies = {}
    for token, occurrences in Counter(tokens).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies

def compute_idf(documents):
    """Compute a smoothed inverse document frequency for every word in a corpus.

    For a word that appears in ``df`` of the ``N`` documents the score is
    ``log(N / (1 + df)) + 1``.

    The corpus is scanned once. Keys of the result follow first-appearance
    order in the corpus, so the output is identical from run to run instead of
    depending on string-hash ordering of an intermediate set.

    Args:
        documents: A sized iterable of whitespace-separated document strings
            (for example a list or a pandas Series).

    Returns:
        A dict mapping each word to its IDF score; empty for an empty corpus.
    """
    N = len(documents)
    # dict.fromkeys de-duplicates the tokens of one document while keeping
    # their order, so each document contributes at most 1 to a word's count.
    word_doc_counts = Counter(
        word
        for document in documents
        for word in dict.fromkeys(document.split())
    )
    return {
        word: math.log(N / (1 + doc_count)) + 1  # with smoothing
        for word, doc_count in word_doc_counts.items()
    }

def compute_tfidf(tfs, idfs):
    """Combine one document's term frequencies with corpus-level IDF scores.

    Args:
        tfs: Dict mapping word -> term frequency for a single document.
        idfs: Dict mapping word -> IDF score.

    Returns:
        A dict mapping every word of ``tfs`` to ``tf * idf``. A word without
        an IDF entry is treated as having IDF 0.
    """
    return {term: weight * idfs.get(term, 0) for term, weight in tfs.items()}

# Per-document TF, corpus-level IDF, then per-document TF-IDF dictionaries.
tfs_train = [compute_tf(doc) for doc in X_train_processed]
idfs = compute_idf(X_train_processed)
tfidfs_train = [compute_tfidf(tf, idfs) for tf in tfs_train]

# Columns are the vocabulary in sorted order.
features = sorted(idfs)
# Word -> column lookup. Searching the `features` list directly (`in` plus
# `.index`) would scan the whole vocabulary for every (document, word) pair.
feature_index = {word: col for col, word in enumerate(features)}

# Dense documents x vocabulary matrix, zero wherever a word is absent.
tfidf_matrix = np.zeros((len(tfidfs_train), len(features)))
for i, doc_tfidfs in enumerate(tfidfs_train):
    for word, score in doc_tfidfs.items():
        col = feature_index.get(word)
        if col is not None:
            tfidf_matrix[i, col] = score

tfidf_df = pd.DataFrame(tfidf_matrix, columns=features)
tfidf_df

Unnamed: 0,0,00,000,000s,000th,002,00am,02,030,0410,...,ð½ð¾,ð½ñ,ð¾,ð¾ð,ð¾ð²ðºðµ,ð¾ð²ð¾ð,ð¾ð¼,ð¾ð½ñ,ð¾ñ,ñ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Part 1.4

In [5]:
from collections import defaultdict
import math

# Document frequency: number of training documents that contain each word.
doc_freq = defaultdict(int)
for doc in X_train_processed:
    words = set(doc.split())
    for word in words:
        doc_freq[word] += 1

# Symmetric co-occurrence counts over every pair of distinct words in a document.
# NOTE(review): pairs are counted per token position, so a word repeated inside
# one document contributes more than once, whereas doc_freq counts a document
# only once -- confirm this mix is intended.
co_occurrence = defaultdict(lambda: defaultdict(int))
N = len(X_train_processed)
for doc in X_train_processed:
    words = doc.split()
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            if words[i] != words[j]:
                co_occurrence[words[i]][words[j]] += 1
                co_occurrence[words[j]][words[i]] += 1

# PPMI(i, j) = max(log2(p_ij / (p_i * p_j)), 0), stored only for observed pairs.
ppmi = defaultdict(lambda: defaultdict(int))
epsilon = 1e-10  # Handling ZeroDivisionError
for word_i, neighbours in co_occurrence.items():
    p_i = doc_freq[word_i] / N
    for word_j, pair_count in neighbours.items():
        p_ij = pair_count / N
        p_j = doc_freq[word_j] / N
        pmi = math.log2((p_ij + epsilon) / (p_i * p_j + epsilon))
        ppmi[word_i][word_j] = max(pmi, 0)

unique_words = list(doc_freq.keys())
word_index = {word: idx for idx, word in enumerate(unique_words)}

# Fill the dense matrix from the stored (observed) pairs only. Reading
# ppmi[word_i][word_j] for every vocabulary pair would make the nested
# defaultdicts insert a zero for each missing pair, growing `ppmi` to V*V entries.
ppmi_matrix = np.zeros((len(unique_words), len(unique_words)))
for word_i, row in ppmi.items():
    row_idx = word_index[word_i]
    for word_j, value in row.items():
        ppmi_matrix[row_idx, word_index[word_j]] = value

ppmi_df = pd.DataFrame(ppmi_matrix, index=unique_words, columns=unique_words)
ppmi_df.iloc[:20, :20]

Unnamed: 0,people,messing,like,tat2dsteelergal,fun,easy,go,think,puppy,mowgli3,gets,named,months,4,ago,day,16,years,dog,us
people,0.0,5.351028,1.396878,6.350981,0.859221,1.958754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029146,0.0,0.0,0.0,1.89164,0.0,0.0
messing,5.351028,0.0,4.426614,11.961176,4.888936,7.573248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
like,1.396878,4.426614,0.0,4.426602,0.0,0.034308,0.0,0.210688,2.104694,0.0,0.0,0.0,0.0,0.0,0.782769,0.050759,0.0,0.0,0.671738,0.30561
tat2dsteelergal,6.350981,11.961176,4.426602,0.0,5.888901,8.573028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fun,0.859221,4.888936,0.0,5.888901,0.0,2.496649,0.0,0.088069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.030987
easy,1.958754,7.573248,0.034308,8.573028,2.496649,0.0,1.297342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
go,0.0,0.0,0.0,0.0,0.0,1.297342,0.0,0.058685,2.367727,4.689631,1.104697,2.689654,0.782768,1.175086,1.045803,0.445037,2.367729,0.230228,1.519734,0.831679
think,0.0,0.0,0.210688,0.0,0.088069,0.0,0.058685,0.0,2.842949,5.164844,1.579921,4.164875,2.257992,0.427919,1.521027,0.644628,1.842953,0.705452,0.409997,1.043869
puppy,0.0,0.0,2.104694,0.0,0.0,0.0,2.367727,2.842949,0.0,10.642012,6.058818,8.643396,6.736844,4.321906,5.999927,1.875669,7.321745,6.184342,6.888901,4.200893
mowgli3,0.0,0.0,0.0,0.0,0.0,0.0,4.689631,5.164844,10.642012,0.0,8.380438,10.963479,9.058279,6.643742,8.32156,4.19758,9.642934,8.505934,8.210556,6.522736


## Part 1.5

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from collections import defaultdict
import math

def ppmi_to_matrix(ppmi, unique_words):
    """Convert a nested ``{word_i: {word_j: score}}`` mapping into a dense matrix.

    Only the stored entries of ``ppmi`` are visited, so the cost is
    proportional to the number of stored pairs rather than ``len(unique_words)**2``
    Python-level lookups. ``ppmi`` is never indexed with ``[]``, so a
    ``defaultdict`` argument is not mutated and a plain dict with missing rows
    does not raise.

    Args:
        ppmi: Mapping of word -> mapping of word -> score.
        unique_words: Sequence of words defining row/column order. Entries are
            assumed to be distinct.

    Returns:
        A ``(len(unique_words), len(unique_words))`` float array where
        ``[i, j]`` is ``ppmi[unique_words[i]][unique_words[j]]`` when stored
        and 0 otherwise. Words in ``ppmi`` that are not in ``unique_words``
        are ignored.
    """
    position = {word: idx for idx, word in enumerate(unique_words)}
    ppmi_matrix = np.zeros((len(unique_words), len(unique_words)))
    for word_i, row in ppmi.items():
        i = position.get(word_i)
        if i is None:
            continue
        for word_j, value in row.items():
            j = position.get(word_j)
            if j is not None:
                ppmi_matrix[i, j] = value
    return ppmi_matrix

def documents_to_feature_vectors(documents, ppmi, unique_words):
    """Encode documents as vectors of per-word PPMI row sums.

    For each distinct word of a document that is present in ``unique_words``,
    the word's column is set to the sum of all PPMI scores stored for that
    word; every other column stays 0.

    Column lookup uses a dict and each word's row sum is computed once, rather
    than scanning the vocabulary list and re-summing for every occurrence. A
    word with no entry in ``ppmi`` contributes 0 and ``ppmi`` is not modified.

    Args:
        documents: Sized sequence of whitespace-separated document strings.
        ppmi: Mapping of word -> mapping of word -> score.
        unique_words: Sequence of words defining the column order.

    Returns:
        A ``(len(documents), len(unique_words))`` float array.
    """
    column_of = {}
    for idx, word in enumerate(unique_words):
        # Keep the first position of a word, matching list.index semantics.
        column_of.setdefault(word, idx)

    row_sum = {}  # word -> sum of its stored PPMI scores, filled on first use
    feature_vectors = np.zeros((len(documents), len(unique_words)))
    for doc_idx, doc in enumerate(documents):
        for word in set(doc.split()):
            word_idx = column_of.get(word)
            if word_idx is None:
                continue
            if word not in row_sum:
                row_sum[word] = sum(ppmi.get(word, {}).values())
            feature_vectors[doc_idx, word_idx] = row_sum[word]
    return feature_vectors

unique_words = list(doc_freq.keys())  # fixes the column order of the feature matrices below
ppmi_matrix = ppmi_to_matrix(ppmi, unique_words)

# One row per document; both splits are encoded against the same vocabulary order.
X_train_ppmi_features = documents_to_feature_vectors(X_train_processed.tolist(), ppmi, unique_words)
X_test_ppmi_features = documents_to_feature_vectors(X_test_processed.tolist(), ppmi, unique_words)


# Fit Gaussian Naive Bayes on the dense PPMI feature vectors.
clf_ppmi = GaussianNB()
clf_ppmi.fit(X_train_ppmi_features, y_train)


# Evaluate on the held-out split; average='weighted' weights each class's
# metric by its number of true instances.
predictions_ppmi = clf_ppmi.predict(X_test_ppmi_features)
precision_ppmi, recall_ppmi, f1_score_ppmi, _ = precision_recall_fscore_support(y_test, predictions_ppmi, average='weighted')

print(f'PPMI - Precision: {precision_ppmi}, Recall: {recall_ppmi}, F1-Score: {f1_score_ppmi}')

PPMI - Precision: 0.5775232882438442, Recall: 0.5605, F1-Score: 0.5294613727135425


In [7]:
# MultinomialNB is not imported anywhere else in the notebook; without this
# line the cell raises NameError on a fresh kernel.
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase ``text``, strip non-word characters, collapse whitespace and drop stop words.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

# NOTE(review): this cell reloads and splits the FULL corpus, while the PPMI
# classifier above was fit on the 10,000-tweet sample, so the two reported
# scores come from different train/test sets -- confirm this is intended.
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None,
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'])
df['text'] = df['text'].apply(preprocess_text)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Vocabulary is learned on the training split only and reused for the test split.
vectorizer = CountVectorizer()
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the sparse term-count features.
clf_tf = MultinomialNB()
clf_tf.fit(X_train_tf, y_train)

predictions_tf = clf_tf.predict(X_test_tf)
precision_tf, recall_tf, f1_score_tf, _ = precision_recall_fscore_support(y_test, predictions_tf, average='weighted')

print(f'Term Frequency - Precision: {precision_tf}, Recall: {recall_tf}, F1-Score: {f1_score_tf}')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Term Frequency - Precision: 0.7715001155897544, Recall: 0.7704375, F1-Score: 0.7702442832482842
