In [1]:
import glob
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


## Load data

In [78]:
# Root folder of the per-decade CSV splits, relative to this notebook.
DATA_DIR = '../../data/merged_split/'

# One sub-directory path per decade, all built from the same root.
dir_1960, dir_1970, dir_1980, dir_1990 = (
    os.path.join(DATA_DIR, decade)
    for decade in ("1960", "1970", "1980", "1990")
)

def read_data(data_dir):
    """Load every CSV directly under ``data_dir`` into a single DataFrame.

    Parameters
    ----------
    data_dir : str
        Directory searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in sorted-path order, restricted to
        the ``text_split`` and ``labels`` columns. Each file keeps its own
        row index, so index values may repeat across files.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no ``*.csv`` files.
    KeyError
        If the combined frame lacks ``text_split`` or ``labels``.
    """
    # glob yields paths in arbitrary, filesystem-dependent order; sorting
    # pins the row order so any seeded shuffle/split of the result is
    # reproducible from one machine to the next.
    csv_paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
    if not csv_paths:
        # Same exception type pd.concat would raise on an empty sequence,
        # but with a message that names the offending directory.
        raise ValueError(f"No CSV files found in {data_dir!r}")

    frames = [pd.read_csv(path) for path in csv_paths]
    # .copy() returns an independent frame, so callers can assign into it
    # without pandas flagging the write as targeting a copy of a slice.
    return pd.concat(frames)[['text_split', 'labels']].copy()
    
df_1960 = read_data(dir_1960)

# Class balance exactly as loaded from disk.
print(df_1960['labels'].value_counts())
# Fold label 2 into label 1 (in place), then show the balance again.
df_1960.loc[df_1960['labels'].eq(2), 'labels'] = 1
print(df_1960['labels'].value_counts())

2    5253
1    2697
0    2441
Name: labels, dtype: int64
1    7950
0    2441
Name: labels, dtype: int64


In [61]:
# Load the 1990s split in this cell so it runs on a fresh kernel; the name
# df_1990 was otherwise only available as leftover kernel state.
# NOTE(review): the recorded output shows only labels 0 and 1. If the raw
# 1990 files also contain label 2, the same 2 -> 1 merge applied to df_1960
# is needed to reproduce that output — confirm against the data.
df_1990 = read_data(dir_1990)
df_1990['labels'].value_counts()

0    2814
1    1655
Name: labels, dtype: int64

## Split dataset

In [79]:
# Features are the text segments; targets are the labels column.
X = df_1960['text_split'].values
y = df_1960['labels'].values

# Hold out 20% of the rows for validation; the fixed seed makes the
# shuffle repeatable for a given row order.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=2022
)

In [86]:
# Both pipelines use the same n-gram span (unigrams through trigrams) and
# the same classifier; only the text vectorizer differs. Keeping the range
# in one place stops the two definitions from drifting apart.
# TfidfVectorizer / CountVectorizer come from sklearn.feature_extraction.text
# and must be imported in the notebook's import cell.
NGRAM_RANGE = (1, 3)

model_tfidf = make_pipeline(TfidfVectorizer(ngram_range=NGRAM_RANGE), MultinomialNB())
model_count = make_pipeline(CountVectorizer(ngram_range=NGRAM_RANGE), MultinomialNB())

In [81]:
# Fit the TF-IDF pipeline on the training split only.
model_tfidf.fit(X=X_train, y=y_train)


In [82]:
# Fit the raw-count pipeline on the same training split.
model_count.fit(X=X_train, y=y_train)

In [83]:
# Predict validation labels with each fitted pipeline (TF-IDF first,
# then raw counts).
y_pred_tfidf, y_pred_count = (
    fitted.predict(X_val) for fitted in (model_tfidf, model_count)
)

In [84]:
# Score the TF-IDF pipeline's validation predictions.
accuracy = accuracy_score(y_val, y_pred_tfidf)
f1 = f1_score(y_val, y_pred_tfidf, average='weighted')

# One print call, one item per line: header, rule, then the two metrics.
print(
    'Multinomial Naive Bayes with TF-IDF:',
    '-' * 40,
    f'f1: {f1:.4f}',
    f'accuracy: {accuracy:.4f}',
    sep='\n',
)

Multinomial Naive Bayes with TF-IDF:
----------------------------------------
f1: 0.6779
accuracy: 0.7759


In [85]:
# Score the raw-count pipeline's validation predictions.
accuracy = accuracy_score(y_val, y_pred_count)
f1 = f1_score(y_val, y_pred_count, average='weighted')

# One print call, one item per line: header, rule, then the two metrics.
print(
    'Multinomial Naive Bayes with Word Count:',
    '-' * 40,
    f'f1: {f1:.4f}',
    f'accuracy: {accuracy:.4f}',
    sep='\n',
)

Multinomial Naive Bayes with Word Count:
----------------------------------------
f1: 0.7101
accuracy: 0.7860
