In [1]:
# 1–2: Import libraries and dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score


In [2]:
train_df = pd.read_csv("/home/user/Downloads/Ml_Lab/nlp_train.csv")
test_df = pd.read_csv("/home/user/Downloads/Ml_Lab/nlp_test.csv")
print(train_df)
print(test_df)

                                                    text  target  \
0      I was wondering if anyone out there could enli...       7   
1      A fair number of brave souls who upgraded thei...       4   
2      well folks, my mac plus finally gave up the gh...       4   
3      \nDo you have Weitek's address/phone number?  ...       1   
4      From article <C5owCB.n3p@world.std.com>, by to...      14   
...                                                  ...     ...   
11309  DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...      13   
11310  I have a (very old) Mac 512k and a Mac Plus, b...       4   
11311  I just installed a DX2-66 CPU in a clone mothe...       3   
11312  \nWouldn't this require a hyper-sphere.  In 3-...       1   
11313  Stolen from Pasadena between 4:30 and 6:30 pm ...       8   

                       category  
0                     rec.autos  
1         comp.sys.mac.hardware  
2         comp.sys.mac.hardware  
3                 comp.graphics  
4            

In [3]:
# 3–5: Print all target labels
target_labels = train_df['category'].unique()
print("All target labels:", target_labels)


All target labels: ['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


In [4]:
# 6–8: Prepare subsets (alt.atheism, comp.graphics, sci.space)
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
train_subset = train_df[train_df['category'].isin(categories)]
test_subset = test_df[test_df['category'].isin(categories)]


In [5]:
# 9: Print new training set target names (labels)
print("Training set labels:", train_subset['category'].unique())


Training set labels: ['comp.graphics' 'sci.space' 'alt.atheism']


In [10]:
#  10: Print training data of the 5th article
print("5th article in training data:\n", train_subset.iloc[4,0])


5th article in training data:
 
I certainly do use it whenever I have to do TIFF, and it usually works
very well.  That's not my point.  I'm >philosophically< opposed to it
because of its complexity.

This complexity has led to some programs' poor TIFF writers making
some very bizarre files, other programs' inability to load TIFF
images (though they'll save them, of course), and a general
inability to interchange images between different environments
despite the fact they all think they understand TIFF.

As the saying goes, "It's not me I'm worried about- it's all the
abuse of TIFF over the years, and I chalk it all up to the immense (and
unnecessary) complexity of the format.

In the words of the TIFF 5.0 spec, Appendix G, page G-1 (capitalized
emphasis mine):

"The only problem with this sort of success is that TIFF was designed
to be powerful and flexible, at the expense of simplicity.  It takes a
fair amount of effort to handle all the options currently defined in
this specificatio

In [35]:
# 11–12: Print shape and filenames (assuming filenames = index or preview)
print("Training data shape:", train_subset.shape)
print("Target data shape:", test_subset.shape)
print("Training set preview:\n", train_subset.head())
train_subset.index.values


Training data shape: (1657, 3)
Target data shape: (1102, 3)
Training set preview:
                                                  text  target       category
3   \nDo you have Weitek's address/phone number?  ...       1  comp.graphics
4   From article <C5owCB.n3p@world.std.com>, by to...      14      sci.space
13  \n   {Description of "External Tank" option fo...      14      sci.space
15  \nDon't be so sure.  Look what happened to Jap...       0    alt.atheism
16  \nI certainly do use it whenever I have to do ...       1  comp.graphics


array([    3,     4,    13, ..., 11286, 11289, 11312])

In [37]:
# 13: Use CountVectorizer to convert text to numeric
train_subset['text'] = train_subset['text'].fillna("")
test_subset['text'] = test_subset['text'].fillna("")
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(train_subset['text'])
y_train = train_subset['category']
X_train_counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_subset['text'] = train_subset['text'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_subset['text'] = test_subset['text'].fillna("")


<1657x23598 sparse matrix of type '<class 'numpy.int64'>'
	with 155497 stored elements in Compressed Sparse Row format>

In [9]:
# 14: Use BernoulliNB for training
bnb = BernoulliNB()
bnb.fit(X_train_counts, y_train)


In [19]:
X_train_counts

<1657x23598 sparse matrix of type '<class 'numpy.int64'>'
	with 155497 stored elements in Compressed Sparse Row format>

In [20]:
# 15: Convert test set with CountVectorizer
X_test_counts = vectorizer.transform(test_subset['text'])
y_test = test_subset['category']
X_test_counts

<1102x23598 sparse matrix of type '<class 'numpy.int64'>'
	with 105036 stored elements in Compressed Sparse Row format>

In [11]:
# 16–17: Predict and evaluate using BernoulliNB
y_pred_bnb = bnb.predict(X_test_counts)
bnb_accuracy = accuracy_score(y_test, y_pred_bnb)
print("Accuracy using BernoulliNB:", bnb_accuracy)


Accuracy using BernoulliNB: 0.6987295825771325


In [12]:
# 18: Use TfidfVectorizer + MultinomialNB
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_subset['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_subset['text'])

mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)
y_pred_mnb = mnb.predict(X_test_tfidf)


In [13]:
# 19: Evaluate accuracy for MultinomialNB
mnb_accuracy = accuracy_score(y_test, y_pred_mnb)
print("Accuracy using Tfidf + MultinomialNB:", mnb_accuracy)


Accuracy using Tfidf + MultinomialNB: 0.8620689655172413


In [14]:
# 20: Try avoiding stopwords and repeat
tfidf_vectorizer_sw = TfidfVectorizer(stop_words='english')
X_train_sw = tfidf_vectorizer_sw.fit_transform(train_subset['text'])
X_test_sw = tfidf_vectorizer_sw.transform(test_subset['text'])

mnb_sw = MultinomialNB()
mnb_sw.fit(X_train_sw, y_train)
y_pred_sw = mnb_sw.predict(X_test_sw)

accuracy_sw = accuracy_score(y_test, y_pred_sw)
print("Accuracy using Tfidf + stopwords removed:", accuracy_sw)


Accuracy using Tfidf + stopwords removed: 0.8729582577132486
