In [4]:
import numpy as np
import pandas as pd
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import metrics
from sklearn.metrics import classification_report

## Read data 

In [5]:
corpus = []
labels = []

name = ['imdb_labelled', 'amazon_cells_labelled','yelp_labelled']
for i in name :
    files = open(f'{i}.txt', "r")
    
    data = files.readlines()
   
    for d in data:
        d = d.split("\t")
        corpus.append(d[0])
        labels.append(d[1].replace("\n", ""))
        files.close()

        #labels, corpus

In [6]:
len(corpus)

3000

In [7]:
#print(labels)

In [8]:
#Preprocessing the corpus
X = []
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
for user_text in corpus:
    # Remove puntuations and numbers
    user_text = re.sub('[^a-zA-Z]', ' ', user_text)
    # Remove single characters
    user_text = re.sub(r"\s+[a-zA-Z]\s+", ' ', user_text)

    # remove multiple spaces
    user_text = re.sub(r'\s+', ' ', user_text)
    user_text = user_text.lower()
    # spliting text
    # Remove unncecessay stopwords
    user_text = [word for word in user_text.split() if not word in stopwords.words("english")]
    
    newString=''
    for i in user_text:                                                 
        newString=newString + stemmer.stem(i)+' '    #converting words to lemma
        
    
    X.append(newString.strip())
#create dataFrame
df_data = pd.DataFrame(columns=['Phrase','label'])      
for data, label in zip(X, labels):
    df_data.loc[len(df_data.index)] = [data,label]

In [9]:
df_data

Unnamed: 0,Phrase,label
0,slow move aimless movi distress drift young man,0
1,sure lost flat charact audienc near half walk,0
2,attempt arti black white clever camera angl mo...,0
3,littl music anyth speak,0
4,best scene movi gerardo tri find song keep run...,1
...,...,...
2995,think food flavor textur lack,0
2996,appetit instant gone,0
2997,overal impress would go back,0
2998,whole experi underwhelm think go ninja sushi n...,0


In [10]:
#GENERATE COUNT FEATURES AS VECTORS
def count_vectorize(X_train, X_test):
                count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
                count_vect.fit(X_train)

                # transform the training and validation data using count vectorizer object
                xtrain_count = count_vect.transform(X_train)
                xvalid_count = count_vect.transform(X_test)

                return xtrain_count,xvalid_count


In [14]:
 def word_TF_IDF_vectorize( X_train, X_test):
                tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
                tfidf_vect.fit(X_train)

                xtrain_tfidf = tfidf_vect.transform(X_train)
                xvalid_tfidf = tfidf_vect.transform(X_test)

                return xtrain_tfidf, xvalid_tfidf

In [15]:
#Splitting into training(80%) and testing data(20%)
X_train, X_test, y_train, y_test = train_test_split(df_data['Phrase'], df_data['label'], test_size=0.20, random_state=42, shuffle=True)

In [16]:
xtrain_count, xvalid_count = count_vectorize(X_train, X_test)
xtrain_tfidf, xvalid_tfidf = word_TF_IDF_vectorize(X_train, X_test)

In [17]:
def train_ml_model(classifier, X_train, X_test, y_train, y_test):
    # fit the training dataset on the classifier
    classifier.fit(X_train, y_train)
    # predict the labels on validation dataset
    predictions = classifier.predict(X_test)
    return classifier, metrics.classification_report(predictions, y_test)

In [21]:
# Naive Bayes on Count Vectors
NB_cv, accuracy = train_ml_model(naive_bayes.MultinomialNB(), xtrain_count, xvalid_count, y_train, y_test)
print("Naive Bayes, Count Vectors:\n ", accuracy)


Naive Bayes, Count Vectors:
                precision    recall  f1-score   support

           0       0.80      0.79      0.79       299
           1       0.79      0.81      0.80       301

    accuracy                           0.80       600
   macro avg       0.80      0.80      0.80       600
weighted avg       0.80      0.80      0.80       600



In [20]:
# Naive Bayes on Word Level TF IDF Vectors
NB_word_tf_idf, accuracy = train_ml_model(naive_bayes.MultinomialNB(), xtrain_tfidf, xvalid_tfidf, y_train, y_test)
print("Naive Bayes, WordLevel TF-IDF:\n ", accuracy)

Naive Bayes, WordLevel TF-IDF:
                precision    recall  f1-score   support

           0       0.80      0.79      0.79       296
           1       0.79      0.80      0.80       304

    accuracy                           0.80       600
   macro avg       0.80      0.79      0.79       600
weighted avg       0.80      0.80      0.79       600

