In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
# importing the train dataset

# Path to the train dataset folder
dataset_path = "C:\\Users\\HP\\OneDrive\\NLP PROJECT\\SENTIMENT_ANALYSIS\\train"

# Initialize lists to store data (kept parallel: texts[i] is labelled labels[i])
texts = []
labels = []

# Each sentiment has its own sub-folder of .txt reviews
positive_path = os.path.join(dataset_path, 'pos')
negative_path = os.path.join(dataset_path, 'neg')

# Positive reviews are read first, then negative ones, so the first half of the
# frame is positive and the second half negative.
for folder, sentiment in ((positive_path, 'positive'), (negative_path, 'negative')):
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and therefore index-based lookups such as train_df["text"][0]) reproducible.
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.txt'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(sentiment)

# Create a DataFrame with one row per review
train_df = pd.DataFrame({
    'text': texts,
    'label': labels
})

# Display the first few rows
print(train_df.head())


                                                text     label
0  Bromwell High is a cartoon comedy. It ran at t...  positive
1  Homelessness (or Houselessness as George Carli...  positive
2  Brilliant over-acting by Lesley Ann Warren. Be...  positive
3  This is easily the most underrated film inn th...  positive
4  This is not the typical Mel Brooks film. It wa...  positive


In [3]:
# shape of the train DataFrame as (number of rows, number of columns)
train_df.shape

(25000, 2)

In [4]:
# check how many train reviews carry each label (class balance)

train_df["label"].value_counts()

label
positive    12500
negative    12500
Name: count, dtype: int64

In [5]:
# count the missing values in each column of the train DataFrame
train_df.isna().sum()

text     0
label    0
dtype: int64

In [6]:
# Text preprocessing: tokenize with spaCy, drop stop words and punctuation,
# and reduce every remaining token to its lowercase lemma (base form).
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(txt):
    """Return a cleaned, space-separated version of ``txt``.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or as punctuation are discarded; each remaining
    token contributes its lemma, lowercased.

    Args:
        txt: the raw text to clean.

    Returns:
        A single string with the kept lemmas joined by one space each.
    """
    kept_lemmas = [
        tok.lemma_.lower()
        for tok in nlp(txt)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)
    

In [7]:
# Add a cleaned copy of every training review: `preprocess` is applied to each
# value of the "text" column.
# NOTE: the column name "text_preprocesed" (sic) is the one later cells read,
# so its spelling has to stay as it is.
train_df["text_preprocesed"] = train_df["text"].apply(preprocess)

In [8]:
# show the raw text of the first train review, to compare it with its
# preprocessed version
train_df["text"][0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [9]:
# the same first review after `preprocess`: stop words and punctuation removed,
# remaining tokens replaced by their lowercase lemma
train_df["text_preprocesed"][0]

'bromwell high cartoon comedy run time program school life teachers 35 year teaching profession lead believe bromwell high satire close reality teachers scramble survive financially insightful student right pathetic teacher pomp pettiness situation remind school know student see episode student repeatedly try burn school immediately recall high classic line inspector sack teacher student welcome bromwell high expect adult age think bromwell high far fetch pity'

In [10]:
# encode the training labels as integers: "positive" -> 0, "negative" -> 1

train_df["label_preprocess"] = train_df["label"].map({"positive": 0, "negative": 1})

In [11]:
# preview the train DataFrame with the two derived columns added
train_df.head()

Unnamed: 0,text,label,text_preprocesed,label_preprocess
0,Bromwell High is a cartoon comedy. It ran at t...,positive,bromwell high cartoon comedy run time program ...,0
1,Homelessness (or Houselessness as George Carli...,positive,homelessness houselessness george carlin state...,0
2,Brilliant over-acting by Lesley Ann Warren. Be...,positive,brilliant act lesley ann warren well dramatic ...,0
3,This is easily the most underrated film inn th...,positive,easily underrated film inn brooks cannon sure ...,0
4,This is not the typical Mel Brooks film. It wa...,positive,typical mel brooks film slapstick movie actual...,0


In [12]:
# importing the test dataset
import os

# Path to the dataset folder
dataset_path = "C:\\Users\\HP\\OneDrive\\NLP PROJECT\\SENTIMENT_ANALYSIS\\test"

# Initialize lists to store data (kept parallel: texts[i] is labelled labels[i]).
# These names are rebound here, so they no longer refer to the train data.
texts = []
labels = []

# Each sentiment has its own sub-folder of .txt reviews
positive_path = os.path.join(dataset_path, 'pos')
negative_path = os.path.join(dataset_path, 'neg')

# Positive reviews are read first, then negative ones, so the first half of the
# frame is positive and the second half negative.
for folder, sentiment in ((positive_path, 'positive'), (negative_path, 'negative')):
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the resulting DataFrame reproducible across machines and runs.
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.txt'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(sentiment)

# Create a DataFrame with one row per review
test_df = pd.DataFrame({
    'text': texts,
    'label': labels
})

# Display the first few rows of the test data
print(test_df.head())


                                                text     label
0  I went and saw this movie last night after bei...  positive
1  Actor turned director Bill Paxton follows up h...  positive
2  As a recreational golfer with some knowledge o...  positive
3  I saw this film in a sneak preview, and it is ...  positive
4  Bill Paxton has taken the true story of the 19...  positive


In [13]:
# Clean the test reviews with the same `preprocess` function used for the train
# reviews, so both sets go through identical text processing.
# NOTE: the column name "text_preprocesed" (sic) is the one later cells read.
test_df["text_preprocesed"] = test_df["text"].apply(preprocess)

In [14]:
# encode the test labels as integers: "positive" -> 0, "negative" -> 1
test_df["label_preprocess"] = test_df["label"].map({"positive": 0, "negative": 1})

In [15]:
# preview the test DataFrame with the two derived columns added
test_df.head()

Unnamed: 0,text,label,text_preprocesed,label_preprocess
0,I went and saw this movie last night after bei...,positive,go see movie night coax friend admit reluctant...,0
1,Actor turned director Bill Paxton follows up h...,positive,actor turn director bill paxton follow promisi...,0
2,As a recreational golfer with some knowledge o...,positive,recreational golfer knowledge sport history pl...,0
3,"I saw this film in a sneak preview, and it is ...",positive,see film sneak preview delightful cinematograp...,0
4,Bill Paxton has taken the true story of the 19...,positive,bill paxton take true story 1913 golf open fil...,0


In [16]:
# TF-IDF converts each preprocessed review into a weighted term vector,
# a Multinomial Naive Bayes classifier is trained on those vectors,
# and a Pipeline chains both steps behind a single fit/predict interface.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


# Features are the preprocessed review strings; targets are the encoded labels.
X_train, y_train = train_df["text_preprocesed"], train_df["label_preprocess"]
X_test, y_test = test_df["text_preprocesed"], test_df["label_preprocess"]


model = Pipeline(
    steps=[
        ("Vectorizer", TfidfVectorizer()),   # preprocessed text -> TF-IDF features
        ("MultinomialNB", MultinomialNB()),  # Naive Bayes classifier on those features
    ]
)


# Last expression of the cell: fits the pipeline on the train data and
# displays the fitted estimator.
model.fit(X_train, y_train)

In [17]:
# Predict labels for the test reviews with the fitted Naive Bayes pipeline
# (`model`) and compute the accuracy against the true test labels.
y_pred = model.predict(X_test)

score = accuracy_score(y_test, y_pred)

# last expression of the cell, so the notebook displays the accuracy
score

0.8238

In [18]:
# Per-class precision / recall / F1 for whatever `y_pred` currently holds
# (the Naive Bayes predictions when the cells are run in order).
# Class 0 is "positive" and class 1 is "negative", per the label encoding.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.77      0.81     12500
           1       0.79      0.87      0.83     12500

    accuracy                           0.82     25000
   macro avg       0.83      0.82      0.82     25000
weighted avg       0.83      0.82      0.82     25000



In [19]:
# using another algorithm to compare
from sklearn.linear_model import LogisticRegression


# Same TF-IDF vectorizer step as the Naive Bayes pipeline, with logistic
# regression as the classifier.
# The classifier step is labelled "LogisticRegression" (it used to carry the
# copy-pasted label "MultinomialNB"): the label is what appears in the pipeline
# repr, in `named_steps` and as the `<step>__<param>` prefix, and it is stored
# in the pickled model, so it should name the estimator the step really holds.
LR_model = Pipeline([
    ("Vectorizer", TfidfVectorizer()),
    ("LogisticRegression", LogisticRegression())
])


# Last expression of the cell: fits the pipeline on the train data and
# displays the fitted estimator.
LR_model.fit(X_train, y_train)

In [20]:
# Predict labels for the test reviews with the fitted logistic regression
# pipeline and compute the accuracy against the true test labels.
# NOTE: `y_pred` and `score` are rebound here, replacing the Naive Bayes
# values assigned in the earlier evaluation cell.
y_pred = LR_model.predict(X_test)

score = accuracy_score(y_test, y_pred)

# last expression of the cell, so the notebook displays the accuracy
score

0.87504

In [21]:
# Per-class precision / recall / F1 for whatever `y_pred` currently holds
# (the logistic regression predictions when the cells are run in order).
# Class 0 is "positive" and class 1 is "negative", per the label encoding.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88     12500
           1       0.88      0.87      0.87     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



Logistic regression performed better than the Naive Bayes classifier on the test set (accuracy 0.875 vs. 0.824).

In [22]:
# save the Logistic regression model
import pickle

# Write the fitted pipeline (vectorizer + classifier) to disk. The context
# manager guarantees the file is flushed and closed even if pickling fails;
# passing a bare open(...) to pickle.dump leaves the handle open.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('my_model.pkl', 'wb') as model_file:
    pickle.dump(LR_model, model_file)