# **Databases for sentiment analysis models**

In [None]:
import os
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK data packages this notebook's imports rely on.
# Kept as four plain calls so the last call's return value is still shown as the cell output.
nltk.download('stopwords')  # corpus data for nltk.corpus.stopwords
nltk.download('punkt')  # tokenizer data — presumably required by word_tokenize; verify
nltk.download('wordnet')  # presumably backs WordNetLemmatizer; verify
nltk.download('punkt_tab')  # tokenizer data — presumably also needed by word_tokenize on newer NLTK; verify

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import zipfile
import os


# Unzip the customer-review archive into a local folder.
zip_file_name = 'Customer_Reviews.zip'
extract_dir = 'Customer_reviews'  # single source of truth for the target folder
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)  # Extract all files

# The archive holds review text files (later cells read *.txt from it), not
# images, so the status message says so.
print(f"Customer reviews unzipped to the '{extract_dir}' folder.")

Images unzipped to the 'Customer_reviews' folder.


In [None]:
import os

def create_output_folder(output_folder):
    """Make sure ``output_folder`` exists as a directory and report what happened.

    Args:
        output_folder: Path of the directory to create (parents are created too).

    Raises:
        FileExistsError: If the path already exists but is not a directory.
            Previously this case was reported as "already exists" and only
            failed later, when files were written into it.
    """
    # isdir (not exists) so a regular file at this path is not mistaken for
    # the output folder.
    if os.path.isdir(output_folder):
        print(f"Output folder already exists: {output_folder}")
        return

    # exist_ok=True tolerates the folder appearing between the check above and
    # this call; it still raises if the path is a non-directory.
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder created: {output_folder}")

# Define the output folder path
# (absolute /content path — presumably the Colab working directory; adjust when running elsewhere)
output_folder = '/content/processed_customer_reviews'

# Create the output folder (prints whether it was created or already existed)
create_output_folder(output_folder)

Output folder created: /content/processed_customer_reviews


In [None]:
#Data preprocessing
import os
import string
from nltk.tokenize import word_tokenize

def process_all_files(input_folder, output_folder):
    """Lowercase and tokenize every ``.txt`` file, writing one token per line.

    Each input file produces ``<NN>_pre_processed_customer_reviews.txt`` in
    ``output_folder``, where ``NN`` is the file's 1-based position in the
    sorted list of ``.txt`` names, zero-padded to the width of the file count.
    A short preview of the first five processed files is printed at the end.

    Args:
        input_folder: Directory containing the raw ``.txt`` files.
        output_folder: Directory that receives the token files; it must
            already exist (this function does not create it).
    """
    # Filter *before* numbering: enumerating every directory entry let
    # non-.txt files leave gaps in the output sequence and use up preview slots.
    # NOTE: sorted() is lexicographic, so "review_10.txt" comes before
    # "review_2.txt"; the output number is a position, not the review's own number.
    txt_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".txt"))

    # Width used for zero-padding the sequence number in output names
    num_digits = len(str(len(txt_files)))

    # Preview records for the first five files
    examples = []

    for i, filename in enumerate(txt_files, 1):
        file_path = os.path.join(input_folder, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Lowercase once, then tokenize
        lowercase_text = text.lower()
        tokens = word_tokenize(lowercase_text)

        if i <= 5:
            examples.append({
                "file_name": filename,
                "lowercase_text": lowercase_text,
                "tokens": tokens
            })

        # Write the tokens, one per line
        output_file_path = os.path.join(output_folder, f"{i:0>{num_digits}}_pre_processed_customer_reviews.txt")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(token + "\n" for token in tokens)

    # Print the preview (first 100 characters / first 10 tokens per file)
    print("\n--- Sample Outputs (First 5 Files) ---")
    for example in examples:
        print(f"\nFile: {example['file_name']}")
        print(f"Lowercased Text:\n{example['lowercase_text'][:100]}...")
        print(f"Tokens:\n{example['tokens'][:10]}...")

    print("\nProcessed files and saved")

#Input and output folder paths
# The nested Customer_Reviews directory is presumably the top-level folder
# inside the zip that was extracted into 'Customer_reviews' — verify.
input_folder = '/content/Customer_reviews/Customer_Reviews'
output_folder = '/content/processed_customer_reviews'

# Ensure the output folder exists (process_all_files does not create it)
os.makedirs(output_folder, exist_ok=True)

# Process all files: writes one token-per-line file per review and prints a preview
process_all_files(input_folder, output_folder)



--- Sample Outputs (First 5 Files) ---

File: review_1.txt
Lowercased Text:
though we talk a lot about high price for iphones, this is worth the amount. smooth touch, perfect t...
Tokens:
['though', 'we', 'talk', 'a', 'lot', 'about', 'high', 'price', 'for', 'iphones']...

File: review_10.txt
Lowercased Text:

the iphone 13 128gb has surpassed my expectations in every aspect, setting a new standard for smart...
Tokens:
['the', 'iphone', '13', '128gb', 'has', 'surpassed', 'my', 'expectations', 'in', 'every']...

File: review_11.txt
Lowercased Text:
i snagged the iphone 13 during the great indian festival for an unbeatable price of 45,000, and let ...
Tokens:
['i', 'snagged', 'the', 'iphone', '13', 'during', 'the', 'great', 'indian', 'festival']...

File: review_12.txt
Lowercased Text:
adaptor supplied with it was not working, when taken to authorized service center they said the adap...
Tokens:
['adaptor', 'supplied', 'with', 'it', 'was', 'not', 'working', ',', 'when', 'taken']...

File

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words_vectorization(input_folder):
    """Build a Bag-of-Words matrix from the ``.txt`` files in ``input_folder``.

    Files are read in sorted name order; files that are empty after stripping
    whitespace are skipped. The matrix shape, the vocabulary size, and every
    vocabulary word with its column index are printed. Nothing is returned.

    Args:
        input_folder: Directory containing the preprocessed ``.txt`` files.

    Raises:
        ValueError: If the folder contains no non-empty ``.txt`` file. The
            vectorizer would otherwise fail with a less specific error.
    """
    # Load all non-empty preprocessed text files
    documents = []
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read().strip()
        if file_content:
            documents.append(file_content)

    if not documents:
        raise ValueError(f"No non-empty .txt files found in {input_folder!r}")

    # Learn the vocabulary and build the document-term matrix in one pass
    bow_vectorizer = CountVectorizer()
    bow_matrix = bow_vectorizer.fit_transform(documents)

    print(f"BoW Matrix Shape: {bow_matrix.shape}")

    total_words = len(bow_vectorizer.vocabulary_)
    print(f"Total number of unique words in the vocabulary: {total_words}")

    # Vocabulary words and their column indices, in the mapping's own order
    print("\nVocabulary (All words and their indices):")
    for word, index in bow_vectorizer.vocabulary_.items():
        print(f"{word}: {index}")

# Folder containing the preprocessed text files (same path the preprocessing
# cell writes its one-token-per-line files to)
input_folder = '/content/processed_customer_reviews'  # Adjust the path as necessary

# Perform Bag of Words vectorization (prints the matrix shape and the full vocabulary)
bag_of_words_vectorization(input_folder)


BoW Matrix Shape: (53, 1111)
Total number of unique words in the vocabulary: 1111

Vocabulary (All words and their indices):
though: 994
we: 1067
talk: 969
lot: 594
about: 44
high: 476
price: 775
for: 408
iphones: 535
this: 993
is: 537
worth: 1098
the: 983
amount: 81
smooth: 909
touch: 1009
perfect: 734
to: 1001
hold: 479
in: 509
hand: 450
screen: 860
size: 898
of: 685
it: 541
looks: 592
small: 905
overall: 712
could: 238
have: 458
gone: 439
pro: 778
max: 617
but: 167
more: 645
than: 978
5l: 27
budget: 164
way: 1065
too: 1005
much: 649
me: 619
good: 440
battery: 126
back: 120
up: 1037
with: 1090
low: 600
mode: 637
always: 75
on: 691
hours: 487
time: 997
around: 103
twitter: 1024
and: 83
whatsapp: 1075
usage: 1041
hour: 486
songs: 921
listening: 583
rummy: 848
candy: 179
crush: 249
games: 426
notification: 678
all: 68
these: 988
items: 543
comes: 213
24: 19
getting: 431
charged: 194
quickly: 799
from: 414
20: 14
100: 2
within: 1091
camera: 176
love: 596
that: 981
live: 585
photo: 742
op

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os

def tfidf_vectorization(input_folder):
    """Build a TF-IDF matrix from the ``.txt`` files in ``input_folder``.

    Files are read in sorted name order. Whitespace-only files are skipped so
    that matrix rows line up with the Bag-of-Words and metadata cells, which
    skip them too. For every vocabulary word, the sum of its TF-IDF weights
    over all documents is printed.

    Args:
        input_folder: Directory containing the preprocessed ``.txt`` files.

    Returns:
        A ``(tfidf_matrix, tfidf_vectorizer)`` tuple: the fitted document-term
        matrix and the vectorizer that produced it.

    Raises:
        ValueError: If the folder contains no non-empty ``.txt`` file.
    """
    # Load the preprocessed text files
    documents = []
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        if content.strip():
            documents.append(content)

    if not documents:
        raise ValueError(f"No non-empty .txt files found in {input_folder!r}")

    # TF-IDF vectorization
    print("\n--- Term Frequency-Inverse Document Frequency (TF-IDF) Representation ---")
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")

    # One line per vocabulary word: the word followed by the sum of its column,
    # i.e. its total TF-IDF weight across documents (not a raw term frequency).
    for word, column in tfidf_vectorizer.vocabulary_.items():
        print(f"{word:<15}: {tfidf_matrix[:, column].sum():>5}")

    return tfidf_matrix, tfidf_vectorizer

# Define the folder containing preprocessed text files
input_folder = '/content/processed_customer_reviews'

# TF-IDF vectorization; keep the fitted matrix and vectorizer as notebook-level variables
tfidf_matrix, tfidf_vectorizer = tfidf_vectorization(input_folder)



--- Term Frequency-Inverse Document Frequency (TF-IDF) Representation ---
TF-IDF Matrix Shape: (53, 1111)
though         : 0.4644784181233159
we             : 0.6612975268155505
talk           : 0.20211628465330342
lot            : 0.34124783070135156
about          : 0.6600979310752724
high           : 0.21956495632287332
price          : 1.1421831814136445
for            : 2.128051750908266
iphones        : 0.08447773262036845
this           : 2.242009813517413
is             : 4.774134796618483
worth          : 0.6790963509558053
the            : 6.651748168363267
amount         : 0.08447773262036845
smooth         : 0.689388988455192
touch          : 0.3672537450607284
perfect        : 0.6259036473030073
to             : 3.3380164950150455
hold           : 0.22798729919375849
in             : 2.345814711591991
hand           : 0.2736215664646102
screen         : 0.6565530175389214
size           : 0.1689554652407369
of             : 2.66152897961289
it             : 4.717119786626

In [None]:
import os
import json
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

# Notebook-level metadata record. "document" is replaced (not appended to) by
# every create_metadata() call, so calling the function again is safe.
metadata = {
    "vectorization": {
        "methods": [
            "Bag of Words(BoW)",
            "Term Frequency-Inverse Document Frequency(TF-IDF)"
        ],
    },
    "document": []
}

def create_metadata(input_folder, output_metadata_file):
    """Label each non-empty ``.txt`` file with a sentiment and its top words.

    For every non-empty file (sorted by name) a record
    ``{"file_name", "sentiment", "top_words"}`` is built. ``sentiment`` is
    "positive", "negative" or "neutral" according to the sign of the TextBlob
    polarity; ``top_words`` holds up to five vocabulary words with the highest
    non-zero Bag-of-Words count in that file. The records are stored in the
    module-level ``metadata["document"]`` and the whole ``metadata`` dict is
    written to ``output_metadata_file`` as JSON.

    Args:
        input_folder: Directory containing the preprocessed ``.txt`` files.
        output_metadata_file: Path of the JSON file to write.
    """
    documents = []  # text of each kept file, in record order
    records = []    # one metadata record per kept file

    text_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))
    for file_name in text_files:
        file_path = os.path.join(input_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()

        if not text:  # skip empty files
            continue

        # Sentiment analysis using TextBlob; read the polarity once
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0:
            sentiment = "positive"
        elif polarity < 0:
            sentiment = "negative"
        else:
            sentiment = "neutral"

        records.append({"file_name": file_name, "sentiment": sentiment})
        documents.append(text)

    # Vectorization using CountVectorizer (Bag of Words)
    if documents:  # the vectorizer cannot be fitted on an empty corpus
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform(documents)
        vocab = vectorizer.get_feature_names_out()

        # records and matrix rows were built in the same order, so zip pairs them
        for record, doc in zip(records, bow_matrix):
            word_counts = doc.toarray().flatten()
            top_word_indices = word_counts.argsort()[-5:][::-1]  # Top 5 words
            record["top_words"] = [
                vocab[idx] for idx in top_word_indices if word_counts[idx] > 0
            ]

    # Replace rather than append: appending made a second call duplicate every
    # entry and then fail while post-processing the already-finished ones.
    metadata["document"] = records

    # Save metadata to a JSON file
    with open(output_metadata_file, 'w', encoding='utf-8') as json_file:
        json.dump(metadata, json_file, indent=4)

    print(f"Metadata saved to {output_metadata_file}")

# Define the folder containing preprocessed text files
input_folder = '/content/processed_customer_reviews'
# Relative path: the file lands in the current working directory. A later cell
# reads '/content/text_metadata.json', so this presumably relies on the working
# directory being /content — verify when running outside Colab.
output_metadata_file = 'text_metadata.json'

# Generate metadata (sentiment label and top words per file) and write it as JSON
create_metadata(input_folder, output_metadata_file)



Metadata saved to text_metadata.json


In [None]:
!pip install "pymongo[srv]"


Collecting pymongo[srv]
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [None]:
import pymongo
from pymongo import MongoClient

In [None]:
import os
from getpass import getpass

# SECURITY: the connection string (user name + password) must never be stored
# in the notebook. Read it from the MONGODB_URI environment variable, or prompt
# for it without echoing. The credentials that were previously committed here
# should be considered leaked and rotated.
mongodb_uri = os.environ.get("MONGODB_URI") or getpass("MongoDB connection URI: ")
text_connection = pymongo.MongoClient(mongodb_uri)

db = text_connection["Text_Database"]

In [None]:
# Connection check: ask the database for its collection names and show them;
# any failure is reported instead of raised.
try:
    collection_names = db.list_collection_names()
except Exception as exc:
    print(f"Connection failed: {exc}")
else:
    print(collection_names)

['processed_text', 'sentiment_labels']


In [None]:
import os
text_collection = db["processed_text"]

# Path to the folder
preprocessed_folder = '/content/processed_customer_reviews'

# Upload every .txt file as one document keyed by its filename.
# Sorted like every other loader in this notebook, so the upload order (and the
# progress log) is deterministic instead of following os.listdir's arbitrary order.
for filename in sorted(os.listdir(preprocessed_folder)):
    if not filename.endswith(".txt"):  # Process only .txt files
        continue

    file_path = os.path.join(preprocessed_folder, filename)
    # Read first and close the file before talking to the database
    with open(file_path, 'r', encoding='utf-8') as file:
        preprocessed_text = file.read()

    # Document to upload
    document = {
        "filename": filename,
        "preprocessed_text": preprocessed_text
    }

    # Upload to MongoDB (replace if filename exists, otherwise insert)
    text_collection.replace_one(
        {"filename": filename},
        document,
        upsert=True
    )

    print(f"Uploaded {filename} to MongoDB.")

print("All preprocessed files have been uploaded to MongoDB.")


Uploaded 24_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 06_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 41_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 11_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 20_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 35_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 36_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 40_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 48_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 38_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 01_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 16_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 17_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 21_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 05_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 07_pre_processed_customer_reviews.txt to MongoDB.
Uploaded 26_pre_processed_customer_reviews.txt to MongoD

In [None]:
import json
from pymongo import MongoClient

sentiment_collection = db["sentiment_labels"]


# Path to the sentiment metadata file
sentiment_metadata_path = '/content/text_metadata.json'

# The metadata file is a dict: {"vectorization": {...}, "document": [...]}.
# The per-file labels are the entries of the "document" list, each carrying
# "file_name" and "sentiment".
with open(sentiment_metadata_path, 'r', encoding='utf-8') as file:
    sentiment_metadata = json.load(file)

# Iterate the per-file entries, not the dict's top-level keys. Iterating
# .items() stored two bogus records named 'vectorization' and 'document';
# delete those from the collection if an earlier run created them.
for entry in sentiment_metadata.get("document", []):
    filename = entry["file_name"]
    sentiment = entry["sentiment"]

    # Single upsert instead of find-then-insert/update: one round trip and no
    # window in which a concurrent writer could create a duplicate.
    result = sentiment_collection.update_one(
        {"filename": filename},
        {"$set": {"sentiment": sentiment}},
        upsert=True
    )

    if result.upserted_id is not None:
        print(f"Inserted new sentiment data for '{filename}' into MongoDB.")
    else:
        print(f"Updated sentiment data for existing '{filename}' in MongoDB.")

print("Sentiment data update process completed successfully.")



Inserted new sentiment data for 'vectorization' into MongoDB.
Inserted new sentiment data for 'document' into MongoDB.
Sentiment data update process completed successfully.


In [None]:
# Filename of the processed review to look up
filename_to_search = "04_pre_processed_customer_reviews.txt"

# Match on the 'filename' field of the 'processed_text' collection
query = {"filename": filename_to_search}
document = db["processed_text"].find_one(query)

if not document:
    print("No document found with the specified filename.")
else:
    # Show the stored filename followed by the stored token text
    print(f"Filename: {document['filename']}")
    print(f"Review Text:\n{document['preprocessed_text']}")


Filename: 04_pre_processed_customer_reviews.txt
Review Text:
adaptor
supplied
with
it
was
not
working
,
when
taken
to
authorized
service
center
they
said
the
adaptor
was
fake
.
do
not
buy
adaptor
with
it
.

