# Text Feature Extraction Documentation

## Author: 
ShrugalTayal (shrugal20408@iiitd.ac.in)

## Introduction
This Jupyter Notebook contains code for performing text feature extraction using techniques such as lower-casing, tokenization, punctuation removal, stop word removal, stemming, lemmatization, and TF-IDF calculation.

In [1]:
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
import os
import math

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Loading
- Load the data from the CSV file containing text reviews.

In [2]:
# Read data from CSV file
data = pd.read_csv(r'C:\Users\HP\Documents\Shrugal IIITD\Semester 8\Information Retrieval\CSE508_Winter2024_A2_2020408\res\A2_Data.csv')

In [148]:
# Handle missing values
data['Review Text'] = data['Review Text'].fillna('')  # Replace NaN with empty string

In [149]:
# Extract text reviews from the data
text_reviews = data['Review Text'].tolist()

In [150]:
print('text_reviews', text_reviews)



## Text Preprocessing
- Apply various text preprocessing techniques to clean and prepare the text data for analysis.
- Techniques include lower-casing, tokenization, punctuation removal, stop word removal, stemming, and lemmatization.

In [151]:
# Preprocessing techniques
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Removing punctuation
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [152]:
# Apply preprocessing to text reviews
preprocessed_reviews = [preprocess_text(review) for review in text_reviews]

In [153]:
print('preprocessed_reviews', preprocessed_reviews)

preprocessed_reviews ['love vintag spring vintag strat  good tension great stabil  float bridg want spring way go ', 'work great guitar bench mat  rug enough abus take care  take care  make organ workspac much easier screw wo nt roll around  color good ', 'use everyth acoust bass ukulel  know smaller model avail uke  violin  etc   nt yet order  work smaller instrument one nt extend foot maximum width  gentl instrument  grippi materi keep secur  greatest benefit write music comput need set guitar use keyboardmous  easier hang stand  sever gave one friend christma well  use mine stage  fold small enough fit right gig bag ', 'great price good qualiti  nt quit match radiu sound hole close enough ', 'bought bass split time primari bass dean edg  might win  bass boost outstand  activ pickup realli allow adjust sound want  recommend anyon  beginn like long ago  excel bass start  tour andor music make money  bass beati stage  color bit darker pictur   around  great buy ', 'toy side instrument 

## TF-IDF Calculation
- Calculate Term Frequency (TF) and Inverse Document Frequency (IDF) for each term in the text data.
- Compute TF-IDF scores for each term-document pair.

In [154]:
# Calculate Term Frequency (TF)
def calculate_tf(text):
    tf_dict = {}
    text = text.split(' ')
    total_words = len(text)
    for word in text:
        if word in tf_dict:
            tf_dict[word] += 1 / total_words
        else:
            tf_dict[word] = 1 / total_words
    return tf_dict

# Calculate Inverse Document Frequency (IDF)
def calculate_idf(text_list):
    idf_dict = {}
    total_documents = len(text_list)
    for text in text_list:
        text = text.split(' ')
        words = set(text)
        for word in words:
            if word in idf_dict:
                idf_dict[word] += 1
            else:
                idf_dict[word] = 1
    
    for word, count in idf_dict.items():
        idf_dict[word] = math.log10(total_documents / count)
    
    return idf_dict

# Calculate TF-IDF scores
def calculate_tfidf(text_list):
    tfidf_list = []
    idf_dict = calculate_idf(text_list)
    for text in text_list:
        tfidf = {}
        tf_dict = calculate_tf(text)
        for word, tf in tf_dict.items():
            tfidf[word] = tf * idf_dict[word]
        tfidf_list.append(tfidf)
    return tfidf_list

In [155]:
# Calculate TF-IDF scores
tfidf_scores = calculate_tfidf(preprocessed_reviews)

In [156]:
print('tfidf_scores', tfidf_scores)

tfidf_scores [{'love': 0.04997941416013539, 'vintag': 0.1938586105440771, 'spring': 0.239433551109527, 'strat': 0.0753637401952118, '': 0.001981193288608014, 'good': 0.03811848775726925, 'tension': 0.11649500072266979, 'great': 0.027491667871116333, 'stabil': 0.11649500072266979, 'float': 0.14015993029335208, 'bridg': 0.0815845046087636, 'want': 0.05197455615406008, 'way': 0.063148729993308, 'go': 0.05056082714225567}, {'work': 0.02194441448647778, 'great': 0.017063793851037724, 'guitar': 0.018903916016403784, 'bench': 0.09306793118400064, 'mat': 0.09306793118400064, '': 0.0020495102985600144, 'rug': 0.07054336174347156, 'enough': 0.038012858921019264, 'abus': 0.08699581880277026, 'take': 0.08280410003768401, 'care': 0.10404402111487775, 'make': 0.0305536775066608, 'organ': 0.07934586191944763, 'workspac': 0.10344827586206896, 'much': 0.032389729642978905, 'easier': 0.05858517256331099, 'screw': 0.03982420551674977, 'wo': 0.05063865803302568, 'nt': 0.016288624107884873, 'roll': 0.07230

## Save Results
- Save the preprocessed text and TF-IDF scores using the pickle module.

In [157]:
# Destination path for saving dumps
destination_path = r'C:\Users\HP\Documents\Shrugal IIITD\Semester 8\Information Retrieval\CSE508_Winter2024_A2_2020408\dumps\text_feature_extraction_dumps'

# Save preprocessed text and TF-IDF scores
preprocessed_text_path = os.path.join(destination_path, 'preprocessed_text.pkl')
tfidf_scores_path = os.path.join(destination_path, 'tfidf_scores.pkl')

with open(preprocessed_text_path, 'wb') as f:
    pickle.dump(preprocessed_reviews, f)

with open(tfidf_scores_path, 'wb') as f:
    pickle.dump(tfidf_scores, f)