# Assignment 2
## Roll Number : CS25MTECH02007
## Name : Rajat Maheshwari
<hr>

# Text Cleaning and Model Building for the Given Datasets

## 📝 Assignment Overview
In this assignment, I will:
1. **Implement** 4 different NLP classification models
2. **Clean and harmonize** the given data for model building
3. Perform **text-based Classification** on the cleaned data to extract insights.

# Install Modules

In [83]:
! pip install -r requirements.txt




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Necessary Imports

In [84]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import re
import urllib
import unicodedata
import urllib.parse
import logging
import nltk

## NLTK Downloads

In [85]:
# NLTK data packages fetched for this notebook (tokenizer models, stop-word
# list, WordNet data and the English POS tagger).
NLTK_RESOURCES = (
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'punkt',
    'averaged_perceptron_tagger_eng',
)

# nltk.download() reports failure through its boolean return value rather than
# by raising, so collect the packages that could not be fetched.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]

if failed_downloads:
    # Only warn: the data may already be installed locally (e.g. offline run).
    # A genuinely missing package would otherwise surface only as a logged
    # error inside TextCleaner.lemmatize_text, which returns its input unchanged.
    logging.warning("NLTK resources not downloaded: %s", ", ".join(failed_downloads))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       d

True

## NLTK Imports

In [86]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Text Cleaner Class

In [None]:
class TextCleaner:
    """Regex- and NLTK-based text normalizer for raw text columns.

    `full_clean` is the main entry point; the other public methods are the
    individual steps it chains together and can also be called on their own.
    """

    def __init__(self):
        # Pre-compiled patterns, built once and reused for every row.
        self.json_pattern = re.compile(r'\{[^}]+\}', re.DOTALL)  # Brace-delimited blobs like {key:val}
        self.math_pattern = re.compile(r'\$.*?\$', re.DOTALL)     # Shortest span between two '$' signs
        # Control characters; \x0A (LF) and \x0D (CR) are deliberately left out of the ranges.
        self.control_chars = re.compile(r"[\x00-\x09\x0B\x0C\x0E-\x1F\x7F-\x9F]")
        # Runs of nine or more newlines.
        # NOTE(review): a threshold of 9 looks unusually high - confirm intent.
        self.extra_newlines = re.compile(r'\n{9,}')
        self.non_word_chars = re.compile(r"[^\w\s]")  # Anything that is neither a word char nor whitespace
        self.multi_space = re.compile(r" +")  # Runs of plain spaces (not tabs/newlines)

        # NLP tools: WordNet lemmatizer and the English stop-word set.
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def _get_wordnet_pos(self, treebank_tag):
        """Map a Treebank POS tag to the matching WordNet POS constant.

        Only the first letter of the tag is inspected: 'J' -> adjective,
        'V' -> verb, 'R' -> adverb. Anything else (including an empty tag)
        falls back to noun.
        """
        # Slicing instead of indexing so an empty tag cannot raise IndexError.
        return {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'R': wordnet.ADV,
        }.get(treebank_tag[:1], wordnet.NOUN)

    def lemmatize_text(self, text, use_pos=True):
        """Lemmatize every token of `text` and re-join the tokens with single spaces.

        Params:
        - text: string to lemmatize
        - use_pos: when True, POS-tag the tokens first and lemmatize each word
          with its tag; when False, use the lemmatizer's default POS

        Returns the lemmatized string. On any exception the error is logged and
        the input is returned unchanged.
        """
        try:
            tokens = word_tokenize(text)

            if use_pos:
                pos_tags = nltk.pos_tag(tokens)
                return ' '.join([
                    self.lemmatizer.lemmatize(word, self._get_wordnet_pos(tag))
                    for word, tag in pos_tags
                ])
            return ' '.join([self.lemmatizer.lemmatize(word) for word in tokens])
        except Exception as e:
            logging.error(f"Lemmatization oopsie: {str(e)}")  # Log but don't crash
            return text

    def remove_numbers(self, text):
        """Remove standalone digit runs; digits embedded in a word (e.g. B2B) are kept."""
        return re.sub(r'\b\d+\b', '', text)  # \b anchors the match to word boundaries

    def clean_text(self, text):
        """Normalize encoding and strip punctuation from `text`.

        Steps, in order: control characters -> space, NFKD normalization,
        bullet / non-breaking-space replacement, punctuation -> space,
        drop non-ASCII characters, collapse long newline runs, collapse
        repeated spaces and strip.

        On any exception the error is logged and the text in its current
        (possibly partially cleaned) state is returned.
        """
        try:
            text = self.control_chars.sub(" ", text)
            text = unicodedata.normalize("NFKD", text)
            text = text.replace("\u2022", "\n- ").replace("\xa0", " ")
            text = self.non_word_chars.sub(" ", text)
            # Characters that cannot be represented in ASCII are dropped.
            text = text.encode("ascii", "ignore").decode("utf-8")
            text = self.extra_newlines.sub("\n\n", text)
            return self.multi_space.sub(" ", text).strip()
        except Exception as e:
            logging.error(f"Cleaner tripped up: {str(e)}")  # Non-fatal error
            return text

    def clean_ratings(self, text):
        """Replace digit runs followed by M or K (case-insensitive, e.g. 12M) with 'COUNT'."""
        return re.sub(r'\b\d+[MK]\b', 'COUNT', text, flags=re.IGNORECASE)

    def clean_durations(self, text):
        """Replace durations written like '2h 30m' (case-insensitive) with 'Duration'."""
        return re.sub(r'\b\d+h\s\d+m\b', 'Duration', text, flags=re.IGNORECASE)

    def full_clean(self, text, is_title=False, remove_stopwords=True,
                   lemmatize=True, remove_numbers=False, remove_duration=False, remove_rating=False):
        """
        Master cleaning pipeline with toggleable steps.

        Order of operations: JSON-like blobs removed, '$...$' spans replaced by
        a placeholder, text/title normalization, then the optional steps in the
        order number removal, stop-word removal, lemmatization, duration
        replacement, rating replacement.

        Params:
        - text: raw string; non-string values are returned unchanged
        - is_title: use `clean_title` instead of `clean_text` for normalization
        - remove_stopwords: drop English stop words (case-insensitive match)
        - lemmatize: reduce words to their base form
        - remove_numbers: strip standalone numbers
        - remove_duration: replace '2h 30m'-style durations with 'Duration'
        - remove_rating: replace '12M' / '3K'-style counts with 'COUNT'

        Returns the cleaned string. On any exception the error is logged and
        the text in its current (possibly partially cleaned) state is returned.
        """
        if not isinstance(text, str):
            # Non-string values (e.g. NaN from an empty spreadsheet cell) cannot
            # be cleaned; return them untouched instead of raising and logging
            # an error for each one.
            return text

        try:
            # Phase 1: structure cleanup
            text = self.json_pattern.sub('', text)
            # NOTE(review): the placeholder spelling is preserved verbatim;
            # changing it would alter the cleaned output.
            text = self.math_pattern.sub('Mathemtical Expression', text)

            # Phase 2: text normalization
            text = self.clean_title(text) if is_title else self.clean_text(text)

            # Phase 3: number cleanup
            if remove_numbers:
                text = self.remove_numbers(text)

            # Phase 4: linguistic processing
            if remove_stopwords:
                text = ' '.join([word for word in text.split()
                                 if word.lower() not in self.stop_words])

            if lemmatize:
                text = self.lemmatize_text(text)

            if remove_duration:
                text = self.clean_durations(text)

            if remove_rating:
                text = self.clean_ratings(text)

            return self.multi_space.sub(' ', text).strip()

        except Exception as e:
            logging.error(f"Full clean pipeline glitch: {str(e)}")
            return text  # Return whatever we have

    def clean_title(self, text):
        """Clean a title: URL-decode, turn underscores into spaces, then normalize.

        Percent-escapes must be decoded BEFORE `clean_text` runs, because
        `clean_text` replaces '%' (like all punctuation) with a space, after
        which sequences such as '%20' can no longer be decoded.
        """
        decoded = urllib.parse.unquote(text)  # e.g. %20 -> space
        decoded = re.sub(r"_+", " ", decoded)  # Underscore runs -> single space
        cleaned = self.clean_text(decoded)
        return self.multi_space.sub(" ", cleaned).strip()  # Final whitespace pass


## Load DataSets

In [96]:
# Read both workbooks. Forward slashes resolve on Windows as well as on
# POSIX systems, whereas a backslash is only a path separator on Windows.
data1 = pd.read_excel("data/Dataset-1.xlsx")
data2 = pd.read_excel("data/Dataset-2.xlsx")

# Working copies for cleaning. `.copy()` creates independent frames; a plain
# assignment would only alias the raw frames, so edits would change both.
data1_copy = data1.copy()
data2_copy = data2.copy()

## Clean DataSets

### Cleaner Objects Initialized

In [97]:
# Single TextCleaner instance reused for every text column below.
cleaner = TextCleaner()

# One-hot encoder producing a dense int8 array; each generated column is named
# after the category value alone (no "<feature>_" prefix).
encoder = OneHotEncoder(
    sparse_output=False,
    dtype=np.int8,
    feature_name_combiner=lambda feature, category: str(category),
)

### Dataset-1

In [None]:
# Build the working frame from the loaded data without mutating it, so this
# cell can be re-run safely (an in-place drop raises KeyError the second time).
data1_copy = data1.drop(columns=["ID"])

# Clean the text columns; TITLE additionally gets the title-specific handling.
data1_copy['ABSTRACT'] = data1['ABSTRACT'].apply(
    lambda x: cleaner.full_clean(x, remove_stopwords=True, lemmatize=True)
)
data1_copy['TITLE'] = data1['TITLE'].apply(
    lambda x: cleaner.full_clean(x, remove_stopwords=True, lemmatize=True, is_title=True)
)

# Save the intermediate form (forward slash works on Windows and POSIX alike).
data1_copy.to_excel("Intermediate/data1_pass1.xlsx", index=False)

KeyboardInterrupt: 

### Dataset-2

In [None]:
# Build the working frame from the loaded data without mutating it, so this
# cell can be re-run safely (an in-place drop raises KeyError the second time).
data2_copy = data2.drop(columns=["ID"])

# Clean the text column with number, duration and rating handling switched on.
data2_copy['Content'] = data2['Content'].apply(
    lambda x: cleaner.full_clean(
        x,
        remove_stopwords=True,
        lemmatize=True,
        remove_numbers=True,
        remove_duration=True,
        remove_rating=True,
    )
)

# Replace the Domain column with its one-hot encoding.
encoded_array = encoder.fit_transform(data2_copy[["Domain"]])
new_columns = encoder.get_feature_names_out()
# Reuse the frame's own index: concat(axis=1) aligns rows on index labels, so a
# default RangeIndex here would misalign rows if the source index ever differed.
domain_onehot = pd.DataFrame(encoded_array, columns=new_columns, index=data2_copy.index)
data2_copy = pd.concat([data2_copy.drop(columns="Domain"), domain_onehot], axis=1)

# Save the intermediate form (forward slash works on Windows and POSIX alike).
data2_copy.to_excel("Intermediate/data2_pass1.xlsx", index=False)