# Assignment 2
## Roll Number : CS25MTECH02007
## Name : Rajat Maheshwari
<hr>

# Text Cleaning and Model Building for the Given Datasets

## 📝 Assignment Overview
In this assignment, I will:
1. **Implement** 4 different NLP classification models
2. **Clean and harmonize** the given data for model building
3. Perform **text-based Classification** on the cleaned data to extract insights.

# Install Modules

In [3]:
! pip install -r requirements.txt

Collecting scikit-learn (from -r requirements.txt (line 2))
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn->-r requirements.txt (line 2))
  Downloading scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->-r requirements.txt (line 2))
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.1 MB 6.3 MB/s eta 0:00:02
   -------- ------------------------------- 2.4/11.1 MB 6.1 MB/s eta 0:00:02
   --------------- ------------------------ 4.2/11.1 MB 7.0 MB/s eta 0:00:01
   ------------------- -------------------- 5.5/11.1 MB 6.7 MB/s eta 0:00:01
   ------------------------- -------------- 7.1/11.1 MB 7.2 MB/s eta 0:00:01
   --------------------------------- ------ 9.2/


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Necessary Imports

In [63]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import re
import urllib
import unicodedata
import urllib.parse
import logging
import nltk

## NLTK Downloads

In [35]:
# Fetch the NLTK resources this notebook downloads, one after another.
for resource_name in ('punkt_tab', 'stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource_name)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

## NLTK Imports

In [36]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Text Cleaner Class

In [None]:
class TextCleaner:
    """Regex- and NLTK-based text normalizer.

    The individual ``clean_*`` / ``remove_*`` methods each perform one
    transformation; ``full_clean`` chains them according to its flags.
    """

    def __init__(self):
        # Brace-delimited spans such as ``{...}`` (no nested closing brace).
        self.json_pattern = re.compile(r'\{[^}]+\}', re.DOTALL)
        # Shortest ``$...$`` spans, including ones that cross line breaks.
        self.math_pattern = re.compile(r'\$.*?\$', re.DOTALL)
        # C0/C1 control characters except line feed (\x0A) and carriage return (\x0D).
        self.control_chars = re.compile(r"[\x00-\x09\x0B\x0C\x0E-\x1F\x7F-\x9F]")
        # Runs of nine or more consecutive newlines.
        self.extra_newlines = re.compile(r'\n{9,}')
        # The standalone word "game", in any letter case.
        self.game_word = re.compile(r'\bgame\b', re.IGNORECASE)
        # Anything that is neither a word character nor whitespace.
        self.non_word_chars = re.compile(r"[^\w\s]")
        # Runs of spaces (newlines are not affected).
        self.multi_space = re.compile(r" +")

        # NLTK components; these require the corpora downloaded earlier.
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def _get_wordnet_pos(self, treebank_tag):
        """Map a Treebank POS tag to a WordNet POS constant (noun by default)."""
        return {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'R': wordnet.ADV
        }.get(treebank_tag[:1], wordnet.NOUN)

    def lemmatize_text(self, text, use_pos=True):
        """Tokenize ``text`` and lemmatize every token.

        Parameters:
            text: string to lemmatize.
            use_pos: when True, POS-tag the tokens and pass the mapped WordNet
                tag to the lemmatizer; otherwise use the lemmatizer's default.

        Returns:
            The lemmas joined by single spaces, or ``text`` unchanged if any
            step raises (the error is logged).
        """
        try:
            tokens = word_tokenize(text)
            if use_pos:
                pos_tags = nltk.pos_tag(tokens)
                return ' '.join(
                    self.lemmatizer.lemmatize(word, self._get_wordnet_pos(tag))
                    for word, tag in pos_tags
                )
            return ' '.join(self.lemmatizer.lemmatize(word) for word in tokens)
        except Exception as e:
            logging.error(f"Lemmatization error: {str(e)}")
            return text

    def remove_numbers(self, text):
        """Remove standalone numbers while preserving alphanumeric terms."""
        # \b on both sides keeps digits embedded in tokens such as 'COVID19'.
        return re.sub(r'\b\d+\b', '', text)

    def clean_text(self, text):
        """Normalize characters, strip punctuation and collapse spaces.

        Returns the cleaned string; if a step raises, the error is logged and
        ``text`` is returned as it stood at the point of failure.
        """
        try:
            text = self.control_chars.sub(" ", text)
            text = unicodedata.normalize("NFKD", text)
            # NFKD splits accented letters into base letter + combining mark.
            # Drop the marks here; otherwise the punctuation pass below would
            # turn each mark into a space and split the word ("nai ve").
            text = "".join(ch for ch in text if not unicodedata.combining(ch))
            text = text.replace("\u2022", "\n- ").replace("\xa0", " ")
            text = self.non_word_chars.sub(" ", text)
            # Discard whatever non-ASCII characters are still left.
            text = text.encode("ascii", "ignore").decode("utf-8")
            text = self.game_word.sub("", text)
            text = self.extra_newlines.sub("\n\n", text)
            return self.multi_space.sub(" ", text).strip()
        except Exception as e:
            logging.error(f"Cleaning error: {str(e)}")
            return text

    def clean_ratings(self, text):
        """Replace count tokens such as 12M or 128k with the word COUNT."""
        return re.sub(r'\b\d+[MK]\b', 'COUNT', text, flags=re.IGNORECASE)

    @staticmethod
    def clean_durations(text):
        """Replace "Xh Ym" patterns (e.g. "2h 30m") with the word Duration.

        Declared static so that both ``self.clean_durations(text)`` and
        ``TextCleaner.clean_durations(text)`` work; without the decorator the
        instance call passed two arguments to a one-parameter function.
        """
        return re.sub(r'\b\d+h\s\d+m\b', 'Duration', text, flags=re.IGNORECASE)

    def full_clean(self, text, is_title=False, remove_stopwords=True,
                   lemmatize=True, remove_numbers=False, remove_duration=False, remove_rating=False):
        """Run the cleaning pipeline on ``text``; flags select the stages.

        Parameters:
            text: raw string to clean.
            is_title: use ``clean_title`` instead of ``clean_text``.
            remove_stopwords: drop tokens found in the English stopword set.
            lemmatize: lemmatize the remaining tokens with POS tagging.
            remove_numbers: remove standalone numeric tokens.
            remove_duration: replace "Xh Ym" patterns via ``clean_durations``.
            remove_rating: replace "12M"/"128k" tokens via ``clean_ratings``.

        Returns:
            The cleaned string. If any stage raises, the error is logged and
            ``text`` is returned as it stood when the failure occurred.
        """
        try:
            # Structural cleaning
            text = self.json_pattern.sub('', text)
            text = self.math_pattern.sub('Mathematical expression', text)

            # Text normalization
            text = self.clean_title(text) if is_title else self.clean_text(text)

            # Numerical cleaning
            if remove_numbers:
                text = self.remove_numbers(text)

            # Linguistic processing
            if remove_stopwords:
                text = ' '.join(word for word in text.split()
                                if word.lower() not in self.stop_words)

            if lemmatize:
                text = self.lemmatize_text(text)

            if remove_duration:
                text = self.clean_durations(text)

            if remove_rating:
                text = self.clean_ratings(text)

            return self.multi_space.sub(' ', text).strip()

        except Exception as e:
            logging.error(f"Cleaning pipeline error: {str(e)}")
            return text

    def clean_title(self, text):
        """Specialized title cleaning: URL-decode, clean, then split on underscores."""
        # Decode percent-escapes first: clean_text strips '%', after which
        # "%20" could no longer be recognized and would leave a stray "20".
        decoded = urllib.parse.unquote(text)
        cleaned = self.clean_text(decoded)
        # Underscores count as word characters, so clean_text keeps them.
        cleaned = re.sub(r"_+", " ", cleaned)
        return self.multi_space.sub(" ", cleaned).strip()


## Load DataSets

In [77]:
# Read the two raw workbooks. Forward slashes keep the relative paths usable
# on every operating system (the backslash form only resolves on Windows).
data1 = pd.read_excel("data/Dataset-1.xlsx")
data2 = pd.read_excel("data/Dataset-2.xlsx")

# Real copies: a plain `data1_copy = data1` only binds a second name to the
# same frame, so every edit to the "copy" would also change the raw data.
data1_copy = data1.copy()
data2_copy = data2.copy()

## Clean DataSets

### Cleaner Objects Initialized

In [79]:
def _category_as_column_name(feature, category):
    """Name a one-hot column after the category value alone."""
    return str(category)


# Shared text cleaner used by the dataset-cleaning cells.
cleaner = TextCleaner()

# Dense int8 one-hot encoder whose output columns are named by category only.
encoder = OneHotEncoder(
    sparse_output=False,
    dtype=np.int8,
    feature_name_combiner=_category_as_column_name,
)

In [80]:
# Build the cleaned frame from the raw one instead of editing in place, so this
# cell can be re-run safely (an in-place drop of "ID" fails the second time)
# and data1 keeps the raw text.
data1_copy = data1.drop(columns=["ID"]).assign(
    ABSTRACT=lambda frame: frame["ABSTRACT"].apply(
        lambda x: cleaner.full_clean(x, remove_stopwords=True, lemmatize=True)
    ),
    TITLE=lambda frame: frame["TITLE"].apply(
        lambda x: cleaner.full_clean(x, remove_stopwords=True, lemmatize=True, is_title=True)
    ),
)

# Forward slash keeps the output path portable across operating systems.
data1_copy.to_excel("Intermediate/data1_pass1.xlsx", index=False)

In [82]:

# Start from the raw frame every time so the cell is re-runnable: the result
# no longer has a "Domain" column, so reusing it as the input would fail.
data2_copy = data2.assign(
    Content=lambda frame: frame["Content"].apply(
        lambda x: cleaner.full_clean(x, remove_stopwords=True, lemmatize=True, remove_numbers=True)
    )
)

# One-hot encode Domain and swap the original column for the indicator columns.
encoded_array = encoder.fit_transform(data2_copy[["Domain"]])
new_columns = encoder.get_feature_names_out()
data2_copy = pd.concat([
    data2_copy.drop("Domain", axis=1),
    # Reuse the source index so concat lines the rows up by label
    # even if the frame does not carry a default 0..n-1 index.
    pd.DataFrame(encoded_array, columns=new_columns, index=data2_copy.index)
], axis=1)

# Forward slash keeps the output path portable across operating systems.
data2_copy.to_excel("Intermediate/data2_pass2.xlsx", index=False)

In [76]:
# Show the column labels of data2_copy.
# NOTE(review): the saved output below carries execution count 76, lower than the
# encoding cell's 82, and still lists 'Domain' - re-run the notebook in order to refresh it.
data2_copy.columns

Index(['Content', 'Domain'], dtype='object')