<a href="https://colab.research.google.com/github/Sankytanky100/software-engineering/blob/main/Text_Classification_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Project Overview*

**Text Classification Pipeline with OOP in Python**

Objectives:

- **Data Loading and Preprocessing:** Load the dataset, then clean and preprocess the text.
- **Feature Extraction:** Convert the text into numerical features using TF-IDF.
- **Model Building:** Train a classification model (e.g., Logistic Regression).
- **Model Evaluation:** Evaluate the model's performance.
- **Pipeline Management:** Use OOP to create reusable, modular code components.

Why This Project?


In [1]:
# Install necessary packages
!pip install scikit-learn

# Import libraries
import numpy as np
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix




In [2]:
class DataHandler:
    """
    Base interface for objects that load a dataset from some source.

    Keeps the source location plus a slot for the loaded data. The loader
    itself is a placeholder that concrete handlers are meant to override.
    """
    def __init__(self, data_source):
        # Nothing is read at construction time, so ``data`` starts out empty.
        self.data_source, self.data = data_source, None

    def load_data(self):
        """
        Placeholder loader: always raises NotImplementedError.
        """
        message = "Subclass must implement abstract method"
        raise NotImplementedError(message)


In [13]:
class CSVDataHandler:
    """
    Loads a delimited text file into a pandas DataFrame.

    Despite the "CSV" in the name, the defaults describe a headerless,
    tab-separated file with a label column followed by a text column.
    Pass ``sep=','`` together with a suitable ``header`` / ``names`` to
    read an ordinary comma-separated file instead.
    """
    def __init__(self, data_source, sep='\t', header=None, names=('label', 'text')):
        """
        Args:
            data_source: Location of the file; passed to ``pd.read_csv``.
            sep: Field delimiter. Defaults to a tab character.
            header: Row number that holds the column names, or None when
                the file has no header row.
            names: Column names to assign, or None to take them from the
                file's header row.
        """
        self.data_source = data_source
        self.sep = sep
        self.header = header
        # Copy into a list so a tuple default is never shared or mutated.
        self.names = None if names is None else list(names)
        self.data = None

    def load_data(self):
        """
        Reads the file into ``self.data`` and returns the DataFrame.
        """
        self.data = pd.read_csv(
            self.data_source,
            sep=self.sep,
            header=self.header,
            names=self.names,
        )
        print("Data loaded successfully from {}".format(self.data_source))
        return self.data


In [14]:
class TextPreprocessor:
    """
    A class for text preprocessing.

    Cleaning removes HTML tags, URLs, punctuation and digits, collapses
    runs of whitespace into single spaces, and lower-cases the result.
    """
    # Patterns are compiled once at class level and reused for every text.
    _HTML_TAG = re.compile(r'<[^>]+>')
    _URL = re.compile(r'http\S+')
    _DIGITS = re.compile(r'\d+')
    _WHITESPACE = re.compile(r'\s+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def clean_text(self, text):
        """
        Cleans a single text.

        Args:
            text: The raw text. ``None`` and float NaN (how pandas marks a
                missing value) are treated as empty text; any other
                non-string value is converted with ``str``.

        Returns:
            The cleaned, lower-cased string.
        """
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
        # Strip tags BEFORE URLs: the URL pattern consumes every
        # non-space character, so on '<a href="http://x">link</a>' it would
        # otherwise swallow the closing tag and the link text with it.
        # Tags become a space so the words they separated do not fuse.
        text = self._HTML_TAG.sub(' ', text)
        # Remove URLs
        text = self._URL.sub('', text)
        # Remove punctuation
        text = text.translate(self._PUNCTUATION_TABLE)
        # Remove numbers
        text = self._DIGITS.sub('', text)
        # Collapse the gaps left by the removals above and trim both ends
        text = self._WHITESPACE.sub(' ', text).strip()
        return text.lower()

    def preprocess(self, texts):
        """
        Applies ``clean_text`` to every item of an iterable of texts.

        Returns:
            A list of cleaned strings, in the same order as the input.
        """
        return [self.clean_text(text) for text in texts]


In [15]:
class FeatureExtractor:
    """
    A class for feature extraction from text data.

    Wraps a vectorizer object that exposes ``fit_transform`` and
    ``transform``; a default ``TfidfVectorizer`` is used when none is given.
    """
    def __init__(self, vectorizer=None):
        """
        Args:
            vectorizer: Optional vectorizer to use in place of the default
                ``TfidfVectorizer()``, for example one configured with
                custom parameters.
        """
        if vectorizer is None:
            self.vectorizer = TfidfVectorizer()
        else:
            self.vectorizer = vectorizer

    def fit_transform(self, texts):
        """
        Fits the vectorizer to the texts and returns their features.
        """
        return self.vectorizer.fit_transform(texts)

    def transform(self, texts):
        """
        Returns features for the texts using the already-fitted vectorizer.
        """
        return self.vectorizer.transform(texts)


In [16]:
class ModelTrainer:
    """
    Holds a classifier and fits it on request.
    """
    def __init__(self, model=None):
        # Fall back to a default LogisticRegression when no model is supplied.
        self.model = LogisticRegression() if model is None else model

    def train(self, X_train, y_train):
        """
        Fits the held model on the training data and returns that model.
        """
        fitted = self.model
        fitted.fit(X_train, y_train)
        print("Model trained successfully.")
        return fitted


In [17]:
class ModelEvaluator:
    """
    Prints evaluation metrics for a fitted classification model.
    """
    def evaluate(self, model, X_test, y_test):
        """
        Predicts on the test features, then prints the classification
        report followed by the confusion matrix. Returns nothing.
        """
        predictions = model.predict(X_test)
        sections = (
            ("Classification Report:\n", classification_report),
            ("Confusion Matrix:\n", confusion_matrix),
        )
        # Each metric is computed right before it is printed, heading first.
        for heading, metric in sections:
            print(heading)
            print(metric(y_test, predictions))


In [18]:
class TextClassificationPipeline:
    """
    A class that encapsulates the entire text classification pipeline.

    The stages run in order: load, preprocess, train/test split, feature
    extraction, training, evaluation.
    """
    def __init__(self, data_handler, preprocessor, feature_extractor, model_trainer, evaluator,
                 test_size=0.2, random_state=42, text_column='text', label_column='label'):
        """
        Args:
            data_handler: Object whose ``load_data()`` returns the dataset.
            preprocessor: Object whose ``preprocess(texts)`` returns cleaned texts.
            feature_extractor: Object exposing ``fit_transform`` and ``transform``.
            model_trainer: Object whose ``train(X, y)`` returns the fitted model.
            evaluator: Object exposing ``evaluate(model, X_test, y_test)``.
            test_size: Share of the data held out for evaluation.
            random_state: Seed passed to the train/test split.
            text_column: Name of the column that holds the raw text.
            label_column: Name of the column that holds the labels.
        """
        self.data_handler = data_handler
        self.preprocessor = preprocessor
        self.feature_extractor = feature_extractor
        self.model_trainer = model_trainer
        self.evaluator = evaluator
        self.test_size = test_size
        self.random_state = random_state
        self.text_column = text_column
        self.label_column = label_column

    def run(self):
        """
        Runs every stage of the pipeline and returns the trained model.

        Raises:
            KeyError: If the loaded data lacks the text or label column.
        """
        # Load data
        data = self.data_handler.load_data()

        # Fail early, naming the columns that are actually missing
        missing = [col for col in (self.text_column, self.label_column) if col not in data]
        if missing:
            raise KeyError("Loaded data is missing required column(s): {}".format(missing))
        texts = data[self.text_column]
        labels = data[self.label_column]

        # Preprocess text
        texts = self.preprocessor.preprocess(texts)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=self.test_size, random_state=self.random_state
        )

        # Feature extraction: fit on the training split only, then reuse
        # the fitted extractor for the test split
        X_train_feats = self.feature_extractor.fit_transform(X_train)
        X_test_feats = self.feature_extractor.transform(X_test)

        # Train model
        model = self.model_trainer.train(X_train_feats, y_train)

        # Evaluate model
        self.evaluator.evaluate(model, X_test_feats, y_test)

        # Hand the fitted model back so callers can reuse it
        return model


In [19]:
# Download the dataset
!wget https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv


--2024-11-19 00:22:22--  https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 477907 (467K) [text/plain]
Saving to: ‘sms.tsv.1’


2024-11-19 00:22:22 (16.0 MB/s) - ‘sms.tsv.1’ saved [477907/477907]



In [20]:
# Load data
data_source = 'sms.tsv'
data = pd.read_table(data_source, header=None, names=['label', 'text'])

# Map labels to binary values.
# NOTE: this frame is a standalone copy. The pipeline built further down
# re-reads `data_source` through its data handler, so the model is trained
# on the original 'ham'/'spam' strings rather than on these 0/1 values.
label_map = {'ham': 0, 'spam': 1}

# Series.map turns any value missing from the mapping into NaN without
# complaint, so reject unexpected labels before they are lost.
unknown_labels = set(data['label'].dropna().unique()) - set(label_map)
assert not unknown_labels, "Unexpected labels in {}: {}".format(
    data_source, sorted(map(str, unknown_labels))
)
data['label'] = data['label'].map(label_map)


In [21]:
# Build one instance of every pipeline stage, in the order the stages run
data_handler = CSVDataHandler(data_source=data_source)
preprocessor, feature_extractor = TextPreprocessor(), FeatureExtractor()
model_trainer, evaluator = ModelTrainer(), ModelEvaluator()


In [22]:
# Wire the stages together; arguments follow the constructor's order:
# data handler, preprocessor, feature extractor, trainer, evaluator
pipeline = TextClassificationPipeline(
    data_handler, preprocessor, feature_extractor, model_trainer, evaluator
)

# Execute every stage: load, clean, split, vectorize, train, evaluate
pipeline.run()


Data loaded successfully from sms.tsv
Model trained successfully.
Classification Report:

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:

[[965   1]
 [ 32 117]]
