In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.utils import check_X_y

### Custom Estimator

In [None]:
class MostFrequentClassClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that always predicts the majority class seen in ``fit``.

    Attributes created by ``fit`` (they do not exist before fitting, following
    the scikit-learn convention that trailing-underscore attributes are only
    set while fitting):
        classes_ : ndarray of the distinct labels found in the training target.
        most_frequent_ : the label with the highest count in the training
            target; on a tie the first label in sorted order wins.
    """

    def __init__(self):
        """The estimator has no hyper-parameters, so there is nothing to store."""

    def fit(self, X, y):
        """Learn the most frequent label in ``y``.

        Parameters:
            X: 2D array-like of features (validated, otherwise unused).
            y: array-like of class labels, one per row of ``X``.

        Returns:
            self, so calls can be chained.
        """
        # Validate input X and target vector y
        X, y = check_X_y(X, y)

        # Ensure y is 1D
        y = np.ravel(y)

        # np.unique returns sorted labels; argmax picks the first maximal count
        unique_classes, counts = np.unique(y, return_counts=True)
        self.classes_ = unique_classes
        self.most_frequent_ = unique_classes[np.argmax(counts)]

        return self

    def predict(self, X):
        """Return the majority label once for every sample in ``X``.

        Parameters:
            X: array-like with one entry (row) per sample; objects exposing
               ``.shape`` and plain sequences such as lists are both accepted.

        Returns:
            1D ndarray of length ``n_samples`` filled with ``most_frequent_``.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        most_frequent = getattr(self, "most_frequent_", None)
        if most_frequent is None:
            raise ValueError("This classifier instance is not fitted yet.")

        # Only the number of samples matters; fall back to len() for lists
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        return np.full(shape=(n_samples,), fill_value=most_frequent)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Initialize and fit the custom estimator
classifier = MostFrequentClassClassifier()
classifier.fit(X_train, y_train)

# Make predictions (this must run: the print below reads `predictions`)
predictions = classifier.predict(X_test)

# Every prediction is the same label, so showing the first one is enough
print(f"Predicted class for all test instances: {predictions[0]}")


Predicted class for all test instances: 1


In [None]:
# Inspect the majority-class label that fit() stored on the classifier
classifier.most_frequent_

1

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validated scores of the baseline on the training split
cross_val_score(estimator=classifier, X=X_train, y=y_train)

array([0.34782609, 0.34782609, 0.31818182, 0.36363636, 0.36363636])

### Scoring function

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
import numpy as np

class MostFrequentClassClassifier(BaseEstimator, ClassifierMixin):
    """Majority-class baseline with an explicit accuracy ``score`` method.

    Attributes created by ``fit`` (absent until then, following the
    scikit-learn convention for trailing-underscore attributes):
        classes_ : ndarray of the distinct labels found in the training target.
        most_frequent_ : the label with the highest count in the training
            target; on a tie the first label in sorted order wins.
    """

    def __init__(self):
        """The estimator has no hyper-parameters, so there is nothing to store."""

    def fit(self, X, y):
        """Learn the most frequent label in ``y``; ``X`` is not inspected.

        Returns:
            self, so calls can be chained.
        """
        # Ensure y is 1D
        y = np.ravel(y)

        # np.unique returns sorted labels; argmax picks the first maximal count
        unique_classes, counts = np.unique(y, return_counts=True)
        self.classes_ = unique_classes
        self.most_frequent_ = unique_classes[np.argmax(counts)]
        return self

    def predict(self, X):
        """Return the majority label once for every sample in ``X``.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        most_frequent = getattr(self, "most_frequent_", None)
        if most_frequent is None:
            raise ValueError("This classifier instance is not fitted yet.")

        # Only the number of samples matters; fall back to len() for lists
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        return np.full(shape=(n_samples,), fill_value=most_frequent)

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        Parameters:
            X: samples to predict on.
            y: true labels for ``X``.
            sample_weight: optional per-sample weights forwarded to
                ``accuracy_score``; ``None`` weighs every sample equally.
        """
        # Ensure y is 1D
        y = np.ravel(y)

        # Generate predictions
        predictions = self.predict(X)

        # Calculate and return the accuracy
        return accuracy_score(y, predictions, sample_weight=sample_weight)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Fetch the iris features and labels
iris = load_iris()
X = iris.data
y = iris.target

# Keep only classes 0 and 1 to obtain a binary problem
is_class_0_or_1 = y < 2
X_bin, y_bin = X[is_class_0_or_1], y[is_class_0_or_1]

# Hold out 20% of the binary subset for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_bin, y_bin, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can be chained
classifier = MostFrequentClassClassifier().fit(X_train, y_train)

# Accuracy on the held-out split via the custom score method
score = classifier.score(X_test, y_test)
print(f"Accuracy of the MostFrequentClassClassifier: {score}")


Accuracy of the MostFrequentClassClassifier: 0.4


### Transformers

In [None]:
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Synthetic regression problem with two features
X, y = make_regression(
    n_samples=100,
    n_features=2,
    noise=0.1,
    random_state=42,
)

# Call the transformer on its own: learn the scaling, then apply it
X_transformed = StandardScaler().fit(X).transform(X)

# Fit a linear model on the standardized features
LinearRegression().fit(X_transformed, y)


### Custom Transformer using Function Transformer

In [None]:
import numpy as np

def cube(x):
    """Raise ``x`` element-wise to the third power and return the result."""
    exponent = 3
    return np.power(x, exponent)


In [None]:
from sklearn.preprocessing import FunctionTransformer

# Wrap cube() so it exposes the scikit-learn transformer interface
cube_transformer = FunctionTransformer(func=cube)


In [None]:
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Synthetic regression problem with two features
X, y = make_regression(
    n_samples=100,
    n_features=2,
    noise=0.1,
    random_state=42,
)

# Apply the function-based transformer on its own
X_transformed = cube_transformer.transform(X)

# Fit a linear model on the cubed features
LinearRegression().fit(X=X_transformed, y=y)


### Custom Transformer using BaseEstimator and TransformerMixin

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [None]:
class MedianIQRScaler(BaseEstimator, TransformerMixin):
    """Scale features by subtracting the median and dividing by the IQR.

    Attributes created by ``fit`` (absent until then, following the
    scikit-learn convention for trailing-underscore attributes):
        medians_ : per-feature median of the training data.
        iqr_ : per-feature interquartile range (Q3 - Q1) of the training data,
            with zero ranges replaced by 1 so ``transform`` never divides by 0.
    """

    def __init__(self):
        """The transformer has no hyper-parameters, so there is nothing to store."""

    def fit(self, X, y=None):
        """Compute the per-feature median and interquartile range of ``X``.

        Parameters:
            X: array-like of numeric features, samples along axis 0.
            y: ignored; present for pipeline compatibility.

        Returns:
            self, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)

        # Calculate medians and interquartile range for each feature
        self.medians_ = np.median(X, axis=0)
        q1, q3 = np.percentile(X, [25, 75], axis=0)
        iqr = q3 - q1

        # Replace zero IQRs by 1 to avoid division by zero during transform.
        # np.where (rather than in-place assignment) also works when the IQR
        # is a NumPy scalar, i.e. when X is one-dimensional.
        self.iqr_ = np.where(iqr == 0, 1.0, iqr)
        return self

    def transform(self, X):
        """Return ``(X - medians_) / iqr_`` using the statistics learned in ``fit``.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        # Check if fit has been called
        if getattr(self, "medians_", None) is None or getattr(self, "iqr_", None) is None:
            raise RuntimeError("The transformer has not been fitted yet.")

        # Plain sequences (lists, tuples) do not support array arithmetic;
        # objects that already expose .shape are passed through unchanged.
        if not hasattr(X, "shape"):
            X = np.asarray(X, dtype=float)

        # Scale features using median and IQR learned during fit
        return (X - self.medians_) / self.iqr_


In [None]:
from sklearn.datasets import make_blobs

# Three synthetic clusters in two dimensions; the cluster labels are not needed
X, _ = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    random_state=42,
)

# Create the scaler and learn the per-feature median and IQR
scaler = MedianIQRScaler()
scaler.fit(X)

# Apply the learned statistics to the same data
X_scaled = scaler.transform(X)

# Show the first few transformed rows
print("Transformed data (first 5 rows):")
print(X_scaled[:5])


Transformed data (first 5 rows):
[[-0.49872679 -0.71613207]
 [ 0.78423675 -0.08192868]
 [-0.03656645  0.52987512]
 [ 0.84159877 -0.09379661]
 [-0.3814692  -0.57206564]]


### Column Transformer

In [None]:
import pandas as pd

# Toy review dataset; sentiment is encoded as 1 = Positive, 0 = Negative
data = {
    # The three platforms repeat in a fixed cycle across the ten rows
    "Social Media Platform": ["Twitter", "Facebook", "Instagram"] * 3 + ["Twitter"],
    "Review": [
        "Love the new update!",
        "Too many ads now",
        "Great for sharing photos",
        "Newsfeed algorithm is biased",
        "Privacy concerns with latest update",
        "Amazing filters!",
        "Too much spam",
        "Easy to connect with friends",
        "Stories feature is fantastic",
        "Customer support lacking",
    ],
    # Two ages are deliberately missing
    "age": [21, 19, np.nan, 17, 24, np.nan, 30, 19, 16, 31],
    "Sentiment": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0],
}

# Build the DataFrame from the column dictionary
df = pd.DataFrame(data)

print(df)

  Social Media Platform                               Review   age  Sentiment
0               Twitter                 Love the new update!  21.0          1
1              Facebook                     Too many ads now  19.0          0
2             Instagram             Great for sharing photos   NaN          1
3               Twitter         Newsfeed algorithm is biased  17.0          0
4              Facebook  Privacy concerns with latest update  24.0          0
5             Instagram                     Amazing filters!   NaN          1
6               Twitter                        Too much spam  30.0          0
7              Facebook         Easy to connect with friends  19.0          1
8             Instagram         Stories feature is fantastic  16.0          1
9               Twitter             Customer support lacking  31.0          0


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Route each column to the transformer that suits its data type
column_transformer = ColumnTransformer(
    transformers=[
        ("platform_ohe", OneHotEncoder(), ["Social Media Platform"]),
        ("review_bow", CountVectorizer(), "Review"),
        ("age_impute", SimpleImputer(), ["age"]),
    ],
    # Columns not named above are discarded
    remainder="drop",
)

This code snippet defines a ColumnTransformer object from scikit-learn. This is a powerful tool for applying different transformations to different columns of your data within a single step.

Here's a breakdown of what's happening:

- column_transformer = ColumnTransformer(...): This line creates an instance of the ColumnTransformer.
- transformers=[...]: This is a list of tuples, where each tuple defines a transformation to be applied to specific columns:
  - ('platform_ohe', OneHotEncoder(), ['Social Media Platform']): This applies OneHotEncoder to the 'Social Media Platform' column. One-hot encoding is used for categorical features, converting each category into a new binary column.
  - ('review_bow', CountVectorizer(), 'Review'): This applies CountVectorizer to the 'Review' column. CountVectorizer converts a collection of text documents to a matrix of token counts, essentially creating a "bag of words" representation.
  - ('age_impute', SimpleImputer(),['age']): This applies SimpleImputer to the 'age' column. SimpleImputer is used to fill in missing values (like NaN in the example data) with a specified strategy (by default, it uses the mean of the column).
- remainder='drop': This argument specifies what to do with the columns that are not listed in the transformers list. In this case, drop means that any columns in the input DataFrame that are not 'Social Media Platform', 'Review', or 'age' will be dropped from the transformed output.


In essence, this ColumnTransformer is set up to preprocess the 'Social Media Platform', 'Review', and 'age' columns for use in a machine learning model, while discarding any other columns.

In [None]:
# Fit and apply the column transformer, then label the densified result
pd.DataFrame(
    data=column_transformer.fit_transform(df).toarray(),
    columns=column_transformer.get_feature_names_out(),
)

Unnamed: 0,platform_ohe__Social Media Platform_Facebook,platform_ohe__Social Media Platform_Instagram,platform_ohe__Social Media Platform_Twitter,review_bow__ads,review_bow__algorithm,review_bow__amazing,review_bow__biased,review_bow__concerns,review_bow__connect,review_bow__customer,...,review_bow__sharing,review_bow__spam,review_bow__stories,review_bow__support,review_bow__the,review_bow__to,review_bow__too,review_bow__update,review_bow__with,age_impute__age
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,21.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,30.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,19.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,31.0


This code snippet applies the ColumnTransformer you defined earlier to your DataFrame and then converts the output to a pandas DataFrame for better readability.

Here's a breakdown:

- column_transformer.fit_transform(df).toarray(): This is the core of the code. fit_transform does two things:
  - fit: It learns the necessary parameters for each transformation defined in the ColumnTransformer (e.g., the vocabulary for CountVectorizer, the mean for SimpleImputer, and the categories for OneHotEncoder) based on the input DataFrame df.
  - transform: It then applies these learned transformations to the DataFrame. The trailing .toarray() call is used because the output of the ColumnTransformer, especially with CountVectorizer, can be a sparse matrix, and .toarray() converts it to a dense NumPy array.
- pd.DataFrame(...): This wraps the output of the fit_transform method in a pandas DataFrame. This makes the transformed data easier to view and work with, as it provides column headers and a clear structure.
- columns=column_transformer.get_feature_names_out(): This assigns meaningful column names to the new DataFrame. The get_feature_names_out() method of the ColumnTransformer generates names based on the transformer names and the original column names, helping you understand which transformed feature corresponds to which original column and transformation.


In summary, this code executes the preprocessing steps defined in your ColumnTransformer on the DataFrame and presents the result as a structured pandas DataFrame.

### Feature Union

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the draw below is identical on every run
np.random.seed(42)

# 10 rows x 4 columns of standard-normal values
data = np.random.randn(10, 4)

# Three feature columns followed by the target column
df = pd.DataFrame(data=data, columns=['f1', 'f2', 'f3', 'y'])

df

Unnamed: 0,f1,f2,f3,y
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304
5,1.465649,-0.225776,0.067528,-1.424748
6,-0.544383,0.110923,-1.150994,0.375698
7,-0.600639,-0.291694,-0.601707,1.852278
8,-0.013497,-1.057711,0.822545,-1.220844
9,0.208864,-1.95967,-1.328186,0.196861


In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA

# Run both transformers on the same input and concatenate their outputs column-wise
feature_union = FeatureUnion(
    transformer_list=[
        ("scaler", StandardScaler()),   # standardized copy of every input feature
        ("pca", PCA(n_components=2)),   # projection onto 2 principal components
    ]
)

In [None]:
# Fit the union on the feature columns only (the target 'y' is excluded)
X_transformed = feature_union.fit_transform(df.drop(columns=['y']))

# Label each output column with the name generated by the union
pd.DataFrame(
    data=X_transformed,
    columns=feature_union.get_feature_names_out(),
)

Unnamed: 0,scaler__f1,scaler__f2,scaler__f3,pca__pca0,pca__pca1
0,0.815293,0.41836,0.947878,1.025659,-0.425413
1,-0.282292,0.302777,1.873701,1.772532,-0.358223
2,-0.635686,1.239158,-0.156427,0.327888,1.038742
3,0.432718,-1.721587,-1.410206,-1.911072,-0.68996
4,-1.451676,0.963905,-0.598312,-0.193153,1.371662
5,2.270396,0.312856,0.371269,0.51176,-0.891133
6,-0.74818,0.718778,-0.839795,-0.48428,1.020731
7,-0.832663,0.233387,-0.29387,-0.191723,0.583958
8,0.04908,-0.690119,1.121664,0.726878,-0.811461
9,0.383011,-1.777515,-1.015903,-1.584488,-0.838903


### Pipeline

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the draw below is identical on every run
np.random.seed(42)

# 10 rows x 4 columns of standard-normal values
data = np.random.randn(10, 4)

# Three feature columns followed by the target column
df = pd.DataFrame(data=data, columns=['f1', 'f2', 'f3', 'y'])

df

Unnamed: 0,f1,f2,f3,y
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304
5,1.465649,-0.225776,0.067528,-1.424748
6,-0.544383,0.110923,-1.150994,0.375698
7,-0.600639,-0.291694,-0.601707,1.852278
8,-0.013497,-1.057711,0.822545,-1.220844
9,0.208864,-1.95967,-1.328186,0.196861


In [None]:
# Feature matrix = every column except the target; target = the 'y' column
X, y = df.drop(columns=["y"]), df["y"]

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Define Pipeline: the steps run in sequence, each feeding its output to the next
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Apply StandardScaler
    ('pca', PCA(n_components=2))   # Then apply PCA to the scaled data, keeping 2 components
])

In [None]:
# Fit both pipeline steps on X and return the output of the final (PCA) step
pipeline.fit_transform(X)

array([[ 0.25940603,  1.27500545],
       [ 1.07464482,  1.25582778],
       [ 1.22739912, -0.45015684],
       [-2.07922037, -0.9000351 ],
       [ 1.26713401, -1.33294108],
       [-0.78032119,  1.77643911],
       [ 0.62783735, -1.07499308],
       [ 0.4974709 , -0.74814762],
       [-0.14358198,  0.83630209],
       [-1.95076869, -0.63730071]])

In [None]:
# Same transformation as above, wrapped in a DataFrame with the pipeline's output names
pd.DataFrame(
    data=pipeline.fit_transform(X),
    columns=pipeline.get_feature_names_out(),
)

Unnamed: 0,pca0,pca1
0,0.259406,1.275005
1,1.074645,1.255828
2,1.227399,-0.450157
3,-2.07922,-0.900035
4,1.267134,-1.332941
5,-0.780321,1.776439
6,0.627837,-1.074993
7,0.497471,-0.748148
8,-0.143582,0.836302
9,-1.950769,-0.637301


### Slightly Complex Example - Applies all three concepts (ColumnTransformer, FeatureUnion, Pipeline)

In [None]:
import pandas as pd

# Toy review dataset; sentiment is encoded as 1 = Positive, 0 = Negative
data = {
    # The three platforms repeat in a fixed cycle across the ten rows
    "Social Media Platform": ["Twitter", "Facebook", "Instagram"] * 3 + ["Twitter"],
    "Review": [
        "Love the new update!",
        "Too many ads now",
        "Great for sharing photos",
        "Newsfeed algorithm is biased",
        "Privacy concerns with latest update",
        "Amazing filters!",
        "Too much spam",
        "Easy to connect with friends",
        "Stories feature is fantastic",
        "Customer support lacking",
    ],
    # Two ages are deliberately missing
    "age": [21, 19, np.nan, 17, 24, np.nan, 30, 19, 16, 31],
    "Sentiment": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0],
}

# Build the DataFrame from the column dictionary
df = pd.DataFrame(data)

print(df)

  Social Media Platform                               Review   age  Sentiment
0               Twitter                 Love the new update!  21.0          1
1              Facebook                     Too many ads now  19.0          0
2             Instagram             Great for sharing photos   NaN          1
3               Twitter         Newsfeed algorithm is biased  17.0          0
4              Facebook  Privacy concerns with latest update  24.0          0
5             Instagram                     Amazing filters!   NaN          1
6               Twitter                        Too much spam  30.0          0
7              Facebook         Easy to connect with friends  19.0          1
8             Instagram         Stories feature is fantastic  16.0          1
9               Twitter             Customer support lacking  31.0          0


In [None]:
def count_words(reviews):
    """Return a column vector with the whitespace-separated word count of each review.

    ``reviews`` is iterated once and every item must provide ``.split()``;
    the result has one row per review and a single column.
    """
    counts = []
    for review in reviews:
        counts.append(len(review.split()))
    return np.array(counts).reshape(-1, 1)

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Expose count_words through the scikit-learn transformer interface
word_count_transformer = FunctionTransformer(func=count_words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import numpy as np


# Two views of the review text, concatenated column-wise
feature_union = FeatureUnion(
    transformer_list=[
        ("word_count", word_count_transformer),   # hand-written feature: words per review
        ("bag_of_words", CountVectorizer()),      # token-count matrix
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
import numpy as np


# Per-column preprocessing; the review column goes through the feature union
column_transformer = ColumnTransformer(
    transformers=[
        ("age_imputer", SimpleImputer(strategy="mean"), ["age"]),
        ("platform_ohe", OneHotEncoder(), ["Social Media Platform"]),
        ("review_processing", feature_union, "Review"),
    ],
    # Columns not named above are discarded
    remainder="drop",
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest,chi2

# Preprocess -> scale -> keep the 10 best chi-square features -> classify
final_pipeline = Pipeline(
    steps=[
        ("col_transformer", column_transformer),
        ("scaler", MaxAbsScaler()),
        ("selector", SelectKBest(chi2, k=10)),
        ("classifier", LogisticRegression()),
    ]
)

In [None]:
# Train on every column except the target; 'Sentiment' is the label
final_pipeline.fit(X=df.drop(columns=['Sentiment']), y=df['Sentiment'])

You're working with a **complete machine learning pipeline** built using `scikit-learn`. This pipeline prepares a small dataset of **social media reviews**, extracts meaningful features (including text), and trains a **Logistic Regression classifier** to predict **sentiment** (positive or negative). Let's break it down step by step so it's super clear.

---

### 🔸 1. **Dataset Creation**

```python
import pandas as pd
import numpy as np

data = { ... }  # dictionary with social media platform, review, age, sentiment
df = pd.DataFrame(data)
```

You're creating a **Pandas DataFrame** with the following columns:

* `"Social Media Platform"` (categorical): e.g., Twitter, Facebook
* `"Review"` (text): short reviews
* `"age"` (numerical): may contain missing values (NaNs)
* `"Sentiment"` (target): binary classification — 1 = Positive, 0 = Negative

---

### 🔸 2. **Word Count Function**

```python
def count_words(reviews):
    return np.array([len(review.split()) for review in reviews]).reshape(-1, 1)
```

This function calculates the **number of words in each review**.

---

### 🔸 3. **FunctionTransformer**

```python
from sklearn.preprocessing import FunctionTransformer

word_count_transformer = FunctionTransformer(count_words)
```

This wraps the `count_words()` function so it can be used inside a pipeline just like a regular transformer.

---

### 🔸 4. **FeatureUnion for Review Column**

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

feature_union = FeatureUnion([
    ('word_count', word_count_transformer),          # Custom feature: number of words
    ('bag_of_words', CountVectorizer())              # Standard text vectorizer: bag-of-words
])
```

Here you're **combining multiple features extracted from the "Review" column**:

* `word_count`: how long the review is (basic signal)
* `bag_of_words`: sparse word-frequency vectors from `CountVectorizer`

⚡ `FeatureUnion` merges them **horizontally**, i.e., `word_count` + `BoW` = one final review representation.

---

### 🔸 5. **ColumnTransformer for Entire Data**

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

column_transformer = ColumnTransformer(
    transformers=[
        ('age_imputer', SimpleImputer(strategy='mean'), ['age']),               # fill missing ages
        ('platform_ohe', OneHotEncoder(), ['Social Media Platform']),           # one-hot encode platform
        ('review_processing', feature_union, 'Review')                          # apply union to reviews
    ],
    remainder='drop'
)
```

This handles **different types of data** in parallel:

* Imputes missing values in `age`
* One-hot encodes the `Social Media Platform`
* Processes the `Review` using the custom feature union

---

### 🔸 6. **Final Pipeline: Preprocessing + Model**

```python
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

final_pipeline = Pipeline(steps=[
    ('col_transformer', column_transformer),                # Step 1: preprocess all columns
    ('scaler', MaxAbsScaler()),                             # Step 2: scale values (especially sparse ones)
    ('selector', SelectKBest(score_func=chi2, k=10)),       # Step 3: select top 10 features using chi-square
    ('classifier', LogisticRegression())                    # Step 4: train a classifier
])
```

💡 This is a **modular ML pipeline**:

1. **Preprocessing:** Clean and transform all columns
2. **Scaling:** Divide each feature by its maximum absolute value (range \[-1, 1]; \[0, 1] here because every feature is non-negative), which keeps sparse matrices sparse
3. **Feature Selection:** Keep only the top 10 best features
4. **Model Training:** Train a logistic regression model

---

### 🔸 7. **Training the Model**

```python
final_pipeline.fit(df.drop(columns=['Sentiment']), df['Sentiment'])
```

You're training the full pipeline on:

* Features = all columns **except** the target `Sentiment`
* Target = `Sentiment` (0 or 1)

---

### 📌 Summary: What is this doing?

You're building a clean, production-ready ML pipeline that:

* Handles missing values and text features
* Extracts both manual and automatic features from reviews
* Combines numerical, categorical, and text-based features
* Performs feature selection
* Trains a classifier (Logistic Regression)

---

### ✅ Example Use Case

Suppose you work at a startup analyzing **social media reviews**, and you want to **automatically detect positive or negative sentiment**. This pipeline allows you to train a robust model with just a few lines of code.

Would you like a **visual diagram** or **prediction example** from this code as well?


Here's a clear breakdown of the **model structure (pipeline architecture)** for the code above — like a *blueprint* of how the data flows and transforms through different layers before prediction.

---

### 🔧 **Model Structure: End-to-End Pipeline**

```text
Input:
    ├── "Social Media Platform" (Categorical)
    ├── "Review" (Text)
    └── "age" (Numeric, with missing values)

Pipeline:
┌────────────────────────────────────────────────────────────────────┐
│                         final_pipeline                            │
│                                                                    │
│ 1. col_transformer: ColumnTransformer                              │
│                                                                    │
│    ├── 'age_imputer' → SimpleImputer(strategy='mean')             │
│    │       Input: ['age']                                          │
│    │       Output: [filled age]                                    │
│    │                                                               │
│    ├── 'platform_ohe' → OneHotEncoder()                           │
│    │       Input: ['Social Media Platform']                        │
│    │       Output: [Twitter, Facebook, Instagram] (One-hot)        │
│    │                                                               │
│    └── 'review_processing' → FeatureUnion                          │
│            Input: ['Review']                                       │
│            Output:                                                │
│                ├─ 'word_count' → FunctionTransformer(count_words) │
│                │      Output: single integer feature               │
│                └─ 'bag_of_words' → CountVectorizer()              │
│                       Output: sparse matrix of word counts         │
│                                                                    │
│    ───────── Output: Combined features (Numerical + OHE + Text) ───│
│                                                                    │
│ 2. scaler: MaxAbsScaler()                                          │
│       Normalizes all features into range [0,1]                     │
│                                                                    │
│ 3. selector: SelectKBest(chi2, k=10)                               │
│       Keeps top 10 statistically relevant features                 │
│                                                                    │
│ 4. classifier: LogisticRegression()                                │
│       Learns to predict Sentiment (0 = Negative, 1 = Positive)     │
└────────────────────────────────────────────────────────────────────┘

Output:
    └── Predicted Sentiment (0 or 1)
```

---

### 📌 Dimensions at Each Step (Assuming 10 Records)

| Stage                      | Output Shape (Approx.) | Description                    |
| -------------------------- | ---------------------- | ------------------------------ |
| Age Imputer                | (10, 1)                | One value per row              |
| Platform OHE               | (10, 3)                | 3 platforms = 3 binary columns |
| Word Count                 | (10, 1)                | One feature per review         |
| CountVectorizer            | (10, N)                | N = number of unique words     |
| Feature Union Output       | (10, 1 + N)            | Word count + Bag of Words      |
| Column Transformer Output  | (10, 1 + 3 + 1 + N)    | All combined features          |
| After Scaling              | (10, ...)              | Same shape, normalized         |
| After SelectKBest(k=10)    | (10, 10)               | Top 10 selected features       |
| Logistic Regression Output | (10,)                  | Final prediction: 0 or 1       |

---

### 🧠 What Does the Model Learn?

* Logistic Regression learns to assign **weights** to:

  * Age (numeric)
  * Platform type (categorical)
  * Review features (word count + keywords)
* It tries to find patterns that **best separate positive vs negative sentiment**.

---

Would you like me to visualize this pipeline as a flowchart image as well?


### Common Text Preprocessing Techniques

Here are some common text preprocessing techniques and their typical use cases:

1.  **Tokenization:**
    *   **What it is:** Breaking down text into smaller units called tokens (usually words or sub-word units).
    *   **When to apply:** Almost always the first step in text processing. It's essential for converting raw text into a format that can be processed by models.
    *   **Where to apply:** Early in the pipeline, before other text-specific transformations.

2.  **Lowercasing:**
    *   **What it is:** Converting all text to lowercase.
    *   **When to apply:** When the case of a word doesn't carry significant meaning and you want to treat "Hello" and "hello" as the same word. This helps reduce the vocabulary size.
    *   **Where to apply:** After tokenization and before techniques like stemming or lemmatization.

3.  **Removing Punctuation:**
    *   **What it is:** Removing punctuation marks from the text.
    *   **When to apply:** When punctuation doesn't contribute to the meaning of the text for your specific task. This can help reduce noise.
    *   **Where to apply:** After tokenization. Be mindful if punctuation is important for sentiment or other tasks.

4.  **Removing Stop Words:**
    *   **What it is:** Removing common words that don't carry much meaning (e.g., "the", "a", "is", "in").
    *   **When to apply:** When these common words are not important for your analysis (e.g., in text classification where you want to focus on more informative words).
    *   **Where to apply:** After tokenization and lowercasing.

5.  **Stemming:**
    *   **What it is:** Reducing words to their root form by stripping affixes (e.g., "running" and "runs" become "run"). This is a crude process that might not result in actual words, and it generally cannot map irregular forms such as "ran" to "run".
    *   **When to apply:** When you want to group words with similar meanings based on their root, and a less linguistically accurate approach is acceptable. It's faster than lemmatization.
    *   **Where to apply:** After tokenization, lowercasing, and stop word removal.

6.  **Lemmatization:**
    *   **What it is:** Reducing words to their base or dictionary form (e.g., "running", "runs", "ran" become "run"; "better" becomes "good"). This is a more linguistically informed process than stemming.
    *   **When to apply:** When you need a more accurate reduction of words to their base form and can afford the computational cost.
    *   **Where to apply:** After tokenization, lowercasing, and stop word removal. Requires a lexicon or dictionary.

7.  **Bag-of-Words (BoW):**
    *   **What it is:** Representing text as a vector of word counts.
    *   **When to apply:** A simple and effective method for many text classification tasks, especially when the order of words is not critical.
    *   **Where to apply:** After tokenization and other cleaning steps.

8.  **TF-IDF (Term Frequency-Inverse Document Frequency):**
    *   **What it is:** Representing text as a vector where the value for each word reflects its importance in a document relative to the entire corpus.
    *   **When to apply:** When you want to give more weight to words that are unique to a document and less weight to common words. Often performs better than BoW for tasks like document retrieval and text classification.
    *   **Where to apply:** After tokenization and other cleaning steps.

9.  **Word Embeddings (e.g., Word2Vec, GloVe, FastText):**
    *   **What it is:** Representing words as dense vectors in a continuous vector space, where words with similar meanings are closer together.
    *   **When to apply:** When you need to capture semantic relationships between words or when using deep learning models for text tasks.
    *   **Where to apply:** Typically applied to the tokenized text, often as an input layer to neural networks.

10. **Sequence Models (e.g., RNNs, LSTMs, Transformers):**
    *   **What it is:** Models that process text as a sequence, taking into account the order of words.
    *   **When to apply:** For tasks where word order is crucial, such as machine translation, text generation, and named entity recognition.
    *   **Where to apply:** These are typically the core of the model architecture itself, operating on tokenized and sometimes embedded sequences.

The choice of which techniques to apply depends heavily on the specific task, the nature of the text data, and the model you plan to use. It's often an iterative process of experimentation to find the best combination.