# DTSA 5504: Data Mining Pipeline

## Course Overview and Quick Reference Guide

This notebook serves as a comprehensive overview and quick reference guide for the key concepts, techniques, and implementations covered in this course.

### Course Objectives
- Understanding data mining pipeline components
- Implementing data preprocessing techniques
- Building efficient data mining workflows
- Evaluating and optimizing pipelines

In [None]:
# Import common libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest

# Display settings
%matplotlib inline
plt.style.use('seaborn')
pd.set_option('display.max_columns', None)

## Week 1: Introduction to Data Mining Pipeline

### Key Concepts
- 

### Important Components
- 

### Code Examples

In [None]:
def create_basic_pipeline(k=10, impute_strategy='mean'):
    """Create a basic data mining pipeline.

    The pipeline chains three steps, in this order: ``'imputer'``
    (``SimpleImputer``), ``'scaler'`` (``StandardScaler``) and
    ``'feature_selection'`` (``SelectKBest``). The step names matter: they are
    the prefixes used when tuning parameters (e.g. ``'feature_selection__k'``).

    Args:
        k: Number of features kept by the ``SelectKBest`` step. Defaults to
            10, the value that was previously hard-coded.
        impute_strategy: ``strategy`` passed to ``SimpleImputer``. Defaults to
            ``'mean'``, the value that was previously hard-coded.

    Returns:
        An unfitted ``Pipeline``. It has no final estimator, so it can
        transform data but cannot predict.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy=impute_strategy)),
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(k=k)),
    ])

## Week 2: Data Collection and Integration

### Key Concepts
- 

### Important Methods
- 

### Code Examples

In [None]:
def load_and_merge_data(file_paths, join_keys):
    """Load several CSV files and left-merge them into a single DataFrame.

    The first file is the base table. Every following file is merged onto the
    running result with ``how='left'``, so rows of the base table are kept
    even when a later file has no match for them.

    Args:
        file_paths: Iterable of paths readable by ``pd.read_csv``.
        join_keys: Sequence of merge keys. ``join_keys[i]`` is passed as
            ``on`` when merging file ``i + 1`` into the result, so at least
            ``len(file_paths) - 1`` entries are required. Extra entries are
            ignored.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: If ``file_paths`` is empty, or ``join_keys`` has fewer
            entries than there are merges to perform.
    """
    paths = list(file_paths)
    if not paths:
        raise ValueError("file_paths must contain at least one path")

    # Check the arguments before touching the disk so a bad call fails fast
    # instead of after every file has already been parsed.
    merges_needed = len(paths) - 1
    if len(join_keys) < merges_needed:
        raise ValueError(
            f"join_keys needs at least {merges_needed} entries "
            f"(one per merge), got {len(join_keys)}"
        )

    base_path, *other_paths = paths
    merged_df = pd.read_csv(base_path)

    # Read each additional file only when it is needed, so at most one extra
    # frame is held in memory besides the running result.
    for path, key in zip(other_paths, join_keys):
        merged_df = pd.merge(merged_df, pd.read_csv(path), on=key, how='left')

    return merged_df

## Week 3: Data Preprocessing

### Key Concepts
- 

### Important Techniques
- 

### Code Examples

In [None]:
def preprocess_data(df):
    """Build a preprocessing ``ColumnTransformer`` tailored to ``df``'s columns.

    Numeric columns are mean-imputed and standardized; text columns are
    imputed with the constant ``'missing'`` and one-hot encoded with the first
    level dropped. Nothing is fitted here: ``df`` is only inspected to decide
    which columns go to which branch.

    Args:
        df: DataFrame whose dtypes decide the column assignment. Only
            ``int64``/``float64`` columns are treated as numeric and only
            ``object`` columns as categorical; columns of any other dtype are
            not listed in either branch.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``'num'`` and a ``'cat'``
        branch. Call ``fit``/``fit_transform`` on it to apply the steps.
    """
    # Imported here because the notebook's import cell does not bring
    # OneHotEncoder into scope.
    from sklearn.preprocessing import OneHotEncoder

    # Split the columns by dtype
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later releases
    # removed the old keyword; fall back to the old name on older installs.
    try:
        onehot = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(drop='first', sparse=False)

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', onehot)
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    return preprocessor

## Week 4: Feature Engineering

### Key Concepts
- 

### Important Methods
- 

### Code Examples

In [None]:
class FeatureEngineer:
    """Feature engineering utilities for pandas DataFrames.

    ``create_date_features`` and ``create_interaction_features`` add their new
    columns to the DataFrame they are given (in place) and return that same
    object; ``create_aggregate_features`` returns a new, merged DataFrame.
    """

    def __init__(self):
        # Not used by the methods below; kept available for callers.
        self.scaler = StandardScaler()

    def create_date_features(self, df, date_column):
        """Extract calendar parts from a date column.

        Args:
            df: DataFrame to extend. It is modified in place.
            date_column: Name of the column to parse with ``pd.to_datetime``.
                The column itself is overwritten with the parsed values.

        Returns:
            ``df`` with ``<date_column>_year``, ``_month``, ``_day`` and
            ``_dayofweek`` columns added.
        """
        df[date_column] = pd.to_datetime(df[date_column])
        parts = df[date_column].dt
        df[f'{date_column}_year'] = parts.year
        df[f'{date_column}_month'] = parts.month
        df[f'{date_column}_day'] = parts.day
        df[f'{date_column}_dayofweek'] = parts.dayofweek
        return df

    def create_interaction_features(self, df, feature1, feature2, operation='multiply'):
        """Combine two columns into one interaction feature.

        Args:
            df: DataFrame to extend. It is modified in place.
            feature1: Name of the first column (the numerator for
                ``'divide'``).
            feature2: Name of the second column (the denominator for
                ``'divide'``).
            operation: ``'multiply'`` adds
                ``<feature1>_<feature2>_interaction``; ``'divide'`` adds
                ``<feature1>_<feature2>_ratio``.

        Returns:
            ``df`` with the new column added.

        Raises:
            ValueError: If ``operation`` is neither ``'multiply'`` nor
                ``'divide'``. Such a call previously returned ``df`` unchanged
                without any indication that nothing had been added.
        """
        if operation == 'multiply':
            df[f'{feature1}_{feature2}_interaction'] = df[feature1] * df[feature2]
        elif operation == 'divide':
            df[f'{feature1}_{feature2}_ratio'] = df[feature1] / df[feature2]
        else:
            raise ValueError(
                f"operation must be 'multiply' or 'divide', got {operation!r}"
            )
        return df

    def create_aggregate_features(self, df, group_col, agg_col, aggs=None):
        """Attach per-group aggregates of one column to every row.

        Args:
            df: Source DataFrame. It is not modified.
            group_col: Column (or columns) to group by; also the merge key.
            agg_col: Column whose values are aggregated within each group.
            aggs: Aggregations passed to ``agg``. ``None`` (the default) means
                ``['mean', 'std', 'min', 'max']``.

        Returns:
            A new DataFrame: ``df`` left-merged with one column per
            aggregation, each named after the aggregation.
        """
        # A list literal as the default would be one object shared by every
        # call, so the default is created per call instead.
        if aggs is None:
            aggs = ['mean', 'std', 'min', 'max']
        agg_df = df.groupby(group_col)[agg_col].agg(aggs).reset_index()
        return pd.merge(df, agg_df, on=group_col, how='left')

## Week 5: Feature Selection

### Key Concepts
- 

### Important Methods
- 

### Code Examples

In [None]:
def select_features(X, y, k=10, random_state=None):
    """Run three feature-selection approaches on a classification dataset.

    Args:
        X: Feature DataFrame (``X.columns`` is used to label importances).
        y: Class labels aligned with the rows of ``X``.
        k: Number of features to keep for the statistical and RFE selections.
            Defaults to 10, the value that was previously hard-coded. It is
            capped at the number of columns in ``X`` so narrow datasets no
            longer fail.
        random_state: Seed forwarded to both random forests. The default
            ``None`` keeps the previous unseeded behaviour; pass an int for
            reproducible results.

    Returns:
        A dict with:
            ``'statistical'``: ``X`` reduced by ``SelectKBest`` with the
                ANOVA F-test (``f_classif``).
            ``'rfe'``: ``X`` reduced by recursive feature elimination around
                a random forest.
            ``'importance'``: DataFrame with ``feature`` and ``importance``
                columns from a random forest fitted on all of ``X``, sorted
                from most to least important.
    """
    from sklearn.feature_selection import SelectKBest, f_classif, RFE
    from sklearn.ensemble import RandomForestClassifier

    # Never ask for more features than the data has.
    n_keep = min(k, X.shape[1])

    # Statistical selection
    k_best = SelectKBest(score_func=f_classif, k=n_keep)
    X_stat = k_best.fit_transform(X, y)

    # Recursive feature elimination
    rfe = RFE(
        estimator=RandomForestClassifier(random_state=random_state),
        n_features_to_select=n_keep,
    )
    X_rfe = rfe.fit_transform(X, y)

    # Feature importance
    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X, y)
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    return {
        'statistical': X_stat,
        'rfe': X_rfe,
        'importance': importance
    }

## Week 6: Pipeline Optimization

### Key Concepts
- 

### Important Techniques
- 

### Code Examples

In [None]:
def optimize_pipeline(pipeline, X, y, param_grid=None, cv=5, scoring='accuracy'):
    """Tune a pipeline's hyperparameters with an exhaustive grid search.

    Args:
        pipeline: Estimator/pipeline to tune. With the default grid it must
            contain steps named ``'imputer'``, ``'feature_selection'`` and
            ``'classifier'``, because the grid addresses parameters through
            those step names.
        X: Training features.
        y: Training labels.
        param_grid: Mapping of ``'<step>__<param>'`` to candidate values.
            ``None`` (the default) uses the built-in grid shown below. Pass
            your own grid for pipelines with different steps.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
            Defaults to 5.
        scoring: Scoring forwarded to ``GridSearchCV``. Defaults to
            ``'accuracy'``.

    Returns:
        A ``(best_estimator, best_params)`` tuple taken from the fitted search.
    """
    from sklearn.model_selection import GridSearchCV

    if param_grid is None:
        param_grid = {
            'imputer__strategy': ['mean', 'median'],
            'feature_selection__k': [5, 10, 15],
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [None, 10, 20]
        }

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        n_jobs=-1,  # use every available core
        scoring=scoring
    )

    grid_search.fit(X, y)
    return grid_search.best_estimator_, grid_search.best_params_

## Week 7: Pipeline Deployment

### Key Concepts
- 

### Important Steps
- 

### Code Examples

In [None]:
def save_pipeline(pipeline, filename):
    """Persist ``pipeline`` to ``filename`` using joblib.

    Args:
        pipeline: Object to serialize.
        filename: Destination handed to ``joblib.dump``.
    """
    from joblib import dump

    dump(pipeline, filename)
    
def load_pipeline(filename):
    """Restore a pipeline previously written with joblib.

    Args:
        filename: Source handed to ``joblib.load``.

    Returns:
        The deserialized object.
    """
    # NOTE: joblib files are pickle-based, so only load files you trust.
    from joblib import load

    restored = load(filename)
    return restored

def create_api_endpoint(pipeline):
    """Wrap ``pipeline`` in a FastAPI application.

    The returned app exposes a single route, ``POST /predict``, whose JSON
    body carries a ``features`` list describing one sample.

    Args:
        pipeline: Object whose ``predict`` method is called for each request.

    Returns:
        The configured ``FastAPI`` application.
    """
    from fastapi import FastAPI
    from pydantic import BaseModel

    class DataInput(BaseModel):
        # Feature values of a single sample.
        features: list

    def predict(data: DataInput):
        # predict() receives a batch, so wrap the one sample in a list.
        batch = [data.features]
        result = pipeline.predict(batch)
        return {"prediction": result.tolist()}

    app = FastAPI()
    # Register the handler explicitly; equivalent to decorating it.
    app.post("/predict")(predict)

    return app

## Week 8: Pipeline Monitoring and Maintenance

### Key Concepts
- 

### Important Metrics
- 

### Code Examples

In [None]:
def monitor_pipeline(pipeline, X, y, monitoring_period='1d'):
    """Take one performance snapshot of a fitted pipeline.

    Args:
        pipeline: Fitted object exposing ``predict``.
        X: Features to predict on.
        y: True labels aligned with ``X``.
        monitoring_period: Currently unused; kept so existing calls keep
            working.

    Returns:
        A one-row DataFrame with the columns ``timestamp``, ``accuracy``,
        ``precision``, ``recall`` (both weighted averages) and ``latency``
        (seconds spent in ``pipeline.predict``).
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score
    import time

    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be distorted by system clock adjustments as time.time() can.
    started = time.perf_counter()
    predictions = pipeline.predict(X)
    latency = time.perf_counter() - started

    return pd.DataFrame({
        'timestamp': [pd.Timestamp.now()],
        'accuracy': [accuracy_score(y, predictions)],
        'precision': [precision_score(y, predictions, average='weighted')],
        'recall': [recall_score(y, predictions, average='weighted')],
        'latency': [latency]
    })

## Additional Resources and References

### Useful Libraries
- Scikit-learn: Machine learning tools
- Pandas: Data manipulation
- FastAPI: API development
- MLflow: Experiment tracking and model management

### External Links
- Course materials
- Pipeline examples
- Best practices

### Personal Notes
- Key components
- Optimization tips
- Maintenance checklist