In [1]:
from functools import partial

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

In [2]:
def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    return pd.read_csv(file_path)

In [3]:
def clean_data(df, lower_q=0.025, upper_q=0.975):
    """Drop rows with missing values, then drop rows holding outliers.

    A row is kept only if, for every ``float64``/``int64`` column, its value
    lies within the ``[lower_q, upper_q]`` quantile range of that column.

    All quantile bounds are computed once, on the frame as it is right after
    ``dropna()``, and applied as a single combined mask. Filtering column by
    column instead would compute each later column's quantiles on data that
    earlier columns had already trimmed, making the result depend on column
    order and cutting more than the requested tails.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    lower_q : float, default 0.025
        Lower quantile bound (inclusive), between 0 and 1.
    upper_q : float, default 0.975
        Upper quantile bound (inclusive), between ``lower_q`` and 1.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels and all columns.

    Raises
    ------
    ValueError
        If the quantile bounds are not ordered within ``[0, 1]``.
    """
    if not 0.0 <= lower_q <= upper_q <= 1.0:
        raise ValueError(
            "quantile bounds must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q!r}, upper_q={upper_q!r}"
        )

    # Remove rows with missing values
    df = df.dropna()

    numeric = df.select_dtypes(include=['float64', 'int64'])
    if numeric.empty:
        # No numeric columns (or no rows left): there is nothing to trim.
        return df

    # Per-column bounds, all taken from the same (un-trimmed) frame.
    lower_bounds = numeric.quantile(lower_q)
    upper_bounds = numeric.quantile(upper_q)

    # The bounds are Series indexed by column name, so compare along columns.
    within_bounds = numeric.ge(lower_bounds, axis=1) & numeric.le(upper_bounds, axis=1)
    return df[within_bounds.all(axis=1)]

In [4]:
def analyze_data(df):
    """Print summary statistics and the correlation matrix of ``df``.

    The correlation matrix is computed over numeric and boolean columns only.
    Calling ``DataFrame.corr()`` on a frame that still contains string columns
    raises under pandas 2.x, where ``numeric_only`` defaults to ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.

    Returns
    -------
    None
        Results are written to standard output.
    """
    print("Summary Statistics:")
    print(df.describe())

    print("\nCorrelation Matrix:")
    # Select the numeric columns explicitly so this works on any pandas version.
    print(df.select_dtypes(include=['number', 'bool']).corr())

In [8]:
def visualize_data(df):
    """Show a scatter plot, per-column histograms and a correlation heatmap.

    Three figures are rendered in order:

    1. A scatter plot of the first column (x) against the second column (y).
    2. Histograms (30 bins) of the columns ``DataFrame.hist`` can plot.
    3. A heatmap of the correlation matrix of the numeric/boolean columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Must have at least two columns for the scatter plot.

    Returns
    -------
    None
        Figures are displayed via ``plt.show()``.
    """
    # Scatter plot of the first two columns
    x_col, y_col = df.columns[0], df.columns[1]
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
    ax.set_title('Scatter Plot')
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    plt.show()

    # Histograms. DataFrame.hist creates its own figure, so no plt.figure()
    # call precedes it; an extra one would only render as an empty figure.
    df.hist(bins=30, figsize=(10, 6))
    plt.suptitle('Histograms')
    plt.show()

    # Heatmap. Correlate numeric/bool columns only, since DataFrame.corr()
    # raises on string columns under pandas 2.x.
    fig, ax = plt.subplots(figsize=(10, 6))
    correlations = df.select_dtypes(include=['number', 'bool']).corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Heatmap')
    plt.show()

In [9]:
def main(file_path):
    """Run the whole workflow on one CSV file: load, clean, analyze, visualize.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, handed to ``load_data``.
    """
    raw = load_data(file_path)
    cleaned = clean_data(raw)
    # Both reporting steps consume the same cleaned frame.
    for report_step in (analyze_data, visualize_data):
        report_step(cleaned)
    

In [10]:
# Chain the processing steps into a scikit-learn Pipeline.
#
# Pipeline steps must implement fit/transform, so each plain function is
# wrapped in a FunctionTransformer. analyze_data and visualize_data only
# print/plot and return None, so they are routed through a pass-through
# helper that hands the frame on to the next step.
#
# Intended usage: pipe.fit_transform("path/to/file.csv")
def _run_and_pass_through(step, df):
    """Call a side-effect-only ``step`` on ``df`` and return ``df`` unchanged."""
    step(df)
    return df


pipe = make_pipeline(
    FunctionTransformer(load_data),
    FunctionTransformer(clean_data),
    FunctionTransformer(partial(_run_and_pass_through, analyze_data)),
    FunctionTransformer(partial(_run_and_pass_through, visualize_data)),
)

In [12]:
import pickle

In [18]:
# Persist the pipeline. The context manager closes (and flushes) the file
# even if pickling raises.
# Note: pickle stores functions by qualified name, not by value, so whoever
# loads 'pipe.pkl' must have the same step functions importable.
with open('pipe.pkl', 'wb') as pkl_file:
    pickle.dump(pipe, pkl_file)