In [1]:
# -*- coding: utf-8 -*-
"""
Created on Wed May 22 22:18:32 2024

@author: mrsag
"""

import numpy as np
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import glob
from Curve_fitting_with_scipy import Gaussianfitting as Gf
from Curve_fitting_with_scipy import Linefitting as Lf
from scipy.signal import fftconvolve
from collections import defaultdict
import joblib

import matplotlib as mpl

mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = 'Times New Roman'
mpl.rcParams['font.size'] = 12
mpl.rcParams['font.weight'] = 'bold'
#mpl.rcParams['font.style'] = 'italic'  # Set this to 'italic'
mpl.rcParams['figure.dpi'] = 120  # highres display

In [2]:
def find_index(array, value):
    # Calculate the absolute differences between each element and the target value
    absolute_diff = np.abs(array - value)
    
    # Find the index of the minimum absolute difference
    index = np.argmin(absolute_diff)
    
    return index


def moving_average(signal, window_size):
    # Define the window coefficients for the moving average
    window = np.ones(window_size) / float(window_size)
    
    # Apply the moving average filter using fftconvolve
    filtered_signal = fftconvolve(signal, window, mode='same')
    
    return filtered_signal


def hist_dataframe(df, bins=10):
    # Define a list of colors for each histogram
    colors = ['red', 'green', 'blue', 'magenta', 'cyan', 'purple', 'orange', 'black']
    # Create subplots with a dynamic number of rows, 3 columns per row
    fig, axes = plt.subplots(nrows=int(np.ceil(len(df.columns) / 3)), ncols=3, figsize=(18, 4.5*int(np.ceil(len(df.columns) / 3))))
    # Flatten the axes array for easy iteration (even if it's a 2D grid)
    axes = axes.flatten()
    
    # Plot each histogram individually
    for i, column in enumerate(df.columns):
        df[column].plot(kind='hist', ax=axes[i], color=colors[i%len(colors)], title=column, bins = bins)
        axes[i].grid(True, linewidth=0.5, color='k')  # Optional: add grid
    
    # Turn off any unused subplots (in case the number of columns is not a multiple of 3)
    for j in range(i+1, len(axes)):
        fig.delaxes(axes[j])  # Delete empty subplots
    
    plt.tight_layout()  # Adjust the layout
    plt.show()
    pass
    

def binned_mode(data, num_bins):     
    """ use this function to replace the missing value with the most probable value in a dataset...
        There are inbuilt functions for mean and median"""
    # Calculate the range of the data
    data_min, data_max = min(data), max(data)
    
    # Calculate the bin edges
    bins = np.linspace(data_min, data_max, num_bins + 1)
    
    # Group data into bins
    binned_data = defaultdict(list)
    for num in data:
        # Find the correct bin index for each number
        bin_index = np.digitize(num, bins) - 1  # subtract 1 to get 0-based index
        bin_index = min(bin_index, num_bins - 1)  # ensure last bin is included
        binned_data[bin_index].append(num)
    
    # Find the bin with the highest frequency
    most_frequent_bin = max(binned_data, key=lambda k: len(binned_data[k]))
    
    # Calculate the average of the values in the most frequent bin
    mode_value = np.mean(binned_data[most_frequent_bin])
    
    return mode_value


def plot_hollow_pillar_histogram(data, bins=30, edgecolor='black', linewidth=1.5):   #, xlabel='Value', ylabel='Frequency', title='Histogram with Hollow Pillar Bars'):
    """
    Plots a histogram with hollow pillar bars.

    Parameters:
    - data: Array of data to be plotted.
    - bins: Number of bins or bin edges (default is 30).
    - edgecolor: Color of the bar edges (default is 'black').
    - linewidth: Thickness of the bar edges (default is 1.5).
    - xlabel: Label for the x-axis (default is 'Value').
    - ylabel: Label for the y-axis (default is 'Frequency').
    - title: Title for the plot (default is 'Histogram with Hollow Pillar Bars').
    """
    # Create the histogram without plotting it (retrieve the counts and bin edges)
    counts, bin_edges = np.histogram(data, bins=bins)
    # Width of each bar
    bin_width = bin_edges[1] - bin_edges[0]
    # Create the plot
    for i in range(bins):
        plt.hist(bin_edges[i]*np.ones(counts[i]),bins=1, edgecolor='black', linewidth=0.5, rwidth=(max(data)-min(data))/bins)
        
    # Set limits for x and y axis
    # ax.set_xlim(bin_edges[0], bin_edges[-1])
    # ax.set_ylim(0, max(counts) * 1.1)
    # Add labels and title
    # plt.xlabel(xlabel)
    # plt.ylabel(ylabel)
    # plt.title(title)
    pass


def get_numerical_categorical_boolean_columns(data):
    # Separate categorical and numerical columns
    categorical_columns = []
    numerical_columns = []
    boolean_columns = []
    
    for name in np.array(data.columns):
        i=0
        while(data[name][i] is None):
            i += 1

        if(type(data[name][i]) is str):
            categorical_columns.append(name)
        elif((type(data[name][i]) is float) or (type(data[name][i]) is int) or (type(data[name][i]) is bin) or (type(data[name][i]) is np.int64) 
            or (type(data[name][i]) is np.int32) or (type(data[name][i]) is np.int16) or (type(data[name][i]) is np.int8) or 
            (type(data[name][i]) is np.float16) or (type(data[name][i]) is np.float32) or (type(data[name][i]) is np.float64)):
            
            numerical_columns.append(name)
        elif((type(data[name][i]) is bool)):
             boolean_columns.append(name)
        else:
            pass
    return numerical_columns, categorical_columns, boolean_columns

In [3]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
data = adult.frame
x = adult.data.features 
y = adult.data.targets 
data = pd.concat([x,y],axis=1)
del x,y
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
print(set(data["income"]))
# Dictionary to map equivalent gender values
mapping = {
    '<=50K.': '<=50K',
    '<=50K': '<=50K',
    '>50K.': '>50K',
    '>50K.': '>50K'
}

# Replace the values in the 'gender' column
data['income'] = data['income'].replace(mapping)
print(set(data["income"]))

{'>50K', '<=50K.', '>50K.', '<=50K'}
{'>50K', '<=50K'}


In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [6]:
print(type(data["hours-per-week"][0]))

<class 'numpy.int64'>


In [7]:
numerical_columns,categorical_columns,boolean_columns=get_numerical_categorical_boolean_columns(data)

print(f"categorical_columns: {categorical_columns}")
print(f"numerical_columns: {numerical_columns}")
print(f"boolean_columns: {boolean_columns}")

categorical_columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
numerical_columns: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
boolean_columns: []


In [8]:
# # Numerical: Use median for missing values
# numerical_imputer_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with the mean
#     ('scaler', StandardScaler())  # Optionally, scale the numerical data
# ])

# # Categorical: Use most frequent (mode) or a constant value
# categorical_imputer_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the mode
#     # ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Convert categorical to one-hot encoding
# ])

# boolean_imputer_transformer = Pipeline(steps=[
#     ('imputer',SimpleImputer(strategy='median')) # Impute missing values with the median
# ])

# # Step 2: Use ColumnTransformer to apply transformations to different columns
# imputer_preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_imputer_transformer, numerical_columns),
#         ('cat', categorical_imputer_transformer, categorical_columns),
#         ('bool', boolean_imputer_transformer, boolean_columns)
#     ])
# # Step 3: Build and apply the pipeline

# # imputer_pipeline = Pipeline(steps=[
# #     ('imputer preprocessor', imputer_preprocessor)
# #     # ('scalar',StandardScaler())
# #     ])


# # Define the ColumnTransformer to encode the categorical features
# labelencoder_preprocessor = ColumnTransformer(
#     transformers=[
#     ('cat', OrdinalEncoder(), categorical_columns)
#     ])

# # # Step 2: Create the pipeline with the preprocessor
# # labelencoder_pipeline = Pipeline(steps=[('label_encoder_preprocessor', labelecnoder_preprocessor)])

# final_pipeline = Pipeline(steps=[
#     ('imputer_preprocessor',imputer_preprocessor),
#     ('label_encoder_preprocessor',labelencoder_preprocessor)
#     ])

# # Fit the pipeline and transform the data
# transformed_data = final_pipeline.fit_transform(data)

In [9]:
def total_imputer(data):
    imputer_numeric = SimpleImputer(strategy='median')
    imputer_categoric = SimpleImputer(strategy='most_frequent')
    imputer_boolean = SimpleImputer(strategy='most_frequent')
    
    if(len(numerical_columns) != 0):
        data[numerical_columns] = imputer_numeric.fit_transform(data[numerical_columns])
    if(len(categorical_columns) != 0):
        data[categorical_columns] = imputer_categoric.fit_transform(data[categorical_columns])
    if(len(boolean_columns) != 0):
        data[boolean_columns] = imputer_boolean.fit_transform(data[boolean_columns])
    return data

In [10]:
def categorical_to_numerical(data):
    le = LabelEncoder()
    for name in categorical_columns:
        data[name]=le.fit_transform(data[name])
    return data

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Custom imputer for different data types
class TotalImputer(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns, categorical_columns, boolean_columns):
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.boolean_columns = boolean_columns
        self.imputer_numeric = SimpleImputer(strategy='median')
        self.imputer_categoric = SimpleImputer(strategy='most_frequent')
        self.imputer_boolean = SimpleImputer(strategy='most_frequent')

    def fit(self, X, y=None):
        if len(self.numerical_columns) != 0:
            self.imputer_numeric.fit(X[self.numerical_columns])
        if len(self.categorical_columns) != 0:
            self.imputer_categoric.fit(X[self.categorical_columns])
        if len(self.boolean_columns) != 0:
            self.imputer_boolean.fit(X[self.boolean_columns])
        return self

    def transform(self, X):
        if len(self.numerical_columns) != 0:
            X[self.numerical_columns] = self.imputer_numeric.transform(X[self.numerical_columns])
        if len(self.categorical_columns) != 0:
            X[self.categorical_columns] = self.imputer_categoric.transform(X[self.categorical_columns])
        if len(self.boolean_columns) != 0:
            X[self.boolean_columns] = self.imputer_boolean.transform(X[self.boolean_columns])
        return X



class CategoricalToNumerical(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_columns):
        self.categorical_columns = categorical_columns
        self.label_encoders = {col: LabelEncoder() for col in categorical_columns}

    def fit(self, X, y=None):
        for col in self.categorical_columns:
            self.label_encoders[col].fit(X[col])
        return self

    def transform(self, X):
        for col in self.categorical_columns:
            X[col] = self.label_encoders[col].transform(X[col])
        return X

class NumericalStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns):
        self.numerical_columns = numerical_columns
        self.scalers = {col: StandardScaler() for col in numerical_columns}

    def fit(self, X, y=None):
        # Fit the StandardScaler for each numerical column
        for col in self.numerical_columns:
            self.scalers[col].fit(X[[col]])  # X[[col]] is used to maintain column shape
        return self

    def transform(self, X):
        X_copy = X.copy()  # Avoid modifying the original DataFrame
        # Transform each numerical column using the fitted scaler
        for col in self.numerical_columns:
            X_copy[col] = self.scalers[col].transform(X[[col]])
        return X_copy


In [12]:
# Create the pipeline
pipeline = Pipeline([
    ('imputer', TotalImputer(numerical_columns=numerical_columns,
                             categorical_columns=categorical_columns,
                             boolean_columns=boolean_columns)),
    ('categorical_to_numerical', CategoricalToNumerical(categorical_columns=categorical_columns)),
    ('scaler', NumericalStandardScaler(numerical_columns=numerical_columns))  # Optionally scale numerical features if necessary
])

data_final = pipeline.fit_transform(data)

In [13]:
data.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,3.949306,189664.1,10.28842,10.078089,2.61875,6.77548,1.443287,3.668052,0.668482,1079.067626,87.502314,40.422382,36.968142,0.239282
std,13.71051,1.35747,105604.0,3.874492,2.570973,1.507703,4.151386,1.602151,0.845986,0.470764,7452.019058,403.004552,12.391444,7.270493,0.426649
min,17.0,0.0,12285.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,28.0,4.0,117550.5,9.0,9.0,2.0,3.0,0.0,4.0,0.0,0.0,0.0,40.0,39.0,0.0
50%,37.0,4.0,178144.5,11.0,10.0,2.0,7.0,1.0,4.0,1.0,0.0,0.0,40.0,39.0,0.0
75%,48.0,4.0,237642.0,12.0,12.0,4.0,10.0,3.0,4.0,1.0,0.0,0.0,45.0,39.0,0.0
max,90.0,8.0,1490400.0,15.0,16.0,6.0,14.0,5.0,4.0,1.0,99999.0,4356.0,99.0,41.0,1.0
