# Box-Cox Transform

 The Box-Cox Transformation is appropriate for non-negative data that does not contain zero values
 It assumes that the data follows normal distribution and applies a power transformation to acheicve normally.
The Box-Cox Transformation is a more rigid transformation compared to the Yeo-Johson Transformation 
 because it requires the data to be strictly positive and cannot handle Zero Values.

In [58]:
import numpy as np 
from sklearn.preprocessing import PowerTransformer
#Generate some random data with a skewd Distribution
data = np.random.gamma(1,2,size = (100,1))
# Instantiate a PowerTransformer object 
pt = PowerTransformer(method = "box-cox")

transformed_data = pt.fit_transform(data)
# Print the original and transformed data to compare
print('Original Data : \n',data[:5])
print('Transformed Data : \n',transformed_data[:5])

Original Data : 
 [[1.37664249]
 [1.15412878]
 [0.13596421]
 [2.78083356]
 [2.6312403 ]]
Transformed Data : 
 [[ 0.033553  ]
 [-0.13293155]
 [-1.52688472]
 [ 0.80594916]
 [ 0.73831291]]


# Yeo-Johnson

 The Yeo-Johnson Transformation is a more flexible transformation that can be applied to both positive and negative data, including zero values.
It also assumes that the data follows a normal distribution, but it applies a slightly different power transformation than the Box-Cox Transformation .
The Yeo-Jhonson transformation is a more robust transformation that can handle a wider range of data types and distributions to the Box-Cox transformation

In [59]:
import numpy as np 
from sklearn.preprocessing import PowerTransformer
#Generate some random data with a skewd Distribution
data = np.random.gamma(1,2,size = (100,1))
# Instantiate a PowerTransformer object
pt = PowerTransformer(method = "yeo-johnson")

transformed_data = pt.fit_transform(data)
# Print the original and transformed data to compare
print('Original Data : \n',data[:5])
print('Transformed Data : \n',transformed_data[:5])

Original Data : 
 [[0.07838973]
 [2.17957103]
 [3.53684205]
 [4.94369986]
 [0.04175615]]
Transformed Data : 
 [[-1.65625582]
 [ 0.22285031]
 [ 0.77543732]
 [ 1.17555474]
 [-1.72162463]]


In [60]:
import numpy as np 
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#create a synthetic dataset

X= np.random.normal(loc = 100, scale = 10,size = (1000,5))
y= np.random.normal(loc = 100, scale = 10,size = 1000)

# Split the data into training and testing sets
X_train , X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2,random_state = 42)

# Fit and transform using Box-Cox Method
boxcox_transformer = PowerTransformer(method = 'box-cox', standardize = True)
X_train_bc = boxcox_transformer.fit_transform(X_train)
X_test_bc = boxcox_transformer.transform(X_test)


# Fit and transform using Yeo-johnson Method
yeojohnson_transformer = PowerTransformer(method = 'yeo-johnson', standardize = True)
X_train_yj = yeojohnson_transformer.fit_transform(X_train)
X_test_yj = yeojohnson_transformer.transform(X_test)

# Pipeline

In [61]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.datasets import fetch_california_housing

# Load the california housing dataset
housing = fetch_california_housing(as_frame = True)

# Define the numeric features and categorical features
numeric_features = housing.feature_names[:2]
categorical_features = housing.feature_names[2:]

# Define the Preprocessing pipelines for the numeric features and the categorical features
numeric_transformer = Pipeline(steps = [('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

#Use ColumnTransformer to combine the numeric and categorical
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Define the pipeline with the preprocessor and the lineqrRegression model
pipeline = Pipeline(steps = [('preprocessor',preprocessor),
                            ('regressor',LinearRegression())])

X = pd.DataFrame(housing.data, columns = housing.feature_names)
y = pd.Series(housing.target)

#Fit pipeline to the data
pipeline.fit(X,y)

#Predict on new data
X_new = X.iloc[:10]
y_pred = pipeline.predict(X_new)
print(y_pred)

[4.52640537 3.58494271 3.5207454  3.41311859 3.42220728 2.69723314
 2.99203473 2.41406864 2.26712729 2.61132465]


In [62]:
import numpy as np 

df = pd.read_csv("C:/Users/HP/Downloads/covid_toy.csv")

df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [63]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
    
    
X_train , X_test, y_train ,y_test = train_test_split(df.drop('has_covid',axis = 1),df["has_covid"],test_size = 0.2, random_state = 42)

#Define the columns that need to be preprocessed
categorical_features  = ['gender','city']
numeric_features = ['age','fever']

#create transformers
numeric_transformer = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy = 'mean')),
    ('scaler',StandardScaler())
])
categorical_transformer = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy = 'most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown = 'ignore'))
])

preprocessor = ColumnTransformer(
    transformers =[
    ('num',numeric_transformer, numeric_features),
    ('cat',categorical_transformer, categorical_features)
])
clf = Pipeline(steps = [('preprocessor',preprocessor),
                       ('classifier',LogisticRegression())])
# train the model

clf.fit(X_train , y_train)
    
y_pred = clf.predict(X_test)
print(y_pred)

['No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes'
 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No']


In [64]:

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score , classification_report
from sklearn.model_selection import train_test_split 
    
df= pd.read_csv("C:/Users/HP/Downloads/Social_Network_Ads.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [65]:
df = df.drop(columns = ['User ID', 'Gender'])

x = df.drop(columns = ['Purchased'],axis = 1)
y = df['Purchased']


In [66]:
x_train , x_test, y_train ,y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components = 2)),
    ('classifier',RandomForestClassifier(n_estimators = 100,random_state = 42))
])

In [67]:
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [68]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [69]:
y_pred = pipe.predict(x_test)

In [70]:
acc = accuracy_score(y_test,y_pred)
print(acc)


0.8875
