# Feature Transformation 

## 1. Feature Scaling
### 1.1) Standardization

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
df=pd.read_csv('Social_Network_Ads.csv')
# df.sample(5)
df1=df[['Age', 'EstimatedSalary', 'Purchased']]

# TRAIN-TEST SPLIT 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(
    df1.drop('Purchased', axis=1), #removes 'Purchased' col from df, leaving only the features (X). 
    df1['Purchased'],      # selects the 'Purchased' column as the target variable (y)
    test_size=0.3,         #means 30% of the data is used for testing, and 70% is used for training.
    random_state=0)        #ensures that the split is reproducible
print(x_train.shape, x_test.shape)


# STANDARD SCALER

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()    # standardizes each feature using the mean and standard deviation learned during fit

# Fit the scaler on the training set only, so its parameters are learned from training data alone
scaler.fit(x_train) # computes the mean and SD of each feature in x_train

# Transform both sets with the parameters learned from x_train (fit on train only, transform train and test)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## ACCESSMENT OF SCALING 


#A) Crosschecking whether scalling has done or not (is mean=0 & SD=1 then done)
x_train_scaled=pd.DataFrame(x_train_scaled, columns=x_train.columns) #convert array into DF 
x_test_scaled=pd.DataFrame(x_test_scaled, columns=x_test.columns) 
round(x_train_scaled.describe(), 2)


#B) Now visualising the Distribution 
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
ax1.scatter(x_train['Age'], x_train['EstimatedSalary'])
ax1.set_title('Before Scaling')
ax2.scatter(x_train_scaled['Age'], x_train_scaled['EstimatedSalary'], color='r')
ax2.set_title('After Scaling')
# plt.show()

#C) Creating Density plot: in ord
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.kdeplot(x_train['Age'], ax=ax1)
sns.kdeplot(x_train['EstimatedSalary'], ax=ax1)
ax1.set_title('Before Scaling')
sns.kdeplot(x_train_scaled['Age'], ax=ax2)
sns.kdeplot(x_train_scaled['EstimatedSalary'], ax=ax2)
ax2.set_title('After Scaling')

#D) Model performance comparison before/after scalling 
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr_scaled = LogisticRegression()

lr.fit(x_train,y_train)    #trained with unscaled value 
lr_scaled.fit(x_train_scaled,y_train) # trained with scaled values

y_pred = lr.predict(x_test)
y_pred_scaled = lr_scaled.predict(x_test_scaled)

from sklearn.metrics import accuracy_score
print("Actual", accuracy_score(y_test, y_pred))
print("Scaled", accuracy_score(y_test, y_pred_scaled))

### 1.2) Normalization

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
df=pd.read_csv('wine.csv')
df.sample(5)
df=df[['Wine',	'Alcohol', 	'Malic.acid']].rename(columns={'Wine':'Wine Class'})

# TRAIN-TEST SPLIT 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(
    df.drop('Wine Class', axis=1), #removes 'Wine class' col from df, leaving only the features (X). 
    df['Wine Class'],      # selects the 'Wine Class' column as the target variable (y)
    test_size=0.3,         #means 30% of the data is used for testing, and 70% is used for training.
    random_state=0)        #ensures that the split is reproducible
print(x_train.shape, x_test.shape)


# Normalization: MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()    # rescales each feature using the min and max learned during fit (default output range is [0, 1])

# Fit the scaler on the training set only, so its parameters are learned from training data alone
scaler.fit(x_train) # computes the min and max of each feature in x_train

# Transform both sets with the parameters learned from x_train (fit on train only, transform train and test)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# ASSESSMENT OF SCALING


#A) Cross-check that scaling worked: every training feature should now have min=0 and max=1
x_train_scaled=pd.DataFrame(x_train_scaled, columns=x_train.columns) # convert the NumPy array back into a DataFrame
x_test_scaled=pd.DataFrame(x_test_scaled, columns=x_test.columns) 
round(x_train_scaled.describe(), 2)


#B) Now visualising the Distribution 
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
ax1.scatter(x_train['Alcohol'], x_train['Malic.acid'],c=y_train )
ax1.set_title('Before Scaling')
ax2.scatter(x_train_scaled['Alcohol'], x_train_scaled['Malic.acid'], c=y_train )
ax2.set_title('After Scaling')
# plt.show()

#C) Creating Density plot: in ord
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.kdeplot(x_train['Alcohol'], ax=ax1)
sns.kdeplot(x_train['Malic.acid'], ax=ax1)
ax1.set_title('Before Scaling')
sns.kdeplot(x_train_scaled['Alcohol'], ax=ax2)
sns.kdeplot(x_train_scaled['Malic.acid'], ax=ax2)
ax2.set_title('After Scaling')


## 2) Categorical Data Encoding
### 2.1) Ordinal Encoding 

In [None]:
import pandas as pd 
import numpy as np
df=pd.read_csv("Customer Purchase.csv")
# df.sample(5)
df=df.iloc[:, 3:]
df.sample(5)


#TRAIN TEST SPLIT 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test =train_test_split(
    df.iloc[:, :2], df.iloc[:, -1], 
    test_size=0.3,
    random_state=0)
print(x_train.shape, x_test.shape)


#ENCODING INPUT COLUMNS
from sklearn.preprocessing import OrdinalEncoder
oe=OrdinalEncoder(categories=[['School', 'UG', 'PG'], [ 'Poor','Average', 'Good']]) #telling the order of ordinal data 
#fit the ordinal encoder to the train set
oe.fit(x_train) #fitting meaning assigning the numerical data instead of ordinal 

#Transform the x_train and x_test 
x_train_encoded=oe.transform(x_train)
x_test_encoded=oe.transform(x_test)
# print(x_train_encoded)

#ENCODING OUTPUT COLUMNS
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
#Fiting the label encoder 
le.fit(y_train)
#transform
y_train_labelencoded=le.transform(y_train)
y_test_labelencoded=le.transform(y_test)
# print(y_test_labelencoded)

### 2.2) One Hot Encoding(OHE)

In [None]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load dataset and select relevant columns
df = pd.read_csv('car_details.csv')
df = df[['name', 'km_driven', 'fuel', 'owner', 'selling_price']]
df.sample(5)

# TRAIN TEST SPLIT 
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4], df.iloc[:, -1], test_size=0.2, random_state=0)

#ONEHOTENCODER
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # use 'sparse_output' instead of 'sparse'

# Fit the encoder on the categorical columns of training data
x_train_categorical = ohe.fit_transform(x_train[['fuel', 'owner']])

# Transform the test data's categorical columns
x_test_categorical = ohe.transform(x_test[['fuel', 'owner']])



# # Convert back to DataFrame for easier handling
x_train_categorical = pd.DataFrame(x_train_categorical, columns=ohe.get_feature_names_out(['fuel', 'owner']))
x_train_categorical

# 4. Simultaneous Transformation 
## 4.1) Column transformer 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load dataset and select relevant columns
df=pd.read_csv("dummy_covid_data.csv")
df.sample(5)

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], test_size=0.2, random_state=0)

# COLUMN TRANSFORMER
# Applies a different encoder to each group of columns in a single step.
# Columns not named below are kept unchanged because remainder='passthrough'.
transformer= ColumnTransformer(
    transformers=[
        ('tnf1', OrdinalEncoder(categories=[['mild', 'strong']]), ['cough']),  # ordinal-encode 'cough' with the explicit order mild < strong
        ('fnf2', OneHotEncoder(sparse_output=False, drop='first'),['gender', 'city'])  # one-hot encode 'gender' and 'city', dropping the first category of each
    ]
    , remainder='passthrough'
)
# Fit the transformer into the training dataset
x_train_transform = transformer.fit_transform(x_train)

# transform the test dataset 
x_test_transform=transformer.transform(x_test)
print(x_train_transform.shape, x_test_transform.shape)


## 4.2) Pipeline 

In [None]:
import pandas as pd 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier


#1) DATA Ingestion
df=sns.load_dataset('titanic')
df=df.iloc[:, 0:8]
df.info()
df.isnull().sum()

# #Split Train-Test data split 
x_train, x_test, y_train, y_test=train_test_split(
    df.iloc[:, 1:8],df.iloc[:, 0], test_size=0.2, random_state=0)
x_train
#2) Creating PIPELINE FOR TRANSFORMATION 
#Transformation: 1st define the Individual transformation then execute in sequence through pipeline  
#     1st: Imputation transformation in age & embark ,
#     2nd: OneHotEncoding in Sex and embark 
#     3rd: scalling all the columns 
#     4th: Feature selection 
#     5th: Train the model 

#2.1) IMPUTATION TRANSFORMATION 
trf1=ColumnTransformer([
    ('impute-age', SimpleImputer(), [2]), 
    ('impute-embark', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')

#2.2) ONE HOT ENCODING
# trf1 changes the column order: a ColumnTransformer emits its transformed
# columns first and the passthrough remainder afterwards. The input order
# [pclass, sex, age, sibsp, parch, fare, embarked] therefore leaves trf1 as
# [age, embarked, pclass, sex, sibsp, parch, fare].
# So 'embarked' is at index 1 and 'sex' at index 3 here. The previous [1, 6]
# one-hot encoded 'fare' (index 6) and left 'sex' as a raw string.
# After this step there are 10 columns: 3 embarked + 2 sex + 5 passthrough.
trf2=ColumnTransformer([
    ('ohe-sex-embark', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 3])
], remainder='passthrough')

#2.3) Scaling 
trf3=ColumnTransformer([
    ('age-scaling', MinMaxScaler(), slice(0,10)) #scaling all cols after trf1&2
]) 

#2.4) Feature Selections
trf4=SelectKBest(score_func=chi2, k=8) # Selectinf 8/10 best cols 

#2.5) model 
trf5=DecisionTreeClassifier()

#3)CREATE PIPELINE 
pipe=Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
])

#4) TRAIN the model
pipe.fit(x_train, y_train)

#5) Predict: since model is trained we predict the value 
y_pred= pipe.predict(x_test)
y_pred

#6) CALCULATE ACCURACY 
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)


#7) CROSS VALIDATION using pipeline 
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean()


#8)HYPERTUNING Using GridsearchCV 
params={
    'trf5__max_depth':[1,2,3,4,5, None]
}
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
print("Best score=",grid.best_score_, "Max_depth=", grid.best_params_)



#9)Exporting the PIPELINE
import pickle
# Use a context manager so the file is flushed and closed even if dump() raises.
# This persists `pipe` (the pipeline fitted in step 4), not the grid-search result.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)


#10)CODE IN PRODUCTION 
import pickle 
import numpy as np
# SECURITY: pickle.load can execute arbitrary code; only load files you created
# yourself or that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('pipe.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)
#Assume user input, in training column order:
# [pclass, sex, age, sibsp, parch, fare, embarked]
test_input1 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'], dtype='object').reshape(1,7)
pipe.predict(test_input1)

# 5) Mathematical Transforms 

### QQ Plot 

In [None]:
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

# Load the Titanic dataset (real-world data, not synthetic normal samples)
df = sns.load_dataset('titanic')
# Create QQ plot of 'age' against a normal distribution.
# 'age' has missing values and probplot does not skip NaNs, so drop them first;
# otherwise the ordered values and the fitted reference line are unreliable.
stats.probplot(df['age'].dropna(), dist="norm", plot=plt)
plt.show()


### 5.1) Functional Transformers
#### 5.1.a) Log Transformer 

In [None]:
import numpy as np
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
#dataset
data = sns.load_dataset('titanic')
# Apply log transformation
log_transformed_data = np.log(data['age'])
#ploting data before and after log transformer 
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.histplot(data['age'], ax=ax1,  kde=True)
ax1.set_title('Before Transformation')
sns.histplot(log_transformed_data, ax=ax2,  kde=True)
ax2.set_title('After Log Transformation')




#### 5.1.b) Reciprocal Transformer

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Example data
data = sns.load_dataset('titanic')
# Apply the reciprocal transformation (1/x) to the non-missing ages
reciprocal_transformed_data = 1/(data['age'].dropna())
# Plot the data before and after the reciprocal transformation
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.histplot(data['age'].dropna(), ax=ax1, kde=True)
ax1.set_title('Before Transformation')
# Plot the reciprocal result; this previously plotted log_transformed_data,
# a leftover variable from the log-transform cell.
sns.histplot(reciprocal_transformed_data, ax=ax2, kde=True)
ax2.set_title('After Reciprocal Transformation')


### 5.1.c) x-Square transform  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Example data
data = sns.load_dataset('titanic')
# Apply x-square transformation
square_transformed_data = (data['age'].dropna())**2
#ploting data before and after log transformer 
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.histplot(data['age'].dropna(), ax=ax1, kde=True)
ax1.set_title('Before Transformation')
sns.histplot(square_transformed_data.dropna(),ax=ax2, kde=True)
ax2.set_title('After X-square Transformation')


### 5.1.d) Square root transform  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Example data
data = sns.load_dataset('titanic')
# Apply square root transformation
square_root_transformed_data = np.sqrt(data['age'].dropna())
#ploting data before and after square root transformer 
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.histplot(data['age'].dropna(), ax=ax1, kde=True)
ax1.set_title('Before Transformation')
sns.histplot(square_root_transformed_data.dropna(),ax=ax2, kde=True)
ax2.set_title('After square-root Transformation')


### 5.1.e) Custom transform  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load example data
data = sns.load_dataset('titanic')

# Custom transformation function
def custom_transform(x, threshold=30):
    """Apply a piecewise transform to a single value.

    Values strictly below ``threshold`` are replaced by their square root;
    all other values (including the threshold itself) are squared.
    """
    return np.sqrt(x) if x < threshold else x ** 2

# Apply custom transformation to the 'age' column (handle missing values)
custom_transformed_data = data['age'].dropna().apply(custom_transform)

# Plotting data before and after custom transformation
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# Original data (before transformation)
sns.histplot(data['age'].dropna(), ax=ax1, kde=True)
ax1.set_title('Original Age Data')

# Custom transformed data (after transformation)
sns.histplot(custom_transformed_data, ax=ax2, kde=True, color='purple')
ax2.set_title('Custom Transformed Age Data')

plt.tight_layout()
plt.show()

## 5.2) Functional Transform  

In [None]:
import numpy as np 
import pandas as pd 
import scipy.stats as stats
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer


# 1) LOAD DATASET 
df=sns.load_dataset('titanic')
df=df[['age', 'fare', 'survived']]
df.info()

#2.1) Imputation of Age 
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df['age']: the chained in-place form is deprecated and, under pandas
# copy-on-write, modifies a temporary object and leaves df unchanged.
df['age'] = df['age'].fillna(df['age'].mean())
df.isnull().sum() #check for Null values

#2.2) Extract the X and y from df  & DO Train-test split  
x=df.iloc[:, 0:2]
y=df.iloc[:, -1]
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=0)

#2.3) Checking the Normality of the data then Transforming 
#Ploting age 
plt.figure(figsize=(9,2))
plt.subplot(121)
sns.histplot(x_train['age'], kde=True) 
plt.subplot(122)
stats.probplot(x_train['age'], dist='norm', plot=plt)
plt.show()
#ploting fare 
plt.figure(figsize=(9,2))
plt.subplot(121)
sns.histplot(x_train['fare'], kde=True) 
plt.subplot(122)
stats.probplot(x_train['fare'], dist='norm', plot=plt)
plt.show()


#3) BEFORE NORMALIZATION: Fit in the model and check the accuracy 
clf1=LogisticRegression()
clf2=DecisionTreeClassifier()
#traing the model 
clf1.fit(x_train, y_train)
clf2.fit(x_train, y_train)
#prediction 
y_pred1=clf1.predict(x_test)
y_pred2=clf2.predict(x_test)
#calculating the accuracy 
print('LR Accuracy=', accuracy_score(y_test, y_pred1))
print('DT Accuracy=', accuracy_score(y_test, y_pred2))

#4) AFTER NORMALIZATION: Fit in the model and check the accuracy 
trf=FunctionTransformer(func=np.log1p)

x_train_transform=trf.fit_transform(x_train)
x_test_transform=trf.transform(x_test)

clf1=LogisticRegression()
clf2=DecisionTreeClassifier()
#traing the model 
clf1.fit(x_train_transform, y_train)
clf2.fit(x_train_transform, y_train)
#prediction 
y_pred_tranformed1=clf1.predict(x_test_transform)
y_pred_tranformed2=clf2.predict(x_test_transform)
#calculating the accuracy 
print('LR Accuracy=', accuracy_score(y_test, y_pred_tranformed1))
print('DT Accuracy=', accuracy_score(y_test, y_pred_tranformed2))

#5)CROSS VALIDATING THE RESULT FOR CONFIRMATION 
x_transformed=trf.fit_transform(x)
clf1=LogisticRegression()
clf2=DecisionTreeClassifier()
print('LR',np.mean(cross_val_score(clf1, x_transformed, y, scoring='accuracy', cv=15)))
print('DT',np.mean(cross_val_score(clf2, x_transformed, y, scoring='accuracy', cv=10)))


#6) Now checking the QQ plot BEFORE and AFTER the FunctionTransformer
#before transform
plt.figure(figsize=(9,2))
plt.subplot(121)
stats.probplot(x_train['fare'], dist='norm', plot=plt)
#after Transform  
plt.subplot(122)
stats.probplot(x_train_transform['fare'], dist='norm', plot=plt)
plt.show()


## 5.3) Power Transform  

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy.stats as stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PowerTransformer


#Data Ingention 
df=pd.read_csv('concrete.csv')
df.sample(2)
df.info()
# TRAIN-TEST SPLIT
x=df.iloc[:, :-1]
y=df.iloc[:, -1] 
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=0)

#BEFORE Transdormation: Trainig the model, predict, Accuracy 
lr=LinearRegression()
lr.fit(x_train, y_train)
y_pred=lr.predict(x_test)
print('LR accuracy=', r2_score(y_test, y_pred))
print('lr_crossvalidation', np.mean(cross_val_score(lr, x, y, scoring='r2', cv=10)))

#Plotting the QQ plot 
for i in x_train.columns:
    plt.figure(figsize=(6,2))
    plt.subplot(121)
    sns.histplot(x_train[i], kde=True)
    plt.title(i)
    plt.subplot(122)
    stats.probplot(x_train[i], dist='norm', plot=plt)
    plt.title(i)
    plt.show()

#TRANSFORMING 
pt=PowerTransformer(method='box-cox', standardize=True)
x_train_transform= pt.fit_transform(x_train+0.000001)
x_test_transform=pt.transform(x_test+0.000001)

lr=LinearRegression()
lr.fit(x_train_transform, y_train)
y_pred_tranform=lr.predict(x_test_transform)

print('LR accuracy after trans=', r2_score(y_test, y_pred_tranform))
#cross validation after transform
pt=PowerTransformer(method='box-cox', standardize=True)
x_trans=pt.fit_transform(x+0.000001)
print('LR_Trans_cross valid=', np.mean(cross_val_score(lr, x_trans, y, scoring='r2', cv=10)))

#Ploting the graph Before and After Transformation 
x_trans = pd.DataFrame(x_trans, columns=x_train.columns)
for i in x_trans.columns:
    plt.figure(figsize=(6, 2))
    plt.subplot(121)
    sns.histplot(x_train[i], kde=True)
    plt.title(f"Original: {i}")
    plt.subplot(122)
    column_index = x_train.columns.get_loc(i)  # Get the index of the column
    sns.histplot(x_train_transform[:, column_index], kde=True)  # Use index
    plt.show()


# 6) Numerical Encoding 
## 6.1) Binning 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

# Load the dataset
df = sns.load_dataset('titanic')
df = df[['age', 'fare', 'survived']]
df.dropna(inplace=True)

# Split the data into features and target
X = df[['age', 'fare']]  # Features (age and fare)
y = df['survived']       # Target (survived)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function for discretization
def discretize(bins, strategy):
    """Bin 'age' and 'fare', score a decision tree, and plot before/after.

    Both columns of the module-level ``X_train`` are discretized into
    ``bins`` ordinal bins with the given KBinsDiscretizer ``strategy``.
    A DecisionTreeClassifier is then cross-validated (10 folds, accuracy)
    on the binned features against ``y_train`` and the mean score is
    printed. Finally, one figure per feature shows its histogram before
    and after binning.
    """
    # (column name in X_train, label used in the plot titles)
    panels = (('age', 'Age'), ('fare', 'Fare'))

    # One independent discretizer per column, addressed by position.
    binner = ColumnTransformer([
        (column, KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy), [position])
        for position, (column, _) in enumerate(panels)
    ])
    binned = binner.fit_transform(X_train)

    # Model training and cross-validation
    fold_scores = cross_val_score(
        DecisionTreeClassifier(), binned, y_train, cv=10, scoring='accuracy'
    )
    mean_accuracy = np.mean(fold_scores)
    print(f"Cross-Validation Accuracy: {mean_accuracy}")

    # Plotting: left panel is the raw feature, right panel is the bin index.
    for position, (column, label) in enumerate(panels):
        plt.figure(figsize=(6, 2))
        plt.subplot(121)
        plt.hist(X_train[column], bins=10, color='blue', alpha=0.7)
        plt.title(f"{label} Before")
        plt.subplot(122)
        plt.hist(binned[:, position], bins=10, color='red', alpha=0.7)
        plt.title(f"{label} After")
        plt.show()

# Example usage
discretize(5, 'kmeans')

## 6.2) Binarization

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

# Load and prepare the dataset
import seaborn as sns  # this cell uses sns but did not import it

df = sns.load_dataset('titanic')
# Select the columns and drop incomplete rows in one expression. The result is
# a new frame, which avoids in-place mutation of a column slice.
df = df[['age', 'fare', 'sibsp', 'parch', 'survived']].dropna()

# Create a new feature 'family' by combining 'sibsp' and 'parch'
df['family'] = df['sibsp'] + df['parch']
df = df.drop(columns=['sibsp', 'parch'])

# Split the data into features (X) and target (y)
X = df.drop(columns=['survived'])
y = df['survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.preprocessing import Binarizer

# Apply binarization to the 'family' feature
trf = ColumnTransformer([
    ('bin', Binarizer(copy=False), ['family'])
], remainder='passthrough')

X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

# Display the transformed training data
pd.DataFrame(X_train_trf, columns=['family', 'age', 'fare']).head()

# Train a Decision Tree classifier on the binarized data
clf = DecisionTreeClassifier()
clf.fit(X_train_trf, y_train)

# Predict and evaluate the model
y_pred2 = clf.predict(X_test_trf)
print("Accuracy with binarization:", accuracy_score(y_test, y_pred2))

# Cross-validation with binarized data
X_trf = trf.fit_transform(X)
print("Cross-validation score with binarization:", np.mean(cross_val_score(DecisionTreeClassifier(), X_trf, y, cv=10, scoring='accuracy')))

# 7) Handling Missing Values

### 7.1) Initial insight 
#### 7.1.a) Glance at missing values

In [None]:
import pandas as pd
import numpy as np 
df=pd.read_csv('heart_disease.csv')
df.sample(3)
miss_value=df.isnull().sum()
miss_value[miss_value>0]

#### 7.1.b) Proportion of missing values  

In [None]:
import pandas as pd
import numpy as np
data = pd.read_csv('heart_disease.csv')
def missing_values(df, output='summary'):
    """Summarise the missing values of ``df`` column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    output : str, default 'summary'
        Selects what is returned:
        - 'miss_value': Series with the count of missing values per column.
        - 'percentage_miss_val': Series with the percentage of missing
          values per column.
        - anything else: DataFrame with both figures, restricted to columns
          that have at least one missing value, sorted by percentage in
          descending order and rounded to one decimal place.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        See ``output``.
    """
    # 1) Total missing values per column
    miss_val = df.isnull().sum()
    # 2) Percentage of missing values per column
    percentage_miss_val = (miss_val / len(df)) * 100

    # Return the local results. This previously returned `miss_value`, a name
    # that is not defined inside this function.
    if output == 'miss_value':
        return miss_val
    if output == 'percentage_miss_val':
        return percentage_miss_val

    # 3) Combine counts and percentages into one table
    miss_val_table = pd.concat([miss_val, percentage_miss_val], axis=1)
    miss_val_table.columns = ['Missing Values', '% of Total Values']
    # 4) Keep only columns with missing data, largest percentage first
    has_missing = miss_val_table.iloc[:, 0] != 0
    return miss_val_table[has_missing].sort_values('% of Total Values', ascending=False).round(1)
# Example usage with your data
missing_data_summary = missing_values(data, output='miss_value')
# Display the value just computed (not a same-named variable left over from another cell)
missing_data_summary

### 7.2) Missingno library

In [None]:
import pandas as pd 
import missingno as msno
df=pd.read_csv('heart_disease.csv')
#1) Bar chart 
msno.bar(df)
#2) Matrix chart
msno.matrix(df)
#3) Heat map 
msno.heatmap(df)
#4) dendragram  
msno.dendrogram(df)

# 8) Methods for handling missing values  

## 8.1) Deletion 

In [None]:
import pandas as pd 
import numpy as np

df=pd.read_csv('data_science_job.csv')
#Infomation about missing values
miss_value=df.isnull().sum()
per_miss_values=100*miss_value/len(df)
miss_val_info_table= pd.concat([miss_value, per_miss_values], axis=1).round(1)
miss_val_info_table.columns=['Missing Values','% of Total Values'] 
miss_val_info_table[miss_val_info_table.iloc[:, 0] !=0]


#1) Deletions
# Complete-case analysis restricted to the columns in which some, but fewer
# than 5%, of the values are missing.
cols_no_miss = [
    var for var in df.columns
    if 0 < df[var].isnull().sum()*100/len(df) < 5
]
# Keep only those columns ...
df_new = df[cols_no_miss]
# ... and drop every row that has a missing value in any of them
df_new = df_new.dropna()
print(f"Original shape: {df.shape}, shape after deletions: {df_new.shape}")

#---------------Now checking whetehr Data is MCAR:------------

#<<<<<<<---------FOR NUMERICAL VARIABLES----------->>>>>>

# A) Ploting the Histogram 
import matplotlib.pyplot as plt 
fig=plt.figure(figsize=(8,3))
plt.subplot(121)
df['training_hours'].hist(bins=50,density=True, color='r' )
df_new['training_hours'].hist(bins=50, density=True ,color='g' )
#B) Probability graph
plt.subplot(122) 
df['training_hours'].plot.density(color='r' )
df_new['training_hours'].plot.density(color='g')
plt.show()

#<<<<<<<---------FOR CATEGORICAL VARIABLES----------->>>>>>
# Ratio of observations per category, before and after deletion.
# Each ratio is divided by the size of its own frame so both columns are true
# proportions; dividing the after-deletion counts by len(df) understated them.
temp=pd.concat([
    df['enrolled_university'].value_counts()/len(df),          # in the original data
    df_new['enrolled_university'].value_counts()/len(df_new)   # in the data after deletion
], axis=1)
temp.columns=['original', 'after deletion']
print(temp)


# 8.2) Imputation 

### 8.2.a) Univariate Imputation: (Mean, median, Mode)
##### >> USING PANDAS LIBRARY

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt  

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

df=pd.read_csv('titanic_toy.csv')
#checing the info % of missiness 
df.isnull().mean()

#TRAIN-TEST SPLIT 
x=df.iloc[:, :-1]
y=df.iloc[:, -1]
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2 , random_state=0)

#Transformation -----USING PANDAS----------- as age and fare have null values
mean_age=x_train['Age'].mean()
median_age=x_train['Age'].median()
mean_fare=x_train['Fare'].mean()
median_fare=x_train['Fare'].median()
# creating X_train and test DF having 4-Imputed cols
x_train['age_mean']=x_train['Age'].fillna(mean_age)
x_train['age_median']=x_train['Age'].fillna(median_age)
x_train['fare_mean']=x_train['Fare'].fillna(mean_fare)
x_train['fare_median']=x_train['Fare'].fillna(median_fare)


#Checking following after imputations 
#a) Change in shapep--check change in variance
print("Original Age var:", x_train['Age'].var(),"\n"
      "      After Mean-Age Imputation var:",x_train['age_mean'].var(),"\n"
      "      After Median-Age Imputation var:",x_train['age_median'].var()
     )
print("Original Fare var:", x_train['Fare'].var(),"\n"
      "      After Mean-Fare Imputation var:", x_train['fare_mean'].var(),"\n"
      "      After Median-Fare Imputation var:", x_train['fare_median'].var()
     )
#b)plotting to check the distribution 
plt.figure(figsize=(10,5))
plt.subplot(121)
x_train['Age'].plot(kind='kde', color='r' )
x_train['age_mean'].plot(kind='kde', color='b' )
x_train['age_median'].plot(kind='kde', color='g' )
plt.legend()

plt.subplot(122)
x_train['Fare'].plot(kind='kde', color='r' )
x_train['fare_mean'].plot(kind='kde', color='b' )
x_train['fare_median'].plot(kind='kde', color='g' )
plt.legend()
plt.show()

#c) Covariance 
x_train.cov()

#d) Draw box plot 
# Compare the original column with both imputed versions.
# 'age_median' was previously listed twice and 'age_mean' was missing.
plt.figure(figsize=(10,5))
plt.subplot(121)
x_train[['Age', 'age_mean', 'age_median']].boxplot()
plt.subplot(122)
x_train[['Fare', 'fare_mean', 'fare_median']].boxplot()
plt.show()

##### >> USING Scikitlearn

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Data ingestion 
df=pd.read_csv('titanic_toy.csv')

#train-test split 
x=df.iloc[:, :-1]
y=df.iloc[:, -1]
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=42)

#Using simple imputer
imputer1=SimpleImputer(strategy='mean')
imputer2=SimpleImputer(strategy='median')

trf=ColumnTransformer([
    ('imputer1',imputer1, ['Age']),
    ('imputer2', imputer2, ['Fare'])
    ], remainder='passthrough')

#Fit and transform 
trf.fit(x_train)
x_train_trans=trf.transform(x_train)
x_test_trans=trf.transform(x_test)

#Now perform the basic checks to make sure there is no RED FLAG
# ColumnTransformer emits the transformed columns first ('Age', 'Fare') and the
# passthrough remainder afterwards in its original order, so derive the labels
# in that order rather than assuming it matches x_train.columns.
trans_columns = ['Age', 'Fare'] + [col for col in x_train.columns if col not in ('Age', 'Fare')]
# Reuse x_train's index so the imputed values stay aligned with the original
# rows when both are compared side by side later; the default RangeIndex does
# not match the shuffled train index.
x_train_trans_df=pd.DataFrame(x_train_trans, columns=trans_columns, index=x_train.index)

#a) Change in shapep--check change in variance
print("Original Age var:", x_train['Age'].var(),"\n"
      "After Mean-Age Imputation var:",x_train_trans_df['Age'].var()
     )
print("Original Fare var:", x_train['Fare'].var(),"\n"
      "After Median-Fare Imputation var:", x_train_trans_df['Fare'].var()
      )
#b)plotting to check the distribution 
plt.figure(figsize=(10,5))
plt.subplot(121)
x_train['Age'].plot(kind='kde', color='r')
x_train_trans_df['Age'].plot(kind='kde', color='g' )
plt.legend()

plt.subplot(122)
x_train['Fare'].plot(kind='kde', color='r' )
x_train_trans_df['Fare'].plot(kind='kde', color='g' )
plt.legend()
plt.show()

#c) Draw box plot 
df_box = pd.DataFrame({
    'Original Age': x_train['Age'],
    'Transformed Age': x_train_trans_df['Age'], 
    'Original Fare': x_train['Fare'],
    'Transformed Fare': x_train_trans_df['Fare']
    })
plt.figure(figsize=(10,5))
plt.subplot(121)
df_box[['Original Age', 'Transformed Age']].boxplot()
plt.subplot(122)
df_box[['Original Fare','Transformed Fare']].boxplot()
plt.show()
#c) Covariance 
df_box.cov()

#### 8.2.b) Constant imputation

In [24]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Sample DataFrame
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
data = {'City': ['New York', 'Los Angeles', np.nan, 'Chicago', np.nan]}
df = pd.DataFrame(data)

# Instantiate the SimpleImputer with strategy 'constant' and fill_value 'Unknown'
imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

# Fit and transform the data
df['City'] = imputer.fit_transform(df[['City']]).ravel()

print(df)
# df.isna().sum()


          City
0     New York
1  Los Angeles
2      Unknown
3      Chicago
4      Unknown


In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Sample DataFrame
data = {'City': ['New York', 'Los Angeles', np.nan, 'Chicago', np.nan], 
        'income': [100, '99', np.nan, '80', np.nan]}
df = pd.DataFrame(data)

# Instantiate the SimpleImputer with strategy 'constant' and fill_value 'Unknown'
imputer1 = SimpleImputer(strategy='constant', fill_value='Unknown')
imputer2 = SimpleImputer(strategy='constant', fill_value=-9999)

# Fit and transform the data
df['City'] = imputer1.fit_transform(df[['City']]).ravel()
df['income'] = imputer2.fit_transform(df[['income']]).ravel()
print(df)



          City income
0     New York    100
1  Los Angeles     99
2      Unknown  -9999
3      Chicago     80
4      Unknown  -9999
