### AI/ML – Improving Model Performance with Clean Data

**Task 1**: Data Preprocessing for Models

**Objective**: Enhance data quality for better AI/ML outcomes.

**Steps**:
1. Choose a dataset for training an AI/ML model.
2. Identify common data issues like null values, redundant features, or noisy data.
3. Apply preprocessing methods such as imputation, normalization, or feature engineering.

In [None]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Create a sample dataset for AI/ML with deliberate quality issues:
# missing numeric values (feature1, feature3), a missing category (feature2)
# and a noisy numeric column (feature4).
# The noise is drawn from a seeded generator so the dataset is reproducible,
# consistent with the fixed random_state used for the split below.
data = {
    'feature1': [10, 20, np.nan, 40, 50, 60, 70, 80, 90, 100, 10, 20, 30, 40, np.nan],
    'feature2': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C', 'B', None, 'A', 'C', 'B'],
    'feature3': [1000, 2000, 1500, np.nan, 3000, 4000, 3500, 5000, 2500, 4500, 1000, 2000, 1500, 2500, 3000],
    'feature4': np.random.default_rng(42).normal(50, 15, 15),
    'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

# Identify numerical and categorical features
numerical_features = ['feature1', 'feature3', 'feature4']
categorical_features = ['feature2']
target_column = 'target'

# Define preprocessing steps
# Numerical pipeline: mean imputation, then scaling
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' keeps transform() from failing on a category that
# was not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Create the column transformer that routes each column group to its pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Split data (optional, but good practice for ML)
X = df.drop(columns=[target_column])
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Apply preprocessing to the training data: the imputers, scaler and encoder
# are fitted on the training rows only.
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test data with the already-fitted preprocessor (no refit)
X_test_processed = preprocessor.transform(X_test)

# Demonstrate feature engineering (example: creating a new feature as the
# product of mean-imputed feature1 and mean-imputed feature3).
# NOTE: this column is added to df after X/X_train/X_test were derived, so it
# is not part of the processed matrices above, and the means used here are
# computed over the full dataset (including rows that landed in the test
# split). It is illustrative only.
df['feature_engineered'] = (
    df['feature1'].fillna(df['feature1'].mean())
    * df['feature3'].fillna(df['feature3'].mean())
)
# You would then include 'feature_engineered' in your feature list for preprocessing

**Task 2**: Evaluate Model Performance

**Objective**: Assess the impact of data quality improvements on model performance.

**Steps**:
1. Train a simple ML model with and without preprocessing.
2. Analyze and compare model performance metrics to evaluate the impact of data quality strategies.

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

def _classification_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for binary predictions.

    zero_division=0 makes precision/recall/F1 evaluate to 0 instead of
    warning when their denominator is zero (e.g. no positive predictions).
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )


# Recreate the sample dataset.
# feature4 is drawn from a seeded generator so that both models are trained
# and scored on the same data on every run; the split and the classifiers
# below are already pinned with random_state=42.
data = {
    'feature1': [10, 20, np.nan, 40, 50, 60, 70, 80, 90, 100, 10, 20, 30, 40, np.nan],
    'feature2': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C', 'B', None, 'A', 'C', 'B'],
    'feature3': [1000, 2000, 1500, np.nan, 3000, 4000, 3500, 5000, 2500, 4500, 1000, 2000, 1500, 2500, 3000],
    'feature4': np.random.default_rng(42).normal(50, 15, 15),
    'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

numerical_features = ['feature1', 'feature3', 'feature4']
categorical_features = ['feature2']
target_column = 'target'

# NOTE: with 15 rows and test_size=0.2 the held-out set is only a handful of
# rows, so the metrics printed below are very coarse.
X = df.drop(columns=[target_column])
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Model without comprehensive preprocessing (basic handling):
# numeric columns are mean-imputed but NOT scaled.
num_transformer_basic = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])
cat_transformer_basic = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor_basic = ColumnTransformer(
    transformers=[
        ('num', num_transformer_basic, numerical_features),
        ('cat', cat_transformer_basic, categorical_features),
    ])
model_basic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_basic),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
])
model_basic_pipeline.fit(X_train, y_train)
y_pred_basic = model_basic_pipeline.predict(X_test)

accuracy_basic, precision_basic, recall_basic, f1_basic = _classification_metrics(
    y_test, y_pred_basic)

# Model with comprehensive preprocessing (from previous Task 1):
# identical to the basic model except that numeric columns are also scaled.
numerical_transformer_comp = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer_comp = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor_comp = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_comp, numerical_features),
        ('cat', categorical_transformer_comp, categorical_features),
    ])
model_comp_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_comp),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
])
model_comp_pipeline.fit(X_train, y_train)
y_pred_comp = model_comp_pipeline.predict(X_test)

accuracy_comp, precision_comp, recall_comp, f1_comp = _classification_metrics(
    y_test, y_pred_comp)

# Report both models side by side for comparison
print(f"BasicPreprocessing-Accuracy:{accuracy_basic:.4f}Precision:{precision_basic:.4f}Recall:{recall_basic:.4f}F1-Score:{f1_basic:.4f}")
print(f"ComprehensivePreprocessing-Accuracy:{accuracy_comp:.4f}Precision:{precision_comp:.4f}Recall:{recall_comp:.4f}F1-Score:{f1_comp:.4f}")


BasicPreprocessing-Accuracy:0.3333Precision:0.5000Recall:0.5000F1-Score:0.5000
ComprehensivePreprocessing-Accuracy:0.3333Precision:0.0000Recall:0.0000F1-Score:0.0000
