<a href="https://colab.research.google.com/github/Tejas9523/DataPreprocessing/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score

In [15]:
# Load the Iris dataset
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [17]:
df.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object

In [4]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

First few rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [5]:
# Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())


Missing values in the dataset:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [6]:
# Encode categorical features (if any)
# In this case, the target is already numerical, but if it were categorical, we'd use LabelEncoder
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

In [7]:
# Split the data into features and target
X = df.drop(columns=['target'])
y = df['target']

In [8]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Display the preprocessed training data
print("\nPreprocessed training data:")
print(pd.DataFrame(X_train, columns=iris.feature_names).head())


Preprocessed training data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -1.473937          1.203658          -1.562535         -1.312603
1          -0.133071          2.992376          -1.276006         -1.045633
2           1.085898          0.085709           0.385858          0.289218
3          -1.230143          0.756479          -1.218701         -1.312603
4          -1.717731          0.309299          -1.390618         -1.312603


In [12]:
print("\nTraining data labels:")
print(y_train.head())


Training data labels:
22    0
15    0
65    1
11    0
42    0
Name: target, dtype: int64


In [13]:
print("\nPreprocessed test data:")
print(pd.DataFrame(X_test, columns=iris.feature_names).head())


Preprocessed test data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.354517         -0.585060           0.557775          0.022248
1          -0.133071          1.650837          -1.161395         -1.179118
2           2.304867         -1.032239           1.818500          1.490583
3           0.232620         -0.361470           0.443164          0.422703
4           1.207795         -0.585060           0.615081          0.289218


In [14]:
print("\nTest data labels:")
print(y_test.head())


Test data labels:
73     1
18     0
118    2
78     1
76     1
Name: target, dtype: int64


In [20]:
# Train a Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=200)
log_reg.fit(X_train, y_train)

In [34]:
# Train a Naïve Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)

In [21]:
# Make predictions on the test set
y_pred = log_reg.predict(X_test)

In [22]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [23]:
# Display evaluation results
print("\nModel Accuracy:")
print(accuracy)


Model Accuracy:
1.0


In [24]:
print("\nClassification Report:")
print(classification_rep)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [25]:
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [30]:
# Extract True Positives, False Positives, True Negatives, False Negatives
TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
TN = conf_matrix[0, 0]
FN = conf_matrix[1, 0]

# Error Rate
error_rate = 1 - accuracy

# Precision
precision = precision_score(y_test, y_pred, average='weighted')

# Recall
recall = recall_score(y_test, y_pred, average='weighted')

print(f"\nTrue Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN}")

print(f"\nAccuracy: {accuracy}")
print(f"Error Rate: {error_rate}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

True Positives (TP): 9
False Positives (FP): 0
True Negatives (TN): 10
False Negatives (FN): 0

Accuracy: 1.0
Error Rate: 0.0
Precision: 1.0
Recall: 1.0
