In [2]:
import sklearn
import numpy as np
import pandas as pd
print("scikit-learn version:", sklearn.__version__)


scikit-learn version: 1.7.1


In [3]:
data = {
    "Name": ["Alice", "Bob", "Charlie" , "David" , "Eve"],
    "Age": [25, None, 35, 40, None],
    "City": ["New York", "Los Angeles", "Chicago" , "Houston", "Phoenix"],
    "Salary": [70000, 80000, 120000, None, None]

}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
1,Bob,,Los Angeles,80000.0
2,Charlie,35.0,Chicago,120000.0
3,David,40.0,Houston,
4,Eve,,Phoenix,


In [4]:
#  Check if any null value in 
df.isnull().sum() # Count of null values in each column

Name      0
Age       2
City      0
Salary    2
dtype: int64

In [5]:
df_drop = df.dropna() # Drop rows with any null values
df_drop

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
2,Charlie,35.0,Chicago,120000.0


In [10]:
# df_fill = df.fillna({'Age': df['Age'].mean(), 'Salary': df['Salary'].mean()}) # Fill null values with mean
# df_fill

In [9]:
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
1,Bob,,Los Angeles,80000.0
2,Charlie,35.0,Chicago,120000.0
3,David,40.0,Houston,
4,Eve,,Phoenix,


In [13]:
df.isnull().mean() * 100 # Percentage of null values in each column

Name       0.0
Age       40.0
City       0.0
Salary    40.0
dtype: float64

## Encoding  🎆🌏 
(Object  -> int ) converstion

### What is Encoding? 
Encoding is the process of converting categorical data into a numerical format that can be used by machine learning algorithms. This is necessary because most machine learning models require numerical input.

### Why is Encoding Important?
- **Machine Learning Compatibility**: Many algorithms can only work with numerical data.
- **Improved Model Performance**: Proper encoding can lead to better model performance by allowing the algorithm to understand the relationships between categories.

### Types of Encoding
1. **Label Encoding**: Converts each category into a unique integer.
   - Example: `['red', 'green', 'blue']` becomes `[0, 1, 2]`.
   - Use when categories are ordinal (have a meaningful order). 
2. **One-Hot Encoding**: Creates binary columns for each category.
   - Example: `['red', 'green', 'blue']` becomes three columns:
    ```

    red | green | blue
    1   | 0     | 0
    0   | 1     | 0
    0   | 0     | 1
    ```
   - Use when categories are nominal (no meaningful order).
3. **Binary Encoding**: Converts categories into binary code.
   - Example: `['red', 'green', 'blue']` becomes:
    ```
    red   | 00
    green | 01
    blue  | 10
    ```
4. **Frequency Encoding**: Replaces categories with their frequency in the dataset.

5. **Target Encoding**: Replaces categories with the mean of the target variable for that category.



In [13]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Import Data
df = pd.read_csv("data/sample_data.csv")  

df.dtypes



Name      object
Gender    object
City      object
Age        int64
Passed      bool
dtype: object

In [15]:
df_label = df.copy()  # Create a copy of the DataFrame for label encoding

# Lable Encoding 🦄
le = LabelEncoder()
df_label["Gender_Encoded"] = le.fit_transform(df_label["Gender"]) #Encode
df_label["Passed_Encoded"] = le.fit_transform(df_label["Passed"]) #Encode

# One-Hot Encoding 🐳
df_one_hotencoded = pd.get_dummies(df_label, columns=["City"], drop_first=True) # One-hot encoding
df_one_hotencoded.head()

Unnamed: 0,Name,Gender,Age,Passed,Gender_Encoded,Passed_Encoded,City_Bangalore,City_Bhilai,City_Chennai,City_Delhi,City_Hyderabad,City_Jaipur,City_Lucknow,City_Mumbai,City_Pune
0,MunnaThanos,Female,18,True,0,1,False,True,False,False,False,False,False,False,False
1,Goti_Badmas,Female,19,True,0,1,False,True,False,False,False,False,False,False,False
2,Aman,Male,24,True,1,1,False,False,True,False,False,False,False,False,False
3,Deepak,Male,30,True,1,1,False,False,False,False,False,False,False,False,True
4,Raj,Male,27,True,1,1,False,False,False,False,False,False,False,True,False


## Feature Scaling  🥐📔
Feature scaling is the process of normalizing or standardizing the range of independent variables or features of data. It is essential for algorithms that compute distances between data points, such as k-nearest neighbors (KNN) and support vector machines (SVM).

### Why is Feature Scaling Important?
- **Improves Model Performance**: Helps algorithms converge faster and improves accuracy.
- **Prevents Dominance**: Ensures that no single feature dominates others due to its scale.
### Types of Feature Scaling
1. **Min-Max Scaling**: Scales features to a fixed range, usually [0
, 1].
   - Formula: `X_scaled = (X - X_min) / (X_max - X_min)`
   - Use when you want to preserve the relationships between features.
2. **Standardization (Z-score Normalization)**: Scales features to have a mean of 0 and a standard deviation of 1.
   - Formula: `X_scaled = (X - mean) / std`
    - Use when features have different units or scales. 
3. **Robust Scaling**: Uses the median and interquartile range to scale features, making it robust to outliers.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#! StandScaling 🐬
scaler = StandardScaler()
X_scaled = scaler.fit_transform()

#! MinMax Scaling 🫠
scaler = MinMaxScaler()
X_minmax_scaled = scaler.fit_transform()

In [33]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split # Train-Test Split 🐳


data = {
    "StudyHors": [5, 10, 15, 20, 25],
    "TestScore": [50, 60, 70, 80, 90],
}

df = pd.DataFrame(data)

# StandardScaler
standard_scaler = StandardScaler() # Standardization (Z-score normalization)
standard_scaled = standard_scaler.fit_transform(df) # Display the scaled DataFrame

standard_scaled
print(pd.DataFrame(standard_scaled, columns=['StudyHors', "TestScore"]))

#! MinMaxScaler
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(df) # Display the scaled DataFrame
minmax_scaled
print("")
print(pd.DataFrame(minmax_scaled, columns=['StudyHors', "TestScore"]))


#! Train-Test Split 🐳
X = df[['StudyHors']]  # Features
y = df[['TestScore']]    # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split the data into training and testing sets
print("")
print("X_train:\n", X_train)
print("X_test:\n", X_test)
print("y_train:\n", y_train)
print("y_test:\n", y_test)





   StudyHors  TestScore
0  -1.414214  -1.414214
1  -0.707107  -0.707107
2   0.000000   0.000000
3   0.707107   0.707107
4   1.414214   1.414214

   StudyHors  TestScore
0       0.00       0.00
1       0.25       0.25
2       0.50       0.50
3       0.75       0.75
4       1.00       1.00

X_train:
    StudyHors
4         25
2         15
0          5
3         20
X_test:
    StudyHors
1         10
y_train:
    TestScore
4         90
2         70
0         50
3         80
y_test:
    TestScore
1         60


## Supervised Learning 📚🔍
Supervised learning is a type of machine learning where the model is trained on labeled data. The algorithm learns to map input features to output labels, allowing it to make predictions on new, unseen data.

#### Types of Supervised Learning

1. **Classification** 🏷️: The task of predicting a discrete label (e.g., spam detection, image recognition).
2. **Regression** 📈: The task of predicting a continuous value (e.g., house prices, stock prices).

##### Types of Classification Algorithms 

1. **Logistic Regression** ➡️🔢: Used for binary classification tasks.
2. **Decision Trees** 🌳: A tree-like model used for both classification and regression tasks.
3. **K-Nearest Neighbors (KNN)** 👥: A simple, instance-based learning algorithm.


### Model Training ⚙️🤖
- `.fit()` 📚 — Train the model on labeled data (e.g., `X_train`, `y_train`).
- `.predict()` 🔮 — Generate predictions for new/unseen inputs (e.g., `X_test`).
- `.score()` ✅ — Quick built‑in evaluation on given data (when supported).
- `.get_params()` / `.set_params()` 🧩 — Inspect and tune hyperparameters.
- Cross-validation 🔁 — Use to estimate generalization performance.

Workflow:
1. Split data ➗: training vs. testing.
2. Initialize model 🧠: choose an algorithm (e.g., Logistic Regression, KNN).
3. Train 📈: call `.fit(X_train, y_train)`.
4. Evaluate 🧪: compute metrics (e.g., accuracy, RMSE).
5. Predict 🚀: call `.predict(X_test)` for final outputs.




In [13]:
# Linear Regression
from sklearn.linear_model import LinearRegression

X = [[1], [2], [3], [4], [5]]
Y = [20, 30, 40, 80, 90]

model = LinearRegression()
model.fit(X, Y) # Learn from training data

hours = float(input("Enter study hours ⌛: "))

predicted_score = model.predict([[hours]])
print(f"Predicted score for {hours} hours of study: {predicted_score[0]}")




Predicted score for 5.0 hours of study: 90.0


### Classification (Supervised Learning)
#### 1) Logistic Regression 🦢




In [19]:
from sklearn.linear_model import LogisticRegression

X = [[1], [2], [3], [4], [5]]  # hours studied input
Y = [0, 0, 0, 1, 1]  # Binary target variable

model = LogisticRegression()
model.fit(X, Y)

hours = float(input("Enter study hours ⌛: "))
predicted_pass = model.predict([[hours]])

print(f"With {hours} hours of study, the predicted outcome is: {'Pass ✅' if predicted_pass[0] == 1 else 'Fail ❌'}")

With 2.0 hours of study, the predicted outcome is: Fail ❌


### 2) K-Nearest Neighbors (KNN) 🦘
KNN means "the K closest neighbors" and is a simple, yet powerful algorithm used for classification and regression tasks.



In [21]:
from sklearn.neighbors import KNeighborsClassifier

X = [[180,7], [200,7.5], [250,8], [300,9], [350,9.5] , [400,10]]  # weight and size of fruit input
Y = [0, 0, 0, 1, 1, 1]  # passed exam (0=apple, 1=orange)

model = KNeighborsClassifier(n_neighbors=3)

model.fit(X, Y)

weight = float(input("Enter fruit weight (in grams) 🍎🍊: "))
size = float(input("Enter fruit size (in inches) 🍎🍊: "))

prediction = model.predict([[weight, size]])
if prediction[0] == 0:
    print("The fruit is likely an apple 🍎.")
else:
    print("The fruit is likely an orange 🍊.")




The fruit is likely an orange 🍊.


### 3) Decision Tree Classifier 🥸
means of making decisions based on the features of the data. It splits the data into branches to make predictions.


In [1]:
from sklearn.tree import DecisionTreeClassifier

X = [
    [7,2], # Apple
    [8,3], # Apple
    [9,4], # Apple
    [10,5], # Orange
    [11,6], # Orange
    [12,7]  # Orange
]

Y = [0, 0, 0, 1, 1, 1]  # 0=Apple, 1=Orange

model = DecisionTreeClassifier()

model.fit(X, Y)

size = float(input("Enter fruit size (in inches) 🍎🍊: "))
shade = float(input("Enter the color shade: ): "))

prediction = model.predict([[size, shade]])
fruit = "Apple" if prediction[0] == 0 else "Orange"
print(f"The predicted fruit is: {fruit}")


The predicted fruit is: Apple


### Evaluation Metrics 🐬✨

#### 1) Classification Metrics 🏷️

- **Accuracy** 🎯: The ratio of correctly predicted instances to the total instances.
- **Precision** 🧪: The ratio of true positive predictions to the total positive predictions.
- **Recall (Sensitivity)** 🔍: The ratio of true positive predictions to the total actual positives.
- **F1 Score** ⚖️: The harmonic mean of precision and recall.

#### 2) Regression Metrics 📈

- **Mean Absolute Error (MAE)** 📏: The average of absolute differences between predicted and actual values.
- **Mean Squared Error (MSE)** 🧮: The average of squared differences between predicted and actual values.
- **R-squared** 📊: The proportion of variance in the dependent variable that is predictable from the independent variables.





In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# True answers (what actually happend)
y_true = [0, 1, 1, 0, 1,]  # Actual labels

# Model Predications (what it guessed)
y_pred = [0, 1, 0, 0, 1]  # Predicted labels

# Evaluation 🤪
print("Accuracy:", str(accuracy_score(y_true, y_pred) * 100) + "%")
print("Precision:", str(precision_score(y_true, y_pred) * 100) + "%")
print("Recall:", str(recall_score(y_true, y_pred) * 100) + "%")
print("F1 Score:", str(f1_score(y_true, y_pred) * 100) + "%")

Accuracy: 80.0%
Precision: 100.0%
Recall: 66.66666666666666%
F1 Score: 80.0%


### 2) Confusion Matrix 📊

#### Error Metrics in Regression 🧮

1. **MAE (Mean Absolute Error)** 🧾  
    - Measures the average of absolute differences between predicted values and actual values.  
    - Example: If the prediction is off by 2, 3, and 1, the MAE is `(2 + 3 + 1) / 3 = 2`.  
    - **Simple Words**: How far off, on average, are your predictions?

2. **MSE (Mean Squared Error)** 🔢  
    - Measures the average of squared differences between predicted values and actual values.  
    - Example: If the prediction is off by 2, 3, and 1, the MSE is `(2² + 3² + 1²) / 3 = 4.67`.  
    - **Simple Words**: Bigger errors hurt more because they are squared.

3. **RMSE (Root Mean Squared Error)** 📐  
    - The square root of the MSE.  
    - Example: If the MSE is 4.67, the RMSE is `√4.67 ≈ 2.16`.  
    - **Simple Words**: Like MSE, but in the same units as the original data.


In [20]:
from sklearn.metrics import confusion_matrix, mean_absolute_error, mean_squared_error , root_mean_squared_error

y_true = [1, 0, 1, 0 ,1]  # Actual values
y_pred = [1, 1 ,1, 0, 0]  # Predicted values

# Evaluate the model
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", conf_matrix)

mae = mean_absolute_error(y_true, y_pred)
print("Mean Absolute Error:", str(mae * 100) + "%")

mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", str(mse * 100) + "%")

rmse = root_mean_squared_error(y_true, y_pred)
print("Root Mean Squared Error:", str(rmse * 100) + "%")

Confusion Matrix:
 [[1 1]
 [1 2]]
Mean Absolute Error: 40.0%
Mean Squared Error: 40.0%
Root Mean Squared Error: 63.245553203367585%
