# Task 1: Classify Wine Varieties

## 1. Load the Dataset

In [3]:
from sklearn.datasets import load_wine
import pandas as pd

# Load the wine dataset from scikit-learn and assemble the feature matrix
# together with the label column ('target') into one DataFrame.
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)


## 2. Data Preprocessing and Feature Engineering
* **Check for missing values.**
* **Standardize features.**

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Check for missing values up front so that a frame containing NaNs fails
# loudly here instead of somewhere deep inside model fitting.
assert not df.isna().any().any(), "dataset contains missing values"

# Split data: separate the features from the label, then hold out 20% of the
# rows as a test set (fixed seed so the split is reproducible).
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: the scaler's statistics are learned on the training split
# only and then applied to the test split.
# NOTE(review): the pipeline built in the next section embeds its own
# StandardScaler and is fit on the unscaled X_train, so these scaled arrays
# are not consumed by the grid search in the cells shown here.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 3. Pipeline Creation

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Three chained stages: standardise the features, project them onto principal
# components, then classify with a seeded random forest. The step names are
# the prefixes used by the hyperparameter grid in the tuning section.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)


## 4. Model Training and Hyperparameter Tuning

In [36]:
from sklearn.model_selection import GridSearchCV

# Search space; each key is "<pipeline step>__<parameter>".
param_grid = dict(
    pca__n_components=[2, 3, 5],
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
)

# Exhaustive search with 5-fold cross-validation over the training split.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")


Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100, 'pca__n_components': 5}


## 5. Model Evaluation

In [37]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted grid search on the held-out test split.
y_pred = grid_search.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



# Task 2: Predict California Housing Prices

## 1. Load the Dataset

In [40]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset and assemble the feature matrix
# together with the regression target ('target') into one DataFrame.
# Note: this rebinds `data` and `df` from Task 1.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)


## 2. Data Preprocessing and Feature Engineering
* **Check for missing values.**
* **Standardize features.**

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Check for missing values up front so that a frame containing NaNs fails
# loudly here instead of somewhere deep inside model fitting.
assert not df.isna().any().any(), "dataset contains missing values"

# Split data: separate the features from the target, then hold out 20% of the
# rows as a test set (fixed seed so the split is reproducible).
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: the scaler's statistics are learned on the training split
# only and then applied to the test split.
# NOTE(review): the pipeline built in the next section embeds its own
# StandardScaler and is fit on the unscaled X_train, so these scaled arrays
# are not consumed by the grid search in the cells shown here.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 3. Pipeline Creation

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

# Three chained stages: standardise the features, project them onto principal
# components, then regress with a seeded random forest. The step names are
# the prefixes used by the hyperparameter grid in the tuning section.
#
# PCA is seeded as well: depending on the scikit-learn version, the default
# svd_solver='auto' may select the randomized solver for a dataset of this
# size, which would make cross-validation scores vary between runs.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2, random_state=42)),
    ('regressor', RandomForestRegressor(random_state=42))
])


## 4. Model Training and Hyperparameter Tuning

In [50]:
from sklearn.model_selection import GridSearchCV

# Search space; each key is "<pipeline step>__<parameter>".
param_grid = {
    'pca__n_components': [2, 3, 5],
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20]
}

# 27 candidates x 5 folds = 135 random-forest fits. n_jobs=-1 spreads them
# across all available CPU cores; it only affects wall-clock time, not which
# candidate is selected.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")


Best parameters: {'pca__n_components': 5, 'regressor__max_depth': 20, 'regressor__n_estimators': 200}


## 5. Model Evaluation

In [51]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted grid search on the held-out test split.
y_pred = grid_search.predict(X_test)

# RMSE is computed as the square root of the MSE rather than via
# mean_squared_error(..., squared=False): the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, whereas this form works everywhere.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE: {rmse}")
print(f"R^2: {r2_score(y_test, y_pred)}")


RMSE: 0.7443006689547674
R^2: 0.5772438261601467
