# Assignment2 - Supervised Learning flow

# Part 1 - Student details:
* Please write the first name and the last 4 digits of the ID of each student. For example:
<pre>Israel 9812</pre>

In [1]:
# student 1: Osher 4814
# student 2: Alon 4694

## Part 2 - Initial Preparations 
You may add as many code cells as needed.

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Read the train and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('wine_train.csv', 'wine_test.csv')
)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Pairwise correlations between the columns of the test set.
# NOTE(review): this inspects the held-out test data, while the correlation
# heatmap further down is drawn from train_data -- confirm the test frame is
# really the one intended here.
test_data.corr()

In [None]:
# Summary statistics for the training data; include='all' asks pandas to
# describe every column rather than only the numeric ones.
train_data.describe(include='all')

In [None]:
# Box plot of alcohol content for each target class in the training set.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='target', y='alcohol', data=train_data, ax=ax)
ax.set_title('Distribution of Alcohol Levels by Target Class')
# Labels now match the plotted variables: the x-axis carries the class label
# and the y-axis the alcohol values (they were previously labelled
# "Alcohol Content" and "Frequency").
ax.set_xlabel('Target Class')
ax.set_ylabel('Alcohol Content')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between training-set columns.
corr_matrix = train_data.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## Part 3 - Experiments
You may add as many code cells as needed.

In [None]:
# Split each frame into the feature matrix and the label column.
# 'target' is the name of the label column; change it here if the dataset
# uses a different one.
y_train = train_data['target']
X_train = train_data.drop(columns='target')

# The test set carries the label column as well, so it is split the same way.
y_test = test_data['target']
X_test = test_data.drop(columns='target')

In [None]:
# Preprocessing steps and scorer reused by every pipeline in Part 3.
scaler = StandardScaler()
# Seeded so that results stay reproducible should PCA select its randomized
# SVD solver (the default solver choice depends on the shape of the data).
pca = PCA(n_components=7, random_state=42)
# Macro-averaged F1 as the model-selection metric.
scoring = make_scorer(f1_score, average='macro')

# Scale -> project onto 7 principal components -> support vector classifier.
pipeline_svc = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('model', SVC())
])

# Hyperparameters searched for the SVC step ('model__' targets that step).
param_grid_svc = {
    'model__C': [0.5, 1, 7],
    'model__kernel': ['linear', 'rbf']
}

# 5-fold cross-validated grid search scored with macro F1.
grid_search_svc = GridSearchCV(
    estimator=pipeline_svc,
    param_grid=param_grid_svc,
    cv=5,
    scoring=scoring
)
grid_search_svc.fit(X_train, y_train)

print("Best parameters for SVC:", grid_search_svc.best_params_)
print(f"Best cross-validation score for SVC: {grid_search_svc.best_score_:.4f}")
# Keep the winning pipeline for the model comparison later on.
best_model_svc = grid_search_svc.best_estimator_

In [None]:
# Scale -> PCA -> random forest, reusing the shared preprocessing steps.
pipeline_rf = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    # Seeded: an unseeded forest gives different CV scores (and possibly a
    # different "best" model) on every run of the notebook.
    ('model', RandomForestClassifier(random_state=42))
])

# Hyperparameters searched for the forest ('model__' targets that step).
param_grid_rf = {
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [10, 20, 30],
    'model__min_samples_split': [2, 5]
}

# 5-fold cross-validated grid search scored with macro F1.
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring=scoring
)
grid_search_rf.fit(X_train, y_train)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print(f"Best cross-validation score for Random Forest: {grid_search_rf.best_score_:.4f}")
# Keep the winning pipeline for the model comparison later on.
best_model_rf = grid_search_rf.best_estimator_

In [None]:
# KNN candidate: same scaling + PCA front end as the other pipelines.
knn_steps = [
    ('scaler', scaler),
    ('pca', pca),
    ('model', KNeighborsClassifier()),
]
pipeline_knn = Pipeline(knn_steps)

# Search over the neighbourhood size and the vote weighting scheme.
param_grid_knn = dict(
    model__n_neighbors=[3, 5, 7],
    model__weights=['uniform', 'distance'],
)

# 5-fold cross-validated grid search scored with the shared macro-F1 scorer.
grid_search_knn = GridSearchCV(pipeline_knn, param_grid_knn, scoring=scoring, cv=5)
grid_search_knn.fit(X_train, y_train)

print(f"Best parameters for KNN: {grid_search_knn.best_params_}")
print(f"Best cross-validation score for KNN: {grid_search_knn.best_score_:.4f}")
# Keep the winning pipeline for the model comparison later on.
best_model_knn = grid_search_knn.best_estimator_

In [None]:
# Each candidate: display name -> (its grid search, its best fitted pipeline).
fitted_candidates = {
    'Random Forest': (grid_search_rf, best_model_rf),
    'Support Vector Classifier': (grid_search_svc, best_model_svc),
    'K-Nearest Neighbors': (grid_search_knn, best_model_knn),
}

# Best cross-validation score reached by each search, in the order above.
best_scores = {
    name: search.best_score_ for name, (search, _) in fitted_candidates.items()
}

# The winner is the candidate with the highest cross-validation score.
best_model_name = max(best_scores, key=best_scores.get)
best_score = best_scores[best_model_name]

print(f"\nBest Overall Model: {best_model_name}")
print(f"Best Cross-Validation Score: {best_score:.4f}")

# Pick up the fitted pipeline that belongs to the winning name.
best_estimator = fitted_candidates[best_model_name][1]


## Part 4 - Training 
Use the best combination of feature engineering and model (algorithm and hyperparameters) from the experiments part (Part 3).

In [None]:
# Final training of the model selected in Part 3.
#
# NOTE(review): `best_estimator` is a Pipeline whose first step is already a
# StandardScaler, and GridSearchCV refits the best pipeline on the full
# training set by default, so the outer scaling and this extra fit look
# redundant. They are kept because the evaluation cell predicts on
# `X_test_scaled` and must match how the model was fit.
scaler = StandardScaler()

# drop() returns a new frame, so no defensive copies of the raw data are needed.
X_train_scaled = scaler.fit_transform(train_data.drop('target', axis=1))
# The test features are transformed with the statistics learned on train only.
X_test_scaled = scaler.transform(test_data.drop('target', axis=1))

y_train = train_data["target"]
y_test = test_data["target"]

# X_train / X_test are deliberately NOT rebound to the scaled arrays: the
# Part 3 grid-search cells read those names and must keep seeing the raw
# feature frames if they are re-run after this cell.
best_estimator.fit(X_train_scaled, y_train)

## Part 5 - Apply on test and show model performance estimation

In [None]:
# Predict on the scaled test features prepared in Part 4.
y_pred = best_estimator.predict(X_test_scaled)

# Headline metrics plus the per-class breakdown.
accuracy_test = accuracy_score(y_test, y_pred)
f1_test = f1_score(y_test, y_pred, average='macro')
classification_rep_test = classification_report(y_test, y_pred)

print(
    f"F1-macro Score on Test Set: {f1_test}",
    f"Accuracy on Test Set: {accuracy_test}",
    "\nClassification Report on Test Set:",
    classification_rep_test,
    sep="\n",
)

# Side-by-side view of the first five actual vs. predicted labels.
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).head()
print(predictions_df)