## Wine Data


### Load and Understand the Data

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
# Load the dataset
file_path = "winequality-red.csv"
data = pd.read_csv(file_path, delimiter=';')  # Assuming the file is separated by semicolons

# Display the first few rows of the dataset
print(data.head())
print()
# Understand the structure of the dataset
print(data.info())
print( )
print(data.describe())


  fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0   7.4,0.7,0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5                                                                                               
1   7.8,0.88,0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5                                                                                               
2  7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,...                                                                                               
3  11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58...                                                                                               
4   7.4,0.7,0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5                                                                                               

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 1 columns):
 #   

### Data Exploration and Preprocessing

In [16]:
# Check for missing values
print(data.isnull().sum())

# Feature engineering (if necessary) - For simplicity, we'll use the features as they are

# Normalize/Standardize the data

scaler = StandardScaler()

# Verify 'quality' is in columns
if 'quality' in data.columns:
    scaled_features = scaler.fit_transform(data.drop('quality', axis=1))
    scaled_data = pd.DataFrame(scaled_features, columns=data.columns[:-1])
    scaled_data['quality'] = data['quality']
    print(scaled_data.head())

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality    0
dtype: int64


### Split the Data

In [17]:
# Split the dataset into training and test sets
X = scaled_data.drop('quality', axis=1)
y = scaled_data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)



NameError: name 'scaled_data' is not defined

### Implement Gradient Descent for Linear Regression

In [6]:
import numpy as np

# Add a bias term (column of ones) to the training and test data
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Initialize parameters
theta = np.random.randn(X_train_b.shape[1])

# Define the cost function
def compute_cost(X, y, theta):
    m = len(y)
    predictions = X.dot(theta)
    cost = (1/2*m) * np.sum(np.square(predictions - y))
    return cost

# Define the gradient descent function
def gradient_descent(X, y, theta, learning_rate, epochs):
    m = len(y)
    cost_history = np.zeros(epochs)
    
    for epoch in range(epochs):
        gradients = X.T.dot(X.dot(theta) - y) / m
        theta = theta - learning_rate * gradients
        cost_history[epoch] = compute_cost(X, y, theta)
        
    return theta, cost_history

# Set hyperparameters
learning_rate = 0.01
epochs = 1000

# Perform gradient descent
theta, cost_history = gradient_descent(X_train_b, y_train, theta, learning_rate, epochs)

# Print the final parameters
print("Theta:", theta)


NameError: name 'X_train' is not defined

###  Model Evaluation

In [7]:
import matplotlib.pyplot as plt

# Plot the cost function history
plt.plot(range(epochs), cost_history)
plt.xlabel('Epoch')
plt.ylabel('Cost')
plt.title('Cost Function History')
plt.show()

# Predict on the test set
y_pred = X_test_b.dot(theta)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


NameError: name 'epochs' is not defined

### Hyperparameter Tuning

In [8]:
# Example of grid search for learning rates
learning_rates = [0.001, 0.01, 0.1]
best_lr = learning_rates[0]
best_cost = float('inf')

for lr in learning_rates:
    theta, cost_history = gradient_descent(X_train_b, y_train, theta, lr, epochs)
    final_cost = cost_history[-1]
    if final_cost < best_cost:
        best_cost = final_cost
        best_lr = lr

print("Best Learning Rate:", best_lr)


NameError: name 'gradient_descent' is not defined