In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Read the raw file; header=None tells pandas the first line is data, so the
# columns receive integer labels until they are renamed.
df = pd.read_csv(filepath_or_buffer='wine.data', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Replace the integer column labels with descriptive names.
# The first column is the class label; the remaining thirteen are the features.
df.columns = [
    'target',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]
df.head()

Unnamed: 0,target,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
import io

# DataFrame.info() writes its report to a stream and returns None, so assigning
# its return value would leave `df_info` empty. Capture the report through the
# `buf` argument instead, so `df_info` holds the text that is displayed.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
df_info = info_buffer.getvalue()
print(df_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   target                        178 non-null    int64  
 1   alcohol                       178 non-null    float64
 2   malic_acid                    178 non-null    float64
 3   ash                           178 non-null    float64
 4   alcalinity_of_ash             178 non-null    float64
 5   magnesium                     178 non-null    int64  
 6   total_phenols                 178 non-null    float64
 7   flavanoids                    178 non-null    float64
 8   nonflavanoid_phenols          178 non-null    float64
 9   proanthocyanins               178 non-null    float64
 10  color_intensity               178 non-null    float64
 11  hue                           178 non-null    float64
 12  od280/od315_of_diluted_wines  178 non-null    float64
 13  proli

In [5]:
# Count the null entries in every column (isna is the same check as isnull)
missing_values = df.isna().sum()

# Show the per-column null counts
print(missing_values)

target                          0
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64


In [6]:
# Remove every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [7]:
# Recompute the per-column null counts on the current frame. The previous
# `missing_values` Series was calculated before dropna() ran, so printing it
# again would not verify anything about the cleaned data.
missing_values = df.isnull().sum()
print(missing_values)

target                          0
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64


In [8]:
# Separate the class label (y) from the feature columns (X)
y = df['target']
X = df.drop('target', axis=1)


In [9]:
# Standardize the feature columns with StandardScaler (imported in the first cell).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed in a later cell, so test-set statistics influence the training
# features. Consider fitting on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rebuild a DataFrame from the scaled array. Reuse X's index as well as its
# columns: fit_transform returns a bare array, and without `index=` the rows
# would be relabelled 0..n-1 and could no longer be aligned with `y` by index
# (pd.concat(..., axis=1) aligns on index) once any row has been dropped.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
import sagemaker
from sagemaker import get_execution_role

# Execution role, SageMaker session and the session's default S3 bucket
role = get_execution_role()
session = sagemaker.Session()
bucket = session.default_bucket()

# Wrap the training split in DataFrames so it can be written with to_csv
X_train_df = pd.DataFrame(X_train, columns=X.columns)
y_train_df = pd.DataFrame(y_train)

# Features and labels are written to separate header-less, index-free CSV files
csv_exports = (
    (X_train_df, 'train_data.csv'),
    (y_train_df, 'train_labels.csv'),
)
for frame, filename in csv_exports:
    frame.to_csv(filename, header=False, index=False)

# Push both files to the bucket under the 'wine' prefix and keep their S3 URIs
train_data_s3, train_labels_s3 = (
    session.upload_data(filename, bucket=bucket, key_prefix='wine')
    for _, filename in csv_exports
)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [12]:
from sagemaker.estimator import Estimator
import sagemaker

# Look up the XGBoost container image for the session's region
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='1.3-1',
)

# Estimator configuration: one ml.m5.large training instance, with the model
# artifacts written under the bucket's output/ prefix
estimator_settings = {
    'image_uri': container,
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.m5.large',
    'output_path': f's3://{bucket}/output/',
    'sagemaker_session': session,
}
xgb = Estimator(**estimator_settings)

# XGBoost hyperparameters
xgb_hyperparameters = {
    'objective': "multi:softmax",
    'num_class': 3,    # Number of classes in the dataset
    'num_round': 100,  # Number of boosting rounds
}
xgb.set_hyperparameters(**xgb_hyperparameters)


In [14]:

# Build a single table with the label column first, followed by the features,
# and write it out as a header-less CSV.
# NOTE(review): the labels written here are the raw `target` values (not shifted
# to start at 0), and this CSV is not uploaded in the cells shown here. Confirm
# whether the file is still needed.
combined_frames = [y_train_df, X_train_df]
train_input = pd.concat(combined_frames, axis='columns')
train_input.to_csv('train_combined.csv', header=False, index=False)

# Preview the first rows to check the column layout
print(train_input.head())


     target   alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
158       3  1.654492   -0.589180  1.218995           1.653086  -0.122282   
137       3 -0.581338    2.848870  0.999674           1.653086  -0.262708   
98        2 -0.778980   -1.136754 -0.974210          -0.298767  -0.824415   
159       3  0.592164   -0.598156  0.999674           0.902373  -0.754202   
38        1  0.085705   -0.750759 -0.974210          -1.199622  -0.122282   

     total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
158       0.808997   -0.722123              1.354888         1.943238   
137      -0.809357   -1.434939              2.160669        -0.860096   
98        1.962676    1.727556             -0.981875         0.629175   
159       0.488531   -0.932956              1.274310         1.224884   
38        0.168065    0.613153             -0.659563        -0.387033   

     color_intensity       hue  od280/od315_of_diluted_wines   proline  
158         3.435432 -1.6

In [19]:
# Shift the training labels down by one so they start at 0, then flatten the
# single-column frame into a 1-D array
y_train_adjusted = (y_train_df - 1).values.ravel()

# Apply the same shift to the test labels (used for evaluation later)
y_test_adjusted = y_test - 1


In [20]:
from sklearn.datasets import dump_svmlight_file

# Write the training features and the zero-based labels as a LIBSVM file
libsvm_filename = 'train_combined.libsvm'
dump_svmlight_file(X=X_train, y=y_train_adjusted, f=libsvm_filename, zero_based=True)

# Upload the file to S3 under the 'wine' prefix and keep its URI for training
train_combined_s3 = session.upload_data(path=libsvm_filename, bucket=bucket, key_prefix='wine')


In [21]:
# Start the training job with the uploaded LIBSVM file as the 'train' channel
training_channels = {'train': train_combined_s3}
xgb.fit(inputs=training_channels)


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-08-14-11-08-13-716


2024-08-14 11:08:13 Starting - Starting the training job...
2024-08-14 11:08:32 Starting - Preparing the instances for training...
2024-08-14 11:08:54 Downloading - Downloading input data...
2024-08-14 11:09:34 Downloading - Downloading the training image.....[34m[2024-08-14 11:10:18.686 ip-10-0-119-31.ap-south-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-08-14 11:10:18.718 ip-10-0-119-31.ap-south-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-08-14:11:10:18:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-08-14:11:10:18:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2024-08-14:11:10:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-08-14:11:10:18:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2024-08-14:11:10:18:INFO] files path: /opt/ml/input/data/trai

In [22]:
# Deploy the trained model to an endpoint named 'wine-predictor', backed by a
# single ml.m5.large instance. No serverless inference configuration is passed
# here, so this is an instance-based endpoint, not a serverless one.
# The returned predictor is used by the following cells to send requests.
predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='wine-predictor'
)


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-08-14-11-15-01-909
INFO:sagemaker:Creating endpoint-config with name wine-predictor
INFO:sagemaker:Creating endpoint with name wine-predictor


------!

In [None]:
from sagemaker.serializers import CSVSerializer

# Prepare some test data
test_samples = X_test[:5]  # Get the first 5 samples from the test set

# The predictor was created without a serializer, so a DataFrame cannot be sent
# as the request body. Send the rows as CSV instead.
predictor.serializer = CSVSerializer()

# Predict using the deployed model. Pass a plain 2-D array so every sample is
# serialized as one CSV row.
raw_response = predictor.predict(test_samples.to_numpy())

# The response is delimited text with one predicted class per sample. Accept
# both comma- and newline-separated output and convert it to integer labels so
# it can be compared directly with the zero-based test labels.
response_text = raw_response.decode('utf-8') if isinstance(raw_response, bytes) else str(raw_response)
predictions = np.array(response_text.replace(',', ' ').split(), dtype=float).astype(int)

# Print out the predictions
print(predictions)


In [None]:
from sklearn.metrics import accuracy_score

# Shift the test labels down by one, mirroring the adjustment made to the training labels
y_test_adjusted = y_test - 1

# Compare the first five adjusted test labels with the endpoint's predictions
accuracy = accuracy_score(y_true=y_test_adjusted[:5], y_pred=predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


In [None]:
# Tear down the endpoint (and its endpoint configuration)
predictor.delete_endpoint(delete_endpoint_config=True)
