# Support Vector Machines
You should build a machine learning pipeline using a support vector machine model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Conduct data exploration, data preprocessing, and feature engineering if necessary.
- Train and test a support vector machine model using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

Importing libraries

In [14]:
import pandas as pd
import sklearn.model_selection
import sklearn.compose
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


Data Collection and Loading


In [38]:
# Pull the MNIST sample from the course repository into a DataFrame and preview the first rows.
mnist_csv_url = 'https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/mnist.csv'
df = pd.read_csv(mnist_csv_url)
df.head(5)

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36953,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1981,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Exploratory Data Analysis (EDA)

In [40]:
# `DataFrame.info()` writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 786 entries, id to pixel784
dtypes: int64(786)
memory usage: 24.0 MB
None


In [39]:
# Count missing values in every column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

Unnamed: 0,0
id,0
class,0
pixel1,0
pixel2,0
pixel3,0
...,...
pixel780,0
pixel781,0
pixel782,0
pixel783,0


In [41]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the loaded frame.
df.describe()


Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,34415.17925,4.4395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07675,0.01525,0.013,0.0015,0.0,0.0,0.0,0.0,0.0,0.0
std,20508.890104,2.879655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.616022,0.964495,0.822192,0.094868,0.0,0.0,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16575.75,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34435.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,52111.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69998.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,61.0,52.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


Splitting the data into features (X) and the target variable (y)

In [42]:
# Hold out 20% of the rows for testing. The split is performed here, on the whole frame,
# so that `df_train` and `df_test` exist when the notebook is run top to bottom on a
# fresh kernel (no cell defined them before they were used).
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Separate features (X) and target variable (y) for the training set.
# `id` and the `class` label are not pixel columns, so both are excluded from X.
X_train = df_train.drop(columns=['id', 'class'])  # Features for training
y_train = df_train['class']  # Target variable for training

In [43]:
# Show the training feature matrix (pandas elides the middle rows and columns in the rendered table).
X_train

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
3994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
506,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Show the training labels, indexed by the same row labels as X_train.
y_train

Unnamed: 0,class
3994,1
423,3
2991,3
1221,1
506,0
...,...
1130,7
1294,3
860,9
3507,7


In [45]:
# Test split: the `class` column is the target; everything except `id` and `class` is a feature.
y_test = df_test['class']
X_test = df_test.drop(['id', 'class'], axis='columns')


In [46]:
# Show the test feature matrix (pandas elides the middle rows and columns in the rendered table).
X_test

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3925,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1642,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Show the test labels, indexed by the same row labels as X_test.
y_test

Unnamed: 0,class
555,2
3491,5
527,9
3925,1
2989,7
...,...
1922,4
865,1
3943,3
1642,6


Splitting Data

In [21]:
# NOTE(review): this cell repeats the earlier X_test / y_test cell; rerunning it
# produces equal values, so it is a candidate for removal.
X_test, y_test = df_test.drop(columns=['id', 'class']), df_test['class']

In [48]:
# Report the dimensions of each split, in the order X_train, X_test, y_train, y_test.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}: {split_data.shape}")

Shape of X_train: (3200, 784)
Shape of X_test: (800, 784)
Shape of y_train: (3200,)
Shape of y_test: (800,)


In [49]:
# Build the full feature matrix and label vector from the loaded frame. The split
# below needs them, and no cell before this one defines `X` or `y`.
X = df.drop(columns=['id', 'class'])  # the pixel columns only
y = df['class']  # the label column

# Split the dataset into training and test sets
# 80% of the data will be used for training, and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
# Preview the first training rows to verify the split. Leaving the frame as the cell's
# last expression lets the notebook render it as a table instead of the wrapped,
# truncated plain text that print() produces for a wide frame.
X_train.head()

      pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
3994       0       0       0       0       0       0       0       0       0   
423        0       0       0       0       0       0       0       0       0   
2991       0       0       0       0       0       0       0       0       0   
1221       0       0       0       0       0       0       0       0       0   
506        0       0       0       0       0       0       0       0       0   

      pixel10  ...  pixel775  pixel776  pixel777  pixel778  pixel779  \
3994        0  ...         0         0         0         0         0   
423         0  ...         0         0         0         0         0   
2991        0  ...         0         0         0         0         0   
1221        0  ...         0         0         0         0         0   
506         0  ...         0         0         0         0         0   

      pixel780  pixel781  pixel782  pixel783  pixel784  
3994         0         0     

In [53]:
# First five training labels.
print(y_train.iloc[:5])

3994    1
423     3
2991    3
1221    1
506     0
Name: class, dtype: int64


In [54]:
# Preview the first test rows. A bare expression gets the notebook's table rendering;
# print() on a wide frame produces wrapped, truncated plain text.
X_test.head()

      pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
555        0       0       0       0       0       0       0       0       0   
3491       0       0       0       0       0       0       0       0       0   
527        0       0       0       0       0       0       0       0       0   
3925       0       0       0       0       0       0       0       0       0   
2989       0       0       0       0       0       0       0       0       0   

      pixel10  ...  pixel775  pixel776  pixel777  pixel778  pixel779  \
555         0  ...         0         0         0         0         0   
3491        0  ...         0         0         0         0         0   
527         0  ...         0         0         0         0         0   
3925        0  ...         0         0         0         0         0   
2989        0  ...         0         0         0         0         0   

      pixel780  pixel781  pixel782  pixel783  pixel784  
555          0         0     

In [55]:
# First five test labels.
print(y_test.iloc[:5])

555     2
3491    5
527     9
3925    1
2989    7
Name: class, dtype: int64


In [56]:
# RBF-kernel support vector classifier; every other hyperparameter is left at its default.
# NOTE(review): per the SVC documentation, random_state only influences probability
# estimates, which are not enabled here — confirm whether it is needed.
svm_model = SVC(
    random_state=42,
    kernel='rbf',
)

In [57]:
# Fit the classifier on the training split.
svm_model.fit(X=X_train, y=y_train)

In [60]:
# Predicted class for every row of the held-out test features.
y_pred = svm_model.predict(X=X_test)

In [61]:
# Fraction of test rows whose predicted class equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.94


In [62]:
# Per-class precision / recall / F1 on the test split, preceded by a heading line.
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        70
           1       0.95      0.95      0.95       100
           2       0.93      0.88      0.90        73
           3       0.98      0.92      0.95        86
           4       0.90      0.96      0.93        80
           5       0.89      1.00      0.94        64
           6       0.97      0.96      0.96        90
           7       0.97      0.97      0.97        67
           8       0.93      0.91      0.92        94
           9       0.95      0.91      0.93        76

    accuracy                           0.94       800
   macro avg       0.94      0.95      0.94       800
weighted avg       0.94      0.94      0.94       800



In [63]:
# Create a second model with class_weight='balanced'; C, kernel and gamma are spelled out explicitly.
svm_model_balanced = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
)

In [64]:
# Train the class-weighted model on the same training data.
svm_model_balanced.fit(X=X_train, y=y_train)

In [65]:
# Predictions on the test data
y_pred_balanced = svm_model_balanced.predict(X_test)

# Evaluate the model. Scoring the predictions computed above avoids the second full
# prediction pass that `svm_model_balanced.score(X_test, y_test)` would run, and
# matches how the first model's accuracy is computed with `accuracy_score`.
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)
print(f"Accuracy with balanced class weights: {accuracy_balanced:.2f}")

Accuracy with balanced class weights: 0.94
