# Support Vector Machines
- Vidhish Trivedi (IMT2021055)
- Barath S Narayan (IMT2021529)
- Vikas Kalyanapuram (IMT2021040)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## Loading The Data, Feature Engineering

In [2]:
# Load the training and test splits from the working directory.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

# Per-split row counts for every patient_id (kept for inspection).
train_frequency = train_data['patient_id'].value_counts().to_dict()
test_frequency = test_data['patient_id'].value_counts().to_dict()

# Combined row count per patient_id across BOTH splits. Counting over the
# concatenated column gives the same totals as summing the two per-split
# dictionaries key by key, without the manual initialise-then-add loops.
frequency = (
    pd.concat([train_data['patient_id'], test_data['patient_id']])
    .value_counts()
    .to_dict()
)

# Attach the combined count to each row as a new feature column.
train_data['frequency_pid'] = train_data['patient_id'].map(frequency)
test_data['frequency_pid'] = test_data['patient_id'].map(frequency)

## Preprocessing

In [3]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for validation (fixed seed so the split is repeatable).
y = train_data['readmission_id']
X = train_data.drop(columns='readmission_id')
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups are chosen by dtype on the training split only.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include='object').columns

# Numeric columns: fill gaps with the median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Object columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fit on the training split only; the validation split is transformed with
# the statistics learned from training.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

## Training Different SVMs (Different Kernels)

### Polynomial Kernel

In [4]:
# Degree-3 polynomial-kernel SVM: fit on the preprocessed training split,
# then score on the held-out validation split.
svm_poly = SVC(kernel='poly', degree=3, random_state=42)
svm_poly.fit(X_train, y_train)

y_pred_poly = svm_poly.predict(X_val)
accuracy_poly = accuracy_score(y_true=y_val, y_pred=y_pred_poly)

print(f'Polynomial Kernel Accuracy: {accuracy_poly}')

Polynomial Kernel Accuracy: 0.7142055025266704


### RBF Kernel

In [4]:
# RBF-kernel SVM: fit on the preprocessed training split, then score on the
# held-out validation split.
svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train, y_train)

y_pred_rbf = svm_rbf.predict(X_val)
accuracy_rbf = accuracy_score(y_true=y_val, y_pred=y_pred_rbf)

print(f'RBF Kernel Accuracy: {accuracy_rbf}')

RBF Kernel Accuracy: 0.7127316114542392


## Making Predictions

In [5]:
# Encode the test set with the preprocessor fitted on the training split,
# predict with the RBF-kernel model, and write the predictions to CSV
# alongside each row's enc_id.
X_test = preprocessor.transform(test_data)
test_predictions_rbf = svm_rbf.predict(X_test)

output_df = pd.DataFrame(
    {
        'enc_id': test_data['enc_id'],
        'readmission_id': test_predictions_rbf,
    }
)
output_df.to_csv('./svm-output.csv', index=False)

In [5]:
# Encode the test set with the preprocessor fitted on the training split,
# predict with the polynomial-kernel model, and write the predictions to CSV
# alongside each row's enc_id.
# The predictions get their own name so they are not mislabelled as RBF
# output and do not overwrite the RBF predictions.
X_test = preprocessor.transform(test_data)
test_predictions_poly = svm_poly.predict(X_test)
output_df = pd.DataFrame({'enc_id': test_data['enc_id'], 'readmission_id': test_predictions_poly})
output_df.to_csv('./svm-output_poly.csv', index=False)