In [None]:
from google.colab import drive
drive.mount('/content/drive')

#import the dataset
import pandas as pd
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Project 1/migraine_symptom_classification.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Convert all columns to categorical except 'Age', 'Duration', and 'Frequency'
for col in data.columns:
    if col not in ['Age', 'Duration', 'Frequency']:
        data[col] = data[col].astype('category')

# Check for duplicated rows
has_duplicates = data.duplicated().any()
num_duplicates = data.duplicated().sum()
print(f"Has duplicated rows: {has_duplicates}")
print(f"Number of duplicated rows: {num_duplicates}")

# Drop duplicated rows
data = data.drop_duplicates()
print(f"Number of duplicated rows: {data.duplicated().sum()}")

print(data.shape)

# Check for missing values
num_missing = data.isnull().sum().sum()
print(f"Total missing values: {num_missing}")

Has duplicated rows: True
Number of duplicated rows: 6
Number of duplicated rows: 0
(394, 24)
Total missing values: 0


In [None]:
# Define the mapping
type_mapping = {
    'Typical aura with migraine': 'Migraine with Typical Aura',
    'Typical aura without migraine': 'Migraine with Typical Aura',
    'Migraine without aura': 'Migraine Without Aura',
    'Familial hemiplegic migraine': 'Hemiplegic migraine Variants',
    'Sporadic hemiplegic migraine': 'Hemiplegic migraine Variants',
    'Basilar-type aura': 'Other',
    'Other': 'Other'
}

# Create new column with grouped types
data['Type_grouped'] = data['Type'].map(type_mapping)

# Check the counts of each new group
print(data['Type_grouped'].value_counts())

Type_grouped
Migraine with Typical Aura      261
Migraine Without Aura            60
Hemiplegic migraine Variants     38
Basilar-type aura                35
Name: count, dtype: int64


In [None]:
num_missing = data.isnull().sum().sum()
print(f"Total missing values: {num_missing}")

Total missing values: 0


In [None]:
from sklearn.preprocessing import StandardScaler

# Scale numeric columns only
scaler = StandardScaler()
for col in ['Age', 'Duration', 'Frequency']:
    data[col] = scaler.fit_transform(data[[col]])

In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['Type_grouped'], random_state=123)

In [None]:
# 1. Identify categorical columns (excluding the target)
categorical_cols = train_data.select_dtypes(include='category').columns.drop('Type')
categorical_cols

Index(['Location', 'Character', 'Intensity', 'Nausea', 'Vomit', 'Phonophobia',
       'Photophobia', 'Visual', 'Sensory', 'Dysphasia', 'Dysarthria',
       'Vertigo', 'Tinnitus', 'Hypoacusis', 'Diplopia', 'Defect', 'Ataxia',
       'Conscience', 'Paresthesia', 'DPF'],
      dtype='object')

In [None]:
# 2. Perform one-hot encoding
train_encoded = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_encoded = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# 3. Align columns between train and test (adds missing cols to test with 0s)
train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

In [None]:
from sklearn.preprocessing import LabelEncoder
X_train = train_encoded.drop(columns=['Type', 'Type_grouped'])
X_test = test_encoded.drop(columns=['Type', 'Type_grouped'])

X_train = X_train.astype(int)
X_test = X_test.astype(int)

# Encode the response variable (Type_grouped)
le = LabelEncoder()
y_train = le.fit_transform(train_encoded['Type_grouped'])
y_test = le.transform(test_encoded['Type_grouped'])

y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

In [None]:
print(X_train.head())
print(y_train.head())

     Age  Duration  Frequency  Location_1  Location_2  Character_1  \
279    0         0          0           1           0            1   
4      1         0          0           1           0            1   
51    -1         1          1           1           0            1   
112    0         1          1           1           0            1   
213    0         0          1           1           0            1   

     Character_2  Intensity_1  Intensity_2  Intensity_3  ...  Dysphasia_1  \
279            0            0            1            0  ...            0   
4              0            0            1            0  ...            0   
51             0            0            0            1  ...            0   
112            0            0            1            0  ...            0   
213            0            0            0            1  ...            0   

     Dysarthria_1  Vertigo_1  Tinnitus_1  Hypoacusis_1  Diplopia_1  Defect_1  \
279             0          0        

In [None]:
print(y_train.value_counts(normalize=True))

3    0.663492
2    0.152381
1    0.095238
0    0.088889
Name: proportion, dtype: float64


In [None]:
pip install imbalanced-learn



In [None]:

from imblearn.over_sampling import SMOTE
# Apply SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Print class balance before and after
print("Before SMOTE:", y_train.value_counts(normalize=True))
print("After SMOTE:", pd.Series(y_resampled).value_counts(normalize=True))

Before SMOTE: 3    0.663492
2    0.152381
1    0.095238
0    0.088889
Name: proportion, dtype: float64
After SMOTE: 3    0.25
2    0.25
1    0.25
0    0.25
Name: proportion, dtype: float64


In [None]:

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the SVM model (RBF kernel by default)
svm_model = SVC(kernel='rbf', random_state=42)

# Train the model on resampled training data
svm_model.fit(X_resampled, y_resampled)
#Make prediction on the training set
y = svm_model.predict(X_resampled)

# Evaluate the model on training set
print("Confusion Matrix:")
print(confusion_matrix(y_resampled, y))

print("\nClassification Report:")
print(classification_report(y_resampled, y, target_names=le.classes_))

print("\nAccuracy Score:")
print(accuracy_score(y_resampled, y))

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

Confusion Matrix:
[[186  11  12   0]
 [  6 187   1  15]
 [  0   0 209   0]
 [  1  30   0 178]]

Classification Report:
                              precision    recall  f1-score   support

           Basilar-type aura       0.96      0.89      0.93       209
Hemiplegic migraine Variants       0.82      0.89      0.86       209
       Migraine Without Aura       0.94      1.00      0.97       209
  Migraine with Typical Aura       0.92      0.85      0.89       209

                    accuracy                           0.91       836
                   macro avg       0.91      0.91      0.91       836
                weighted avg       0.91      0.91      0.91       836


Accuracy Score:
0.9090909090909091

Confusion Matrix:
[[ 7  0  0  0]
 [ 1  3  0  4]
 [ 0  0 12  0]
 [ 2  9  0 41]]

Classification Report:
                              precision    recall  f1-score   support

           Basilar-type aura       0.70      1.00      0.82         7
Hemiplegic migraine Variants       0.

In [None]:

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Pipeline: scaling + SVM
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto']  # Only used for 'rbf', 'poly'
}

# Grid search with 5-fold CV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit model to resampled training data
grid_search.fit(X_resampled, y_resampled)

# Best parameters and score on validation (cross-validation)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Best Cross-Validation Accuracy: 0.8935913886512689


In [None]:

# 1. Predictions on training data
y_train_pred = grid_search.predict(X_resampled)

# 2. Predictions on test data
y_test_pred = grid_search.predict(X_test)

# 3. Training Performance
print("Confusion Matrix:")
print(confusion_matrix(y_resampled, y_train_pred))
print("\n--- Training Performance ---")
print("Training Accuracy:", accuracy_score(y_resampled, y_train_pred))
print("Training Classification Report:\n", classification_report(y_resampled, y_train_pred, target_names=le.classes_))

# 4. Test Performance
print("\n--- Test Performance ---")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred, target_names=le.classes_))

Confusion Matrix:
[[200   0   9   0]
 [  9 195   0   5]
 [  0   1 208   0]
 [  0  28   0 181]]

--- Training Performance ---
Training Accuracy: 0.937799043062201
Training Classification Report:
                               precision    recall  f1-score   support

           Basilar-type aura       0.96      0.96      0.96       209
Hemiplegic migraine Variants       0.87      0.93      0.90       209
       Migraine Without Aura       0.96      1.00      0.98       209
  Migraine with Typical Aura       0.97      0.87      0.92       209

                    accuracy                           0.94       836
                   macro avg       0.94      0.94      0.94       836
                weighted avg       0.94      0.94      0.94       836


--- Test Performance ---
Confusion Matrix:
[[ 7  0  0  0]
 [ 1  3  0  4]
 [ 0  2 10  0]
 [ 1  6  0 45]]
Test Accuracy: 0.8227848101265823
Test Classification Report:
                               precision    recall  f1-score   support

   