In [1]:
from google.colab import drive
drive.mount('/content/drive')

#import the dataset
import pandas as pd
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Project 1/migraine_symptom_classification.csv")

Mounted at /content/drive


In [2]:
# Convert all columns to categorical except 'Age', 'Duration', and 'Frequency'
for col in data.columns:
    if col not in ['Age', 'Duration', 'Frequency']:
        data[col] = data[col].astype('category')

# Check for duplicated rows
has_duplicates = data.duplicated().any()
num_duplicates = data.duplicated().sum()
print(f"Has duplicated rows: {has_duplicates}")
print(f"Number of duplicated rows: {num_duplicates}")

# Drop duplicated rows
data = data.drop_duplicates()
print(f"Number of duplicated rows: {data.duplicated().sum()}")

print(data.shape)

# Check for missing values
num_missing = data.isnull().sum().sum()
print(f"Total missing values: {num_missing}")

Has duplicated rows: True
Number of duplicated rows: 6
Number of duplicated rows: 0
(394, 24)
Total missing values: 0


In [3]:
# Define the mapping
type_mapping = {
    'Typical aura with migraine': 'Migraine with Typical Aura',
    'Typical aura without migraine': 'Migraine with Typical Aura',
    'Migraine without aura': 'Migraine Without Aura',
    'Familial hemiplegic migraine': 'Hemiplegic migraine Variants',
    'Sporadic hemiplegic migraine': 'Hemiplegic migraine Variants',
    'Basilar-type aura': 'Other',
    'Other': 'Other'
}

# Create new column with grouped types
data['Type_grouped'] = data['Type'].map(type_mapping)

# Check the counts of each new group
print(data['Type_grouped'].value_counts())

Type_grouped
Migraine with Typical Aura      261
Migraine Without Aura            60
Hemiplegic migraine Variants     38
Other                            35
Name: count, dtype: int64


In [4]:
num_missing = data.isnull().sum().sum()
print(f"Total missing values: {num_missing}")

Total missing values: 0


In [5]:
from sklearn.preprocessing import StandardScaler

# Scale numeric columns only
scaler = StandardScaler()
for col in ['Age', 'Duration', 'Frequency']:
    data[col] = scaler.fit_transform(data[[col]])

In [6]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['Type_grouped'], random_state=123)

In [7]:
# 1. Identify categorical columns (excluding the target)
categorical_cols = train_data.select_dtypes(include='category').columns.drop('Type')
categorical_cols

Index(['Location', 'Character', 'Intensity', 'Nausea', 'Vomit', 'Phonophobia',
       'Photophobia', 'Visual', 'Sensory', 'Dysphasia', 'Dysarthria',
       'Vertigo', 'Tinnitus', 'Hypoacusis', 'Diplopia', 'Defect', 'Ataxia',
       'Conscience', 'Paresthesia', 'DPF'],
      dtype='object')

In [8]:
# 2. Perform one-hot encoding
train_encoded = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_encoded = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# 3. Align columns between train and test (adds missing cols to test with 0s)
train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

In [9]:
from sklearn.preprocessing import LabelEncoder
X_train = train_encoded.drop(columns=['Type', 'Type_grouped'])
X_test = test_encoded.drop(columns=['Type', 'Type_grouped'])

X_train = X_train.astype(int)
X_test = X_test.astype(int)

# Encode the response variable (Type_grouped)
le = LabelEncoder()
y_train = le.fit_transform(train_encoded['Type_grouped'])
y_test = le.transform(test_encoded['Type_grouped'])

y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

In [10]:
pip install imbalanced-learn



In [11]:
from imblearn.over_sampling import SMOTE
# Apply SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Print class balance before and after
print("Before SMOTE:", y_train.value_counts(normalize=True))
print("After SMOTE:", pd.Series(y_resampled).value_counts(normalize=True))

Before SMOTE: 2    0.663492
1    0.152381
0    0.095238
3    0.088889
Name: proportion, dtype: float64
After SMOTE: 1    0.25
2    0.25
0    0.25
3    0.25
Name: proportion, dtype: float64


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize KNN (choose number of neighbors, e.g., k=5)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_resampled, y_resampled)

y = knn.predict(X_resampled)

# Evaluate the model on training set
print("Confusion Matrix:")
print(confusion_matrix(y_resampled, y))

print("\nClassification Report:")
print(classification_report(y_resampled, y, target_names=le.classes_))

print("\nAccuracy Score:")
print(accuracy_score(y_resampled, y))

# Predict on test data
y_pred = knn.predict(X_test)

# Evaluate performance
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\nAccuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
[[192   1  11   5]
 [  0 208   0   1]
 [ 33   9 162   5]
 [  4  16   0 189]]

Classification Report:
                              precision    recall  f1-score   support

Hemiplegic migraine Variants       0.84      0.92      0.88       209
       Migraine Without Aura       0.89      1.00      0.94       209
  Migraine with Typical Aura       0.94      0.78      0.85       209
                       Other       0.94      0.90      0.92       209

                    accuracy                           0.90       836
                   macro avg       0.90      0.90      0.90       836
                weighted avg       0.90      0.90      0.90       836


Accuracy Score:
0.8983253588516746

Confusion Matrix:
 [[ 4  0  4  0]
 [ 0 12  0  0]
 [10  1 40  1]
 [ 2  0  1  4]]

Classification Report:
                               precision    recall  f1-score   support

Hemiplegic migraine Variants       0.25      0.50      0.33         8
       Migraine Without Aura       

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': list(range(1, 21))}
knn = KNeighborsClassifier()

grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid.fit(X_resampled, y_resampled)

print("Best k:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

Best k: {'n_neighbors': 3}
Best CV accuracy: 0.8468705446250355


In [16]:
# 1. Predictions on training data
y_train_pred = grid.predict(X_resampled)

# 2. Predictions on test data
y_test_pred = grid.predict(X_test)

# 3. Training Performance
print("Confusion Matrix:")
print(confusion_matrix(y_resampled, y_train_pred))
print("\n--- Training Performance ---")
print("Training Accuracy:", accuracy_score(y_resampled, y_train_pred))
print("Training Classification Report:\n", classification_report(y_resampled, y_train_pred, target_names=le.classes_))

# 4. Test Performance
print("\n--- Test Performance ---")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred, target_names=le.classes_))

Confusion Matrix:
[[191   1  12   5]
 [  0 189   0  20]
 [ 27   3 178   1]
 [  3   9   4 193]]

--- Training Performance ---
Training Accuracy: 0.8983253588516746
Training Classification Report:
                               precision    recall  f1-score   support

Hemiplegic migraine Variants       0.86      0.91      0.89       209
       Migraine Without Aura       0.94      0.90      0.92       209
  Migraine with Typical Aura       0.92      0.85      0.88       209
                       Other       0.88      0.92      0.90       209

                    accuracy                           0.90       836
                   macro avg       0.90      0.90      0.90       836
                weighted avg       0.90      0.90      0.90       836


--- Test Performance ---
Confusion Matrix:
[[ 4  0  4  0]
 [ 0 12  0  0]
 [ 8  2 41  1]
 [ 3  0  0  4]]
Test Accuracy: 0.7721518987341772
Test Classification Report:
                               precision    recall  f1-score   support

He