In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score


In [2]:
df = pd.read_csv('Air_quality_index.csv')
df1 = df.copy()
df1.head(10)

Unnamed: 0,Latitude,Longitude,POC,State Name,County Name,City Name,Date Local,Month,Day,Event Type,Observation Count,Mean_Ozone_Concentration,Max_Ozone_Value,Hour_Max_Ozone_Concentration,Air_Quality_Index,AQI_Range
0,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-02-28,2,28,,1,0.038,0.038,14.0,35,Good
1,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-01,3,1,,17,0.037235,0.054,12.0,50,Good
2,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-02,3,2,,17,0.038235,0.055,12.0,51,Moderate
3,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-03,3,3,,9,0.024333,0.043,14.0,40,Good
4,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-04,3,4,,17,0.049647,0.063,10.0,77,Moderate
5,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-05,3,5,,17,0.039353,0.044,9.0,41,Good
6,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-06,3,6,,17,0.029824,0.033,7.0,31,Good
7,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-07,3,7,,17,0.027706,0.029,9.0,27,Good
8,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-08,3,8,,17,0.018118,0.02,11.0,19,Good
9,30.497478,-87.880258,1,Alabama,Baldwin,Fairhope,2022-03-09,3,9,,17,0.019235,0.022,12.0,20,Good


In [3]:
# For simplicity, let's select the features and the target variable
features = ['Mean_Ozone_Concentration', 'Max_Ozone_Value', 'Hour_Max_Ozone_Concentration']
target = 'AQI_Range'

# Train and test a linear SVM model

In [4]:
# Create a DataFrame with selected features and target
data = df1[features + [target]]
data

Unnamed: 0,Mean_Ozone_Concentration,Max_Ozone_Value,Hour_Max_Ozone_Concentration,AQI_Range
0,0.038000,0.038,14.0,Good
1,0.037235,0.054,12.0,Good
2,0.038235,0.055,12.0,Moderate
3,0.024333,0.043,14.0,Good
4,0.049647,0.063,10.0,Moderate
...,...,...,...,...
201004,0.007000,0.013,9.0,Good
201005,0.011471,0.018,11.0,Good
201006,0.008412,0.016,8.0,Good
201007,0.010412,0.017,10.0,Good


In [5]:
# Convert categorical target variable to numerical values
# This step is necessary for SVM classification

data[target] = pd.Categorical(data[target])
data['target_code'] = data[target].cat.codes
data.head(10)
#data.sort_values(by='Air_Quality_Index', ascending=False).head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[target] = pd.Categorical(data[target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target_code'] = data[target].cat.codes


Unnamed: 0,Mean_Ozone_Concentration,Max_Ozone_Value,Hour_Max_Ozone_Concentration,AQI_Range,target_code
0,0.038,0.038,14.0,Good,0
1,0.037235,0.054,12.0,Good,0
2,0.038235,0.055,12.0,Moderate,1
3,0.024333,0.043,14.0,Good,0
4,0.049647,0.063,10.0,Moderate,1
5,0.039353,0.044,9.0,Good,0
6,0.029824,0.033,7.0,Good,0
7,0.027706,0.029,9.0,Good,0
8,0.018118,0.02,11.0,Good,0
9,0.019235,0.022,12.0,Good,0


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[features], data['target_code'], test_size=0.3, random_state=42)

# a. Training a linear SVM model
linear_svm_model = SVC(kernel='linear')
linear_svm_model.fit(X_train, y_train)


SVC(kernel='linear')

In [7]:
# Make predictions on the test set
linear_svm_predictions = linear_svm_model.predict(X_test)


In [8]:
# Evaluate the linear SVM model
linear_svm_conf_matrix = confusion_matrix(y_test, linear_svm_predictions)
linear_svm_classification_report = classification_report(y_test, linear_svm_predictions)

print("Linear SVM Confusion Matrix:\n", linear_svm_conf_matrix)
print("\nLinear SVM Classification Report:\n", linear_svm_classification_report)
print("Accuracy:", accuracy_score(y_test, linear_svm_predictions))
print("F1 Score:", f1_score(y_test, linear_svm_predictions, average='weighted'))


Linear SVM Confusion Matrix:
 [[52500     0     0     0]
 [ 2726  4468     0     0]
 [   14   592     0     0]
 [    0     2     1     0]]

Linear SVM Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     52500
           1       0.88      0.62      0.73      7194
           2       0.00      0.00      0.00       606
           3       0.00      0.00      0.00         3

    accuracy                           0.94     60303
   macro avg       0.46      0.41      0.43     60303
weighted avg       0.93      0.94      0.94     60303

Accuracy: 0.9446959521085186
F1 Score: 0.935443924282973


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Using Non-linear Kernels

In [9]:
# b. Replicate analyses for another type of non-linear kernel (e.g., RBF kernel)
rbf_svm_model = SVC(kernel='rbf')
rbf_svm_model.fit(X_train, y_train)

# Make predictions on the test set using RBF kernel
rbf_svm_predictions = rbf_svm_model.predict(X_test)


In [10]:
# Evaluate the RBF SVM model
rbf_svm_conf_matrix = confusion_matrix(y_test, rbf_svm_predictions)
rbf_svm_classification_report = classification_report(y_test, rbf_svm_predictions)

print("\nRBF SVM Confusion Matrix:\n", rbf_svm_conf_matrix)
print("\nRBF SVM Classification Report:\n", rbf_svm_classification_report)
print("Accuracy:", accuracy_score(y_test, rbf_svm_predictions))
print("F1 Score:", f1_score(y_test, rbf_svm_predictions, average='weighted'))



RBF SVM Confusion Matrix:
 [[52500     0     0     0]
 [ 7194     0     0     0]
 [  606     0     0     0]
 [    3     0     0     0]]

RBF SVM Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93     52500
           1       0.00      0.00      0.00      7194
           2       0.00      0.00      0.00       606
           3       0.00      0.00      0.00         3

    accuracy                           0.87     60303
   macro avg       0.22      0.25      0.23     60303
weighted avg       0.76      0.87      0.81     60303

Accuracy: 0.8706034525645491
F1 Score: 0.8103805973181357


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# using another type of non-linear kernel - Polynomial
poly_svm_model = SVC(kernel='poly', degree=3)
poly_svm_model.fit(X_train, y_train)

# Make predictions on the test set using RBF kernel
poly_svm_predictions = poly_svm_model.predict(X_test)


In [19]:
# Evaluate the Polynomial SVM model
poly_svm_conf_matrix = confusion_matrix(y_test, poly_svm_predictions)
poly_svm_classification_report = classification_report(y_test, poly_svm_predictions)

print("\nPolynomial SVM Confusion Matrix:\n", poly_svm_conf_matrix)
print("\nPolynomial SVM Classification Report:\n", poly_svm_classification_report)
print("Accuracy:", accuracy_score(y_test, poly_svm_predictions))
print("F1 Score:", f1_score(y_test, poly_svm_predictions, average='weighted'))



Polynomial SVM Confusion Matrix:
 [[52500     0     0     0]
 [ 7194     0     0     0]
 [  604     2     0     0]
 [    1     2     0     0]]

Polynomial SVM Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93     52500
           1       0.00      0.00      0.00      7194
           2       0.00      0.00      0.00       606
           3       0.00      0.00      0.00         3

    accuracy                           0.87     60303
   macro avg       0.22      0.25      0.23     60303
weighted avg       0.76      0.87      0.81     60303

Accuracy: 0.8706034525645491
F1 Score: 0.8104093344735118


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Solving Dataset Imbalance Problem

In [13]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Use RandomOverSampler to handle class imbalance
oversampler = RandomOverSampler(sampling_strategy='not majority', random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Use RandomUnderSampler to handle class imbalance
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_resampled, y_train_resampled)


In [14]:
# Train an SVM model on the resampled data
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test)


In [16]:
# Evaluate the model
conf_matrix = confusion_matrix(y_test, svm_predictions)
classification_rep = classification_report(y_test, svm_predictions)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_rep)
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print("F1 Score:", f1_score(y_test, svm_predictions, average='weighted'))


Confusion Matrix:
 [[48548  3952     0     0]
 [    2  6691   501     0]
 [   11     3   592     0]
 [    0     0     0     3]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96     52500
           1       0.63      0.93      0.75      7194
           2       0.54      0.98      0.70       606
           3       1.00      1.00      1.00         3

    accuracy                           0.93     60303
   macro avg       0.79      0.96      0.85     60303
weighted avg       0.95      0.93      0.93     60303

Accuracy: 0.9258909175331244
F1 Score: 0.9329858425723908


In [17]:
# b. Replicate analyses for another type of non-linear kernel (e.g., RBF kernel)
rbf_model = SVC(kernel='rbf')
rbf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set using RBF kernel
rbf_predictions = rbf_model.predict(X_test)


In [18]:
# Evaluate the RBF SVM model
rbf_conf_matrix = confusion_matrix(y_test, rbf_predictions)
rbf_classification_report = classification_report(y_test, rbf_predictions)

print("\nRBF SVM Confusion Matrix:\n", rbf_conf_matrix)
print("\nRBF SVM Classification Report:\n", rbf_classification_report)
print("Accuracy:", accuracy_score(y_test, rbf_predictions))
print("F1 Score:", f1_score(y_test, rbf_predictions, average='weighted'))



RBF SVM Confusion Matrix:
 [[39617 12827    56     0]
 [  207  5962  1025     0]
 [   13   119   471     3]
 [    0     0     1     2]]

RBF SVM Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.75      0.86     52500
           1       0.32      0.83      0.46      7194
           2       0.30      0.78      0.44       606
           3       0.40      0.67      0.50         3

    accuracy                           0.76     60303
   macro avg       0.50      0.76      0.56     60303
weighted avg       0.91      0.76      0.81     60303

Accuracy: 0.7636767656667164
F1 Score: 0.8059686282294057
