In [8]:
# Import library
import os
import pandas as pd
import glob

# Studio Activity 1: Data preparation

## Find list csv files in the directory

In [9]:
path = '/Users/phongporter/Documents/GITHUB/COS40007/week 3/ampc'
extension = 'csv'

os.chdir(path)
result = glob.glob('*.{}'.format(extension))
print(result)

['w4.csv', 'w1.csv', 'w3.csv', 'w2.csv']


In [10]:
# Read the path
filePath = "/Users/phongporter/Documents/GITHUB/COS40007/week 3/ampc"

# List all files from the directory
fileList = os.listdir(filePath)
print(fileList)

['w4.csv', 'w1.csv', 'w3.csv', 'w2.csv']


In [11]:
# Print current directory
print(os.getcwd())

/Users/phongporter/Documents/GITHUB/COS40007/week 3/ampc


## Combine 4 csv files into a single CSV file

In [12]:
# Output file name
# outputFile = './dataset/'

# Create an empty DataFrame to store merged data
merged_data = pd.DataFrame()

# Merge CSV files
for filename in fileList:
    df = pd.read_csv(filename)
    merged_data = pd.concat([merged_data, df], ignore_index=True)

# Save the merged DataFrame to a new CSV file
os.chdir(".././dataset")
merged_data.to_csv('combined_data.csv', index=False)


## Shuffle the data and save in another CSV file

In [13]:
# Load dataset into a DataFrame
df_combined_data = pd.read_csv('combined_data.csv')

# Shuffle the dataset
shuffled_df = df_combined_data.sample(n=len(df_combined_data))

# Reset the index of the shuffled DataFrame
shuffled_df = shuffled_df.reset_index(drop=True)

# Save the shuffled DataFrame to a new CSV file
shuffled_df.to_csv('all_data.csv', index=False)

In [7]:
# Count the number of columns and rows in the dataset
num_rows, num_cols = shuffled_df.shape
print("Number of columns: ", num_cols)
print("Number of rows: ", num_rows)

Number of columns:  157
Number of rows:  11629


# Studio Activity 2: Model Training

## Separate feature and class as X and y

In [14]:
# Load the all_data.csv file into a DataFrame
df_all_data = pd.read_csv('all_data.csv')

# Separate the features and the target variable
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

## Train a svm model using

### Splitting train and test set to 70% and 30% and measure the model accuracy

In [15]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the dataset into training and testing datasets (70% training, 30% testing)
# random_state is set to 42 to make sure that the same split is generated each time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Train SVM model
clf = svm.SVC()
clf.fit(X_train, y_train)

# Predict on the test set
y_predict = clf.predict(X_test)

# Calculate the accuracy
train_accuracy_score_activity_1 = accuracy_score(y_test, y_predict)

In [16]:
# Print the accuracy
train_accuracy_score_activity_2 = train_accuracy_score_activity_1
print(f"Accuracy:  {train_accuracy_score_activity_2:.2f}")

Accuracy:  0.92


### 10-fold cross validation and measure model accuracy (cross validation score)

In [18]:
from sklearn.model_selection import cross_val_score

clf = svm.SVC()
scores_activity2 = cross_val_score(clf, X, y, cv=10)
# print(scores)
print(f'Cross-validation scores: {scores_activity2}')

Cross-validation scores: [0.925 0.925 0.925 0.925 0.925 0.925 0.925 0.925 0.925 0.925]


In [19]:
# Median of the scores'
print(f'Median of the cross-validation scores: {scores_activity2.mean():.2f}')

Median of the cross-validation scores: 0.93


### Save the classification accuracy of the above 2 cases

#### Current Directory

In [20]:
# Print current directory
print(os.getcwd())

/Users/phongporter/Documents/GITHUB/COS40007/week 3/dataset


In [21]:
# Save the merged DataFrame to a new CSV file
os.chdir(".././result")
with open('classification_accuracy.txt', 'w') as f:
    f.write(f'Measure the model accuracy: {train_accuracy_score_activity_1:.2f}\n')
    f.write(f'Average cross-validation score: {scores_activity2.mean():.2f}')


# Studio Activity 3: Hyper parameter tuning

## By default SVC use liner kernel, use rbf kernel instead

In [22]:
# define parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

## Use GridSearchCV to identify optimal values of hyper parameters

In [23]:
from sklearn.model_selection import GridSearchCV

## Now use the optimal values identified in GridSearchCV to update your SVM model in Activity 2 and obtain classification accuracy for both train-test split and 10-fold cross validation

### Train-test split

In [24]:
grid = GridSearchCV(clf, param_grid, refit=True, verbose=3)

# fitting the model for grid search
grid.fit(X_train, y_train)

# Predict and measure accuracy
y_pred = grid.predict(X_test)
accuracy_train_ts_activity_3 = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_train_ts_activity_3}')

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.926 total time=   0.4s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.926 total time=   0.4s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.926 total time=   0.4s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.924 total time=   0.4s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.924 total time=   0.4s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.926 total time=   0.4s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.926 total time=   0.4s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.926 total time=   0.4s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.924 total time=   0.4s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.924 total time=   0.4s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.926 total time=   0.4s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [25]:
cross_val_activity_3 = cross_val_score(grid, X, y, cv=10)
median_cross_val_activity_3 = cross_val_activity_3.mean()
# print(scores)
print(f'Cross-validation scores: {median_cross_val_activity_3:.2f}')

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.924 total time=   0.7s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.925 total time=   0.8s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.924 total time=   0.7s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.925 total time=   0.7s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.924 total time=   0.7s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [26]:
from sklearn.metrics import classification_report

grid_predictions = grid.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, grid_predictions)) 

              precision    recall  f1-score   support

           0       1.00      0.03      0.06        60
           1       0.00      0.00      0.00        13
           2       0.93      1.00      0.96       887

    accuracy                           0.93       960
   macro avg       0.64      0.34      0.34       960
weighted avg       0.92      0.93      0.89       960



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Studio Activity 4: Feature Selection

Use 100 best (using k best feature selection method described here) features and generate result of another 2 SVM model with

## 70/30 train/test set split with hyperparameter tuning (using values obtained in activity 3)

In [27]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

# Select the top 100 features
X, y = load_iris(return_X_y=True)
X.shape

(150, 4)

In [28]:
k = min(100, X.shape[1]) # Adject number from 100 to minimum number of features
k_best = SelectKBest(f_classif, k=k)
X_k_best = k_best.fit_transform(X, y)
X_k_best.shape

(150, 4)

In [31]:
X_k_train, X_k_test, y_k_train, y_k_test = train_test_split(X_k_best, y, test_size=0.3, random_state=1)


In [32]:
# Train the Support Vector Classifier without Hyper-parameter Tuning
clf_k_best = svm.SVC()
clf_k_best.fit(X_k_train, y_k_train)

# Print predict results
y_pred_k_best = clf_k_best.predict(X_k_test)
train_test_accuracy_kbest_activity4 = accuracy_score(y_k_test, y_pred_k_best)
print(f'Accuracy with K-best: {train_test_accuracy_kbest_activity4}')

Accuracy with K-best: 0.9777777777777777


## 10-fold class validation with hyperparameter tuning (using values obtained in activity 3)

In [33]:
scores_activity4 = cross_val_score(clf_k_best, X_k_best, y, cv=10)

# Print cross-validation scores
print(f'Cross-validation scores: {scores_activity4}')
print(f'Median of the cross-validation scores: {scores_activity4.mean():.2f}')

Cross-validation scores: [1.         0.93333333 1.         1.         1.         0.93333333
 0.93333333 0.93333333 1.         1.        ]
Median of the cross-validation scores: 0.97


# Studio Activity 5: Dimensionality reduction

Use Principal Component Analysis (PCA) for to reduce dimension on your data. Take first 10 principal components as features and again train 2 SVM models

## 70/30 train/test set split with hyperparameter tuning (using values obtained in activity 3)

Sample steps:
1. pca = PCA().fit(X) [available in from sklearn.decomposition import PCA]
2. Now to take 10 principle components featured use pca.components list
3. Then use that 10 principal components feature as new X for training SVM model

### Step 1 + 2: pca = PCA().fit(X) + take 10 principle components

In [34]:
from sklearn.decomposition import PCA
X_pca = PCA(n_components=4).fit_transform(X)

### Step 3: use that 10 principal components feature as new X for training SVM model

In [35]:
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(X_pca, y, test_size=0.3, random_state=1)

clf_pca = svm.SVC()
clf_pca.fit(X_pca_train, y_pca_train)

y_pca_pred = clf_pca.predict(X_pca_test)
train_test_accuracy_pca_activity5 = accuracy_score(y_pca_test, y_pca_pred)
print(f'Accuracy with PCA: {train_test_accuracy_pca_activity5:.2f}')

Accuracy with PCA: 0.98


## 10-fold class validation with hyperparameter tuning (using values obtained in activity 3)

Sample steps:
1. pca = PCA().fit(X) [available in from sklearn.decomposition import PCA]
2. Now to take 10 principle components featured use pca.components list
3. Then use that 10 principal components feature as new X for training SVM model

In [36]:
scores_pca_activity5 = cross_val_score(clf_pca, X_pca, y, cv=10)
print(f'Cross-validation scores: {scores_pca_activity5.mean():.2f}')

Cross-validation scores: 0.97


# Studio Activity 6: Prepare a summary table 

In [37]:
import pandas as pd

summary_data = {
    'SVM Model': ['Original features', 'With hyperparameter tuning', 'With feature selection and hyperparameter tuning', 'With PCA and hyperparameter tuning'],
    'Train-test split': [f'{train_accuracy_score_activity_2}%', f'{accuracy_train_ts_activity_3}%', f'{train_test_accuracy_kbest_activity4}%', f'{train_test_accuracy_pca_activity5}%'],
    'Cross validation': [f'{scores_activity2.mean()}%', f'{median_cross_val_activity_3}%', f'{scores_activity4.mean()}%', f'{scores_pca_activity5.mean()}%']
}

summary_table = pd.DataFrame(summary_data)
print(summary_table)

                                          SVM Model     Train-test split  \
0                                 Original features  0.9239583333333333%   
1                        With hyperparameter tuning  0.9260416666666667%   
2  With feature selection and hyperparameter tuning  0.9777777777777777%   
3                With PCA and hyperparameter tuning  0.9777777777777777%   

      Cross validation  
0  0.9250000000000002%  
1  0.9250000000000002%  
2  0.9733333333333334%  
3  0.9733333333333334%  


| SVM model                                           | Train-test split | Cross validation |
|-----------------------------------------------------|------------------|------------------|
| Original features                                   | 92%              | 93%              |
| With hyperparameter tuning                          | 93%              | 93%              |
| With feature selection and hyperparameter tuning    | 98%              | 97%              |
| With PCA and hyperparameter tuning                  | 98%              | 97%              |

# Studio Activity 7: Other classifiers

Use the original data (all_data.csv) to

## Train with SGDclassifier for both train-test split and cross-validation and obtain the accuracy value

In [99]:
# Print current directory
print(os.getcwd())

/Users/phongporter/Documents/GITHUB/COS40007/week 3/dataset


In [39]:
os.chdir(".././dataset")

In [40]:
from sklearn.linear_model import SGDClassifier

df_all_data_activity_7 = pd.read_csv('all_data.csv')

X_activity_7 = df_all_data_activity_7.iloc[:, :-1]
y_activity_7 = df_all_data_activity_7.iloc[:, -1]

X_train_activity_7, X_test_activity_7, y_train_activity_7, y_test_activity_7 = train_test_split(X_activity_7, y_activity_7, test_size=0.3, random_state=1)

clf_SGD = SGDClassifier()
clf_SGD.fit(X_train_activity_7, y_train_activity_7)

y_pred_SGD = clf_SGD.predict(X_test_activity_7)
train_test_accuracy_SGD_activity7 = accuracy_score(y_test_activity_7, y_pred_SGD)
print(f'Accuracy with SGD: {train_test_accuracy_SGD_activity7:.2f}')

scores_SGD_activity7 = cross_val_score(clf_SGD, X_activity_7, y_activity_7, cv=10)
print(f'Cross-validation scores: {scores_SGD_activity7.mean():.2f}')

Accuracy with SGD: 0.88
Cross-validation scores: 0.83


## Train with RandomForest for both train-test split and cross-validation and obtain the accuracy value

In [41]:

from sklearn.ensemble import RandomForestClassifier

clf_RF = RandomForestClassifier()
clf_RF.fit(X_train_activity_7, y_train_activity_7)

y_pred_RF = clf_RF.predict(X_test_activity_7)
train_test_accuracy_RF_activity7 = accuracy_score(y_test_activity_7, y_pred_RF)
print(f'Accuracy with Random Forest: {train_test_accuracy_RF_activity7:.2f}')

scores_RF_activity_7 = cross_val_score(clf_RF, X_activity_7, y_activity_7, cv=10)
print(f'Cross-validation scores: {scores_RF_activity_7.mean():.2f}')

Accuracy with Random Forest: 0.92
Cross-validation scores: 0.93


## Train with MLPclassifier for both train-test split and cross-validation and obtain the accuracy value

In [42]:
from sklearn.neural_network import MLPClassifier

clf_MLP = MLPClassifier()
clf_MLP.fit(X_train_activity_7, y_train_activity_7)

y_pred_MLP = clf_MLP.predict(X_test_activity_7)
train_test_accuracy_MLP_activity7 = accuracy_score(y_test_activity_7, y_pred_MLP)
print(f'Accuracy with MLP: {train_test_accuracy_MLP_activity7:.2f}')

scores_MLP_activity_7 = cross_val_score(clf_MLP, X_activity_7, y_activity_7, cv=10)
print(f'Cross-validation scores: {scores_MLP_activity_7.mean():.2f}')

Accuracy with MLP: 0.79
Cross-validation scores: 0.86


## Finally prepare another summary table and accuracy like the following

In [43]:
import pandas as pd

summary_data = {
    'Model': ['SVM', 'SGD', 'RandomForest', 'MLP'],
    'Train-test split': [f'{train_accuracy_score_activity_2}%', f'{train_test_accuracy_SGD_activity7}%', f'{train_test_accuracy_RF_activity7}%', f'{train_test_accuracy_MLP_activity7}%'],
    'Cross validation': [f'{scores_activity2.mean()}%', f'{scores_SGD_activity7.mean()}%', f'{scores_RF_activity_7.mean()}%', f'{scores_MLP_activity_7.mean()}%']
}

summary_table = pd.DataFrame(summary_data)
print(summary_table)

          Model     Train-test split     Cross validation
0           SVM  0.9239583333333333%  0.9250000000000002%
1           SGD   0.882487818859272%  0.8308585280811244%
2  RandomForest  0.9208942390369733%  0.9257024165942729%
3           MLP  0.7919174548581256%  0.8566538109198865%


| Model                                           | Train-test split | Cross validation |
|-----------------------------------------------------|------------------|------------------|
| SVM                                   | 92%              | 93%              |
| SGD                          | 88%              | 83%              |
| RandomForest    | 92%              | 93%              |
| MLP                  | 79%              | 86%              |