<a href="https://colab.research.google.com/github/AnthonyTsiantis/Boeing-Innovation-Group1/blob/main/classifier_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sklearn
import sklearn.model_selection
import sklearn.svm
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tqdm
import os
import pandas as pd
import joblib

# Load Data (the ten "Notre Dame Project Data - N of 10.csv" files must be uploaded to Colab first)

In [None]:
def get_df(datapath):
    """Load all ten "Notre Dame Project Data" CSV parts into one DataFrame.

    Each part is read without header inference. The column names are taken
    from row 0 of part 1, and row 0 is dropped from every part before the
    parts are concatenated (presumably each file repeats the header row --
    confirm against the data).

    Previously part 1 was read only for its header and its data rows were
    never added to the result; they are now included.

    Args:
        datapath: Directory that contains the ten CSV files.

    Returns:
        A DataFrame holding the data rows of parts 1 through 10, in order,
        with the columns named after the header row of part 1. Each part
        keeps its own row labels, so index values repeat across parts.
    """
    frames = []
    headers = None
    for part in range(1, 11):
        frame = pd.read_csv(
            f'{datapath}/Notre Dame Project Data - {part} of 10.csv',
            header=None,
        )
        if headers is None:
            # Only the first part supplies the column names.
            headers = frame.iloc[0].tolist()
        frames.append(frame.drop(index=0))
    dataset = pd.concat(frames)
    dataset.columns = headers
    return dataset

# Read the CSV parts from the current working directory.
dataset = get_df(datapath='.')

# Dump the combined frame for a quick look; "wuc" appears to be an ID-like column.
print(dataset)

# Text Embedding

In [None]:
# TF-Hub handle of the sentence encoder used to embed the free-text columns.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print(f"module {module_url} loaded")

In [None]:
# Run each free-text column through the encoder, one column at a time and
# in the same order as before.
em_corr_action, em_descrep_narrative, em_system_reason_desc = [
    model(dataset[text_column])
    for text_column in ('corr_action', 'descrep_narrative', 'system_reason_desc')
]

In [None]:
# Collect the embeddings in one data frame: one column per text field, each
# cell holding that row's embedding vector as a plain Python list.
em_data = pd.DataFrame({
    'corr_action': np.array(em_corr_action).tolist(),
    'descrep_narrative': np.array(em_descrep_narrative).tolist(),
    'system_reason_desc': np.array(em_system_reason_desc).tolist(),
})

# SVC Classifier - one input column

In [None]:
# Train and test on the descrep_narrative embeddings only.
# Each cell of em_data already holds a plain list, so the column converts
# directly to a list of embedding vectors.
input_data = em_data['descrep_narrative'].tolist()

# Target: column C ("wuc").
output_data = dataset["wuc"].copy()

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    input_data,
    output_data,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Build and train the SVC in one step (fit returns the estimator itself).
svc = sklearn.svm.SVC(random_state=42).fit(X_train, y_train)

# Report the mean accuracy on the held-out rows as a percentage.
accuracy = svc.score(X_test, y_test)
print(f"{accuracy * 100}%")

# SVC Classifier - all input columns

In [None]:
# include all input data vectors in one data frame
em_df_exp = pd.DataFrame()

count = 0

import warnings
warnings.filterwarnings("ignore", message="DataFrame is highly fragmented.")

# expand data frame so each coefficient of input vector for each category has a column
for col in em_data.columns:
  data = {}
  for item in em_data[col]:
    count = 0
    for element in item:
      if count in data:
        data[count].append(element)
      else:
        data[count] = [element]
      count += 1

  for key in data:
    em_df_exp[f'{col}{key}'] = data[key].copy()

In [None]:
# Show the expanded feature frame for a quick visual check.
print(em_df_exp)

In [None]:
# Predict column C ("wuc") from the expanded embeddings of all text columns.
output_data = dataset["wuc"].copy()

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    em_df_exp, output_data, test_size=0.3, random_state=42
)

svc = sklearn.svm.SVC(random_state=42)

# train SVC model
svc.fit(X_train, y_train)

# Run inference on the test set once and reuse the predictions for both the
# accuracy and the printout. The accuracy is the fraction of exact label
# matches, which is the mean accuracy that svc.score reports.
y_preds = svc.predict(X_test)
accuracy = np.mean(y_preds == y_test.to_numpy())
print(f"{accuracy * 100}%")
print(y_preds)
print(y_test)

# Optional: save the trained model (disabled).
# model_filename = 'svc_model.joblib'
# joblib.dump(svc, model_filename)

# NOTE(review): no way 100% accuracy -- a perfect score is suspicious; check
# the data and the split before trusting this result.

In [None]:
# Predict column X ("wc_code") from the expanded embeddings of all text columns.
output_data = dataset["wc_code"].copy()

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    em_df_exp, output_data, test_size=0.3, random_state=42
)

svc = sklearn.svm.SVC(random_state=35)

# train SVC model
svc.fit(X_train, y_train)

# Run inference on the test set once and reuse the predictions for both the
# accuracy and the printout. The accuracy is the fraction of exact label
# matches, which is the mean accuracy that svc.score reports.
y_preds = svc.predict(X_test)
accuracy = np.mean(y_preds == y_test.to_numpy())
print(f"{accuracy * 100}%")

# NOTE(review): no way 100% accuracy -- a perfect score is suspicious; check
# the data and the split before trusting this result.

print(X_test)
print(y_preds)
print(y_test)

# Disabled experiment: predict on the first two test rows only.
# single_row_to_predict = X_test.iloc[0:2, :]
# print(single_row_to_predict)
# single_row_pred = svc.predict(single_row_to_predict.values.reshape(2, -1))
# print(single_row_pred)
# print("y_test:",y_test)