In [1]:
import pandas as pd
from google.colab import drive
import re
import spacy
from spacy.util import minibatch, compounding
import random
from spacy.training import Example
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix


In [2]:
drive.mount('/content/drive/')

%cd drive/MyDrive/Text Mining Project

Mounted at /content/drive/
/content/drive/MyDrive/Text Mining Project


In [3]:
file_path = 'datasetV2.csv'

df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,text,label
0,The in-app chat support feature is incredibly ...,1
1,The ability to track multiple orders simultane...,1
2,I suggest adding a feature to customize delive...,1
3,Providing estimated delivery times for each re...,1
4,Offering discounts for frequent users would en...,1


## Preprocessing

### Clean Text

In [5]:
def clean_text(text):
    # Remove special characters and extra whitespace
    cleaned_text = re.sub(r'[^A-Za-z0-9\s\.]', '', text)
    # Convert multiple whitespace characters into a single space
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()
    return cleaned_text

In [6]:
df['text'] = df['text'].apply(clean_text)

## Model 1: knkarthick/Action_Items https://huggingface.co/knkarthick/Action_Items


In [7]:
X_test = df['text']
y_test = df['label']

In [8]:
from huggingface_hub import hf_hub_download

In [9]:
api_key = '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'
model_id = "knkarthick/Action_Items"
filenames = ["config.json", "pytorch_model.bin", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.txt"]

for filename in filenames:
  downloaded_model_path = hf_hub_download(repo_id=model_id,
                                          filename=filename,
                                          token=api_key)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [10]:
from transformers import TFAutoModelForSequenceClassification, pipeline, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id, legacy=False)
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, from_pt=True)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [11]:
classifier("Appreciate that someone has thought of a concept such as this to help passengers get to their destination safely. Room for improvement: better if a signage or something on the body of the taxi indicates that its connected with Grab Taxi other than the sending of the plate no. to the passenger. Also, it would better if there is a refresh/update button on the app in case you get disconnected fron the network.")

[{'label': 'LABEL_1', 'score': 0.8420268893241882}]

In [12]:
y_pred = []
for text in X_test:
    output = classifier(text)
    label = output[0]['label']
    y_pred.append(label)


In [13]:
label_map = {'LABEL_1': 1, 'LABEL_0': 0,'Label_1': 1, 'Label_0': 0, }
# y_test = [label_map[label] for label in y_test]
y_pred = [label_map[label] for label in y_pred]
# y_pred

In [15]:
print(classification_report(y_test,y_pred))
print('Confusion Matrix:',confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.12      0.21        94
           1       0.62      1.00      0.77       138

    accuracy                           0.64       232
   macro avg       0.81      0.56      0.49       232
weighted avg       0.78      0.64      0.54       232

Confusion Matrix: [[ 11  83]
 [  0 138]]


In [16]:
auc_score = roc_auc_score(y_test, y_pred)
print(f'AUC Score: {auc_score}')

AUC Score: 0.5585106382978724


## Model 2: Yousefmd/feedback-classification https://huggingface.co/Yousefmd/feedback-classification

In [17]:
X_test = df['text']
y_test = df['label']

In [18]:
api_key = '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'
model_id = "Yousefmd/feedback-classification"
filenames = ["added_tokens.json", "config.json", "pytorch_model.bin", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "training_args.bin", "vocab.txt"]

for filename in filenames:
  downloaded_model_path = hf_hub_download(repo_id=model_id,
                                          filename=filename,
                                          token=api_key)

added_tokens.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/875 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.86k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/656k [00:00<?, ?B/s]

In [19]:
from transformers import TFAutoModelForSequenceClassification, pipeline, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id, legacy=False)
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, from_pt=True)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [20]:
classifier("Appreciate that someone has thought of a concept such as this to help passengers get to their destination safely. Room for improvement: better if a signage or something on the body of the taxi indicates that its connected with Grab Taxi other than the sending of the plate no. to the passenger. Also, it would better if there is a refresh/update button on the app in case you get disconnected fron the network.")

[{'label': 'request', 'score': 0.8532730937004089}]

In [21]:
y_pred = []
for text in X_test:
    output = classifier(text)
    label = output[0]['label']
    if (label == 'request'):
      y_pred.append(1)
    else:
      y_pred.append(0)

In [22]:
# label_map = {'Label_1': 1, 'Label_0': 0 }
# y_test = [label_map[label] for label in y_test]
# y_pred = [label_map[label] for label in y_pred]

In [23]:
print(classification_report(y_test,y_pred))
print('Confusion Matrix:',confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.37      0.51        94
           1       0.69      0.93      0.79       138

    accuracy                           0.71       232
   macro avg       0.74      0.65      0.65       232
weighted avg       0.73      0.71      0.68       232

Confusion Matrix: [[ 35  59]
 [  9 129]]


In [24]:
auc_score = roc_auc_score(y_test, y_pred)
print(f'AUC Score: {auc_score}')

AUC Score: 0.6535615171137836
