In [1]:
from datasets import load_dataset, Dataset
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from itertools import chain

## Using the model we trained (2 Labels)

In [2]:
#test with autotokenizer
newTokenizer = AutoTokenizer.from_pretrained("gohbwj/sentiment-fine-tuned-yelp-2L")
newModel = AutoModelForSequenceClassification.from_pretrained("gohbwj/sentiment-fine-tuned-yelp-2L")

text = "Order here all year. Yesterday after making me wait for 3 hours! Food still wasn't ready. From 45 minutes delivery to more than 3 hours! Guess people don't appreciate long term customers."
encoded_input = newTokenizer(text, return_tensors='pt')
output = newModel(**encoded_input)
print(newModel.config.id2label[output.logits.argmax().item()])

LABEL_0


In [3]:
#test with pipeline
sentiment_model = pipeline(model="gohbwj/sentiment-fine-tuned-yelp-2L")

sentiment_model(["Order here all year. Yesterday after making me wait for 3 hours! Food still wasn't ready. From 45 minutes delivery to more than 3 hours! Guess people don't appreciate long term customers.",
                 "They have an excellent selection even though it is in a Coffeeshop, in fact they taste better than some of the actual restaurant I went to.",
                "The waffles were really not bad. The idea of the beaker for the syrup is creative and the yuzu ice cream wasn’t icy or too sugary.",
                "Staff let customers wait outside for too long time. There were just 3 people in front of me, and there were really enough sits in the restaurant. Howerver, I waited for 30 minutes just for takeaway. Staff didn't care the people waiting and moved so slowly. The worst mcdonals I've ever visited in Singapore.",
                "Small family diner with cozy vibes but the quality of food was somewhat subpar."],
               top_k=1)

Downloading:   0%|          | 0.00/982 [00:00<?, ?B/s]

[[{'label': 'LABEL_0', 'score': 0.9973719120025635}],
 [{'label': 'LABEL_1', 'score': 0.9998078942298889}],
 [{'label': 'LABEL_1', 'score': 0.9978553652763367}],
 [{'label': 'LABEL_0', 'score': 0.9983545541763306}],
 [{'label': 'LABEL_0', 'score': 0.504268229007721}]]

### Test on labelled_10k

In [4]:
# load
test_dataset = load_dataset('csv', data_files="..\\..\\scraper\\data\\labelled_10k.csv")

def tokenizeFunction(examples):
    return newTokenizer(examples["text"], max_length=550, padding="max_length", truncation=True)


Using custom data configuration default-bbf04892c4815010
Found cached dataset csv (C:/Users/Admin/.cache/huggingface/datasets/csv/default-bbf04892c4815010/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
#hold a copy as df for later
backupdf = pd.DataFrame(test_dataset['train'])

#get only the input data
test_dataset = test_dataset.remove_columns('restaurant name')
test_dataset = test_dataset.remove_columns('rating')
test_dataset = test_dataset.remove_columns('Tokenized')
test_dataset = test_dataset.remove_columns('predicted_subjectivity')
test_dataset = test_dataset.remove_columns('label')
test_dataset = test_dataset.remove_columns('content')
test_dataset = test_dataset.rename_column('content_clean', 'text')

In [6]:
#need to trunctate as model only handles max 512 tensors
def trimNtrunc(examples):
    try:
        examples['text'] = examples['text'].replace(r'\n', '')
        if len(examples['text']) > 512:
            examples['text'] = examples['text'][:512]
    except:
        examples['text'] = ''
        pass
    return examples

test_dataset = test_dataset['train'].map(trimNtrunc)

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [7]:
#tokenize the datasets
tokenized_dataset = test_dataset.map(tokenizeFunction, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
%%time
#run thru the model and get output
output = sentiment_model(tokenized_dataset['text'], top_k=2)

CPU times: total: 4min 21s
Wall time: 32.7 s


In [9]:
# do some magic to format the data
outputdf = pd.DataFrame(list(chain.from_iterable(output)))
evendf = outputdf.iloc[0:len(outputdf):2]
odddf = outputdf.iloc[1:len(outputdf):2]
evendf.reset_index(drop=True, inplace=True)
odddf.reset_index(drop=True, inplace=True)
outputdf = evendf.join(odddf, lsuffix="_pred", rsuffix="_other")

outputdf['label_pred'] = outputdf['label_pred'].replace({'LABEL_0':0.0, 'LABEL_1':1.0})
outputdf['label_other'] = outputdf['label_other'].replace({'LABEL_0':0.0, 'LABEL_1':1.0})

# def pred_label(row):
#     if row['score_0'] > row['score_1']:
#         return 0.0
#     if row['score_1'] > row['score_0']:
#         return 1.0
#     return 1.0

# def pred_score(row):
#     if row['label_pred'] == 0.0:
#         return row['score_0']
#     if row['label_pred'] == 1.0:
#         return row['score_1']

# outputdf['label_pred'] = outputdf.apply (lambda row: pred_label(row), axis=1)
# outputdf['score_pred'] = outputdf.apply (lambda row: pred_score(row), axis=1)

In [10]:
#combine back
combine_result = pd.concat([backupdf,  outputdf], axis=1)
combine_result

Unnamed: 0,restaurant name,rating,content,label,content_clean,Tokenized,predicted_subjectivity,label_pred,score_pred,label_other,score_other
0,Konomi Zen,3,Crunchy tempura esp the vegetables,2.0,Crunchy the vegetable,"[[('Crunchy', 'NNP'), ('the', 'DT'), ('vegetab...",0.000000,1.0,0.999585,0.0,0.000415
1,Vincent Western Food,5,this is one of the best western food i've eate...,1.0,this is one of the best western food eaten the...,"[[('this', 'DT'), ('is', 'VBZ'), ('one', 'CD')...",0.307692,1.0,0.999727,0.0,0.000273
2,Siam Square Mookata - Best Mookata Restaurant ...,5,Many choice of food to select. Love their teri...,1.0,Many choice of food to select Love their pork ...,"[[('Many', 'JJ'), ('choice', 'NN'), ('of', 'IN...",0.633333,0.0,0.766087,1.0,0.233913
3,Old Chang Kee,1,Buying snacks for customers but system mainten...,0.0,snack for customer but system maintenance cant...,"[[('snack', 'NN'), ('for', 'IN'), ('customer',...",0.000000,0.0,0.988703,1.0,0.011297
4,Hiang Ji Cantonese Roasts,1,Seriously overprice and rude service. Avoid at...,0.0,Seriously overprice and rude service Avoid at ...,"[[('Seriously', 'RB'), ('overprice', 'NN'), ('...",0.633333,0.0,0.999337,1.0,0.000662
...,...,...,...,...,...,...,...,...,...,...,...
995,Prata Raya,3,Mutton Nasi Biryani\n\n🍜 Food wise: Overall it...,1.0,Mutton Nasi ramen Food wise Overall it wa a d...,"[[('Mutton', 'NNP'), ('Nasi', 'NNP'), ('ramen'...",0.584524,1.0,0.994324,0.0,0.005676
996,Chui Xiang Kitchen,4,Really nice Zi Char place and pretty affordabl...,1.0,Really nice Char place and pretty affordable t...,"[[('Really', 'RB'), ('nice', 'JJ'), ('Char', '...",0.710000,1.0,0.999884,0.0,0.000116
997,A Hot Hideout,5,"honestly the best mala i’ve had in singapore, ...",1.0,honestly the best mala i ’ had in and the way...,"[[('honestly', 'RB'), ('the', 'DT'), ('best', ...",0.637500,1.0,0.999886,0.0,0.000114
998,Chui Huay Lim Teochew Cuisine 醉花林品潮轩,3,"Difficult to get to, food quality is good, the...",1.0,Difficult to get to food quality is good the p...,"[[('Difficult', 'NN'), ('to', 'TO'), ('get', '...",0.545212,0.0,0.992883,1.0,0.007117


In [11]:
# get metric scores
print('Accuracy: %.3f' % accuracy_score(combine_result['label'], combine_result['label_pred']))
print('Precision: %.3f' % precision_score(combine_result['label'], combine_result['label_pred'], average="weighted"))
print('Recall: %.3f' % recall_score(combine_result['label'], combine_result['label_pred'], average="weighted"))
print('F1: %.3f' % f1_score(combine_result['label'], combine_result['label_pred'], average="weighted"))

Accuracy: 0.879
Precision: 0.830
Recall: 0.879
F1: 0.853


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
#save to csv
#version 1 (text, label, score)
single_score_version = combine_result[['content_clean', 'label_pred', 'score_pred']]
single_score_version.rename(columns={"content_clean": "text", "label_pred" : "label", "score_pred": "score"}, inplace=True)
single_score_version.to_csv("DistilBert_pred_labelled10k_single.csv", index=False)

#version 2 (text, label, score_pred, score_other)
double_score_version = combine_result[['content_clean', 'label_pred', 'score_pred', 'score_other']]
double_score_version.rename(columns={"content_clean": "text", "label_pred" : "label"}, inplace=True)
double_score_version.to_csv("DistilBert_pred_labelled10k_double.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_score_version.rename(columns={"content_clean": "text", "label_pred" : "label", "score_pred": "score"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_score_version.rename(columns={"content_clean": "text", "label_pred" : "label"}, inplace=True)


In [13]:
#just make sure the shapes same as original set
print("original: " + str(backupdf.shape))
print("single_score_version: " + str(single_score_version.shape))
print("double_score_version: " + str(double_score_version.shape))

original: (1000, 7)
single_score_version: (1000, 3)
double_score_version: (1000, 4)


### Test on unlabelled_10k

In [15]:
# load
test_dataset = load_dataset('csv', data_files="..\\..\\scraper\\data\\unlabelled_10k.csv")

def tokenizeFunction(examples):
    return newTokenizer(examples["text"], max_length=550, padding="max_length", truncation=True)


Using custom data configuration default-d1918eb279f8da62


Downloading and preparing dataset csv/default to C:/Users/Admin/.cache/huggingface/datasets/csv/default-d1918eb279f8da62/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/Admin/.cache/huggingface/datasets/csv/default-d1918eb279f8da62/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
#hold a copy as df for later
backupdf = pd.DataFrame(test_dataset['train'])

#get only the input data
test_dataset = test_dataset.remove_columns('restaurant name')
test_dataset = test_dataset.remove_columns('rating')
test_dataset = test_dataset.remove_columns('Tokenized')
test_dataset = test_dataset.remove_columns('predicted_subjectivity')
test_dataset = test_dataset.remove_columns('label')
test_dataset = test_dataset.remove_columns('content')
test_dataset = test_dataset.rename_column('content_clean', 'text')

In [17]:
#need to trunctate as model only handles max 512 tensors
def trimNtrunc(examples):
    try:
        examples['text'] = examples['text'].replace(r'\n', '')
        if len(examples['text']) > 512:
            examples['text'] = examples['text'][:512]
    except:
        examples['text'] = ''
        pass
    return examples
test_dataset = test_dataset['train'].map(trimNtrunc)

  0%|          | 0/8895 [00:00<?, ?ex/s]

In [18]:
#tokenize the datasets
tokenized_dataset = test_dataset.map(tokenizeFunction, batched=True)

  0%|          | 0/9 [00:00<?, ?ba/s]

In [19]:
%%time
#run thru the model and get output
output = sentiment_model(tokenized_dataset['text'], top_k=2)

In [20]:
# do some magic to format the data
outputdf = pd.DataFrame(list(chain.from_iterable(output)))
evendf = outputdf.iloc[0:len(outputdf):2]
odddf = outputdf.iloc[1:len(outputdf):2]
evendf.reset_index(drop=True, inplace=True)
odddf.reset_index(drop=True, inplace=True)
outputdf = evendf.join(odddf, lsuffix="_pred", rsuffix="_other")

outputdf['label_pred'] = outputdf['label_pred'].replace({'LABEL_0':0.0, 'LABEL_1':1.0})
outputdf['label_other'] = outputdf['label_other'].replace({'LABEL_0':0.0, 'LABEL_1':1.0})

In [21]:
#combine back
combine_result = pd.concat([backupdf,  outputdf], axis=1)
combine_result

Unnamed: 0,restaurant name,rating,content,label,content_clean,Tokenized,predicted_subjectivity,label_pred,score_pred,label_other,score_other
0,TungLok Teahouse,5,Lisa is a very good host and made us feel very...,1,is a very good host and made u feel very welco...,"[[('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ...",0.468571,1.0,0.999874,0.0,0.000126
1,Malaysia Boleh,5,Very nice piping hot claypot with dark sauce a...,1,Very nice piping hot with dark sauce and sesam...,"[[('Very', 'RB'), ('nice', 'JJ'), ('piping', '...",0.725714,1.0,0.999867,0.0,0.000133
2,Douraku Sushi,5,Such luxuriously enjoyable Omakase 🤗\r\nRich s...,1,Such luxuriously enjoyable hugging face Rich s...,"[[('Such', 'JJ'), ('luxuriously', 'RB'), ('enj...",0.583333,1.0,0.999831,0.0,0.000169
3,511 Indian Store,5,Nice India ingredients foods,1,Nice ingredient food,"[[('Nice', 'NNP'), ('ingredient', 'NN'), ('foo...",1.000000,1.0,0.996541,0.0,0.003459
4,The Teochew Kitchenette,4,Ordered the Stir-Fry Kang Kong and Marmite Chi...,1,Ordered the Stir-Fry Kang and Marmite Chicken ...,"[[('Ordered', 'VBN'), ('the', 'DT'), ('Stir-Fr...",0.742857,1.0,0.997530,0.0,0.002470
...,...,...,...,...,...,...,...,...,...,...,...
8890,Ichikokudo Hokkaido Ramen,4,Wasn’t crowded during the dinner period.\r\nDe...,1,’ t crowded during the dinner period . Decent ...,"[[('’', 'JJ'), ('t', 'NN'), ('crowded', 'VBD')...",0.697778,1.0,0.936226,0.0,0.063774
8891,Kedai Makan Muhajirin,4,"Had the mee rebus, mee siam and nasi lemak wit...",1,"Had the rebus , and nasi with , all in all an ...","[[('Had', 'VBD'), ('the', 'DT'), ('rebus', 'NN...",0.675000,1.0,0.929509,0.0,0.070491
8892,Shami Banana Leaf,5,"MY FAVOURITE INDIAN RESTAURANT.\r\nTheir soya,...",1,"MY RESTAURANT . Their soya , sambal & potato a...","[[('MY', 'PRP$'), ('RESTAURANT', 'NNP'), ('.',...",0.700000,1.0,0.999437,0.0,0.000563
8893,Rahim Muslim Food,4,The taste is unlike the usual Mee rebus you fi...,1,The taste is unlike the usual rebus you find e...,"[[('The', 'DT'), ('taste', 'NN'), ('is', 'VBZ'...",0.570238,1.0,0.999422,0.0,0.000578


In [22]:
# get metric scores
print('Accuracy: %.3f' % accuracy_score(combine_result['label'], combine_result['label_pred']))
print('Precision: %.3f' % precision_score(combine_result['label'], combine_result['label_pred'], average="weighted"))
print('Recall: %.3f' % recall_score(combine_result['label'], combine_result['label_pred'], average="weighted"))
print('F1: %.3f' % f1_score(combine_result['label'], combine_result['label_pred'], average="weighted"))

Accuracy: 0.905
Precision: 0.929
Recall: 0.905
F1: 0.911


In [23]:
#save to csv
#version 1 (text, label, score)
single_score_version = combine_result[['content_clean', 'label_pred', 'score_pred']]
single_score_version.rename(columns={"content_clean": "text", "label_pred" : "label", "score_pred": "score"}, inplace=True)
single_score_version.to_csv("DistilBert_pred_unlabelled10k_single.csv", index=False)

#version 2 (text, label, score_pred, score_other)
double_score_version = combine_result[['content_clean', 'label_pred', 'score_pred', 'score_other']]
double_score_version.rename(columns={"content_clean": "text", "label_pred" : "label"}, inplace=True)
double_score_version.to_csv("DistilBert_pred_unlabelled10k_double.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_score_version.rename(columns={"content_clean": "text", "label_pred" : "label", "score_pred": "score"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_score_version.rename(columns={"content_clean": "text", "label_pred" : "label"}, inplace=True)


In [24]:
#just make sure the shapes same as original set
print("original: " + str(backupdf.shape))
print("single_score_version: " + str(single_score_version.shape))
print("double_score_version: " + str(double_score_version.shape))

original: (8895, 7)
single_score_version: (8895, 3)
double_score_version: (8895, 4)


### Test on yelp_review_after_subjectivity_classification

In [25]:
# load
test_dataset = load_dataset('csv', data_files="..\\..\\scraper\\data\\yelp_review_after_subjectivity_classification.csv")

def tokenizeFunction(examples):
    return newTokenizer(examples["text"], max_length=550, padding="max_length", truncation=True)


Using custom data configuration default-87a8e87d5d978e11


Downloading and preparing dataset csv/default to C:/Users/Admin/.cache/huggingface/datasets/csv/default-87a8e87d5d978e11/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to C:/Users/Admin/.cache/huggingface/datasets/csv/default-87a8e87d5d978e11/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
#hold a copy as df for later
backupdf = pd.DataFrame(test_dataset['train'])

#get only the input data
test_dataset = test_dataset.remove_columns('Tokenized')
test_dataset = test_dataset.remove_columns('predicted_subjectivity')
test_dataset = test_dataset.remove_columns('label')
test_dataset = test_dataset.remove_columns('content')
test_dataset = test_dataset.rename_column('content_clean', 'text')

In [27]:
#need to trunctate as model only handles max 512 tensors
def trimNtrunc(examples):
    try:
        examples['text'] = examples['text'].replace(r'\n', '')
        if len(examples['text']) > 512:
            examples['text'] = examples['text'][:512]
    except:
        examples['text'] = ''
        pass
    return examples

test_dataset = test_dataset['train'].map(trimNtrunc)

  0%|          | 0/49639 [00:00<?, ?ex/s]

In [28]:
#tokenize the datasets
tokenized_dataset = test_dataset.map(tokenizeFunction, batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

In [29]:
%%time
#run thru the model and get output
output = sentiment_model(tokenized_dataset['text'], top_k=2)

In [30]:
# do some magic to format the data
outputdf = pd.DataFrame(list(chain.from_iterable(output)))
evendf = outputdf.iloc[0:len(outputdf):2]
odddf = outputdf.iloc[1:len(outputdf):2]
evendf.reset_index(drop=True, inplace=True)
odddf.reset_index(drop=True, inplace=True)
outputdf = evendf.join(odddf, lsuffix="_pred", rsuffix="_other")

outputdf['label_pred'] = outputdf['label_pred'].replace({'LABEL_0':0.0, 'LABEL_1':1.0})
outputdf['label_other'] = outputdf['label_other'].replace({'LABEL_0':0.0, 'LABEL_1':1.0})

In [31]:
#combine back
combine_result = pd.concat([backupdf,  outputdf], axis=1)
combine_result

Unnamed: 0,content,label,content_clean,Tokenized,predicted_subjectivity,label_pred,score_pred,label_other,score_other
0,Tried to give this place a second chance and w...,0,Tried to give this place a second chance and w...,"[[('Tried', 'VBN'), ('to', 'TO'), ('give', 'VB...",0.214286,0.0,0.999384,1.0,0.000616
1,My Mom ordered penne pasta and received taglia...,0,My ordered and received instead Delivery perso...,"[[('My', 'PRP$'), ('ordered', 'JJ'), ('and', '...",0.688889,0.0,0.999344,1.0,0.000656
2,The facility is clean and level however the st...,0,The facility is clean and level however the st...,"[[('The', 'DT'), ('facility', 'NN'), ('is', 'V...",0.433333,0.0,0.997516,1.0,0.002484
3,Absolutely do not bother There is a coffee bar...,0,Absolutely do not bother There is a coffee bar...,"[[('Absolutely', 'RB'), ('do', 'VBP'), ('not',...",0.476389,0.0,0.999438,1.0,0.000562
4,Inga is the only competent employee here The e...,0,is the only competent employee here The evenin...,"[[('is', 'VBZ'), ('the', 'DT'), ('only', 'JJ')...",0.594728,0.0,0.998617,1.0,0.001383
...,...,...,...,...,...,...,...,...,...
49634,Wow Talk about your dichotomy of the absolute...,1,Wow Talk about your dichotomy of the absolutel...,"[[('Wow', 'NNP'), ('Talk', 'VBP'), ('about', '...",0.566667,0.0,0.996275,1.0,0.003725
49635,Awesome location right on the water Great beac...,1,Awesome location right on the water Great beac...,"[[('Awesome', 'NNP'), ('location', 'NN'), ('ri...",0.672619,1.0,0.999306,0.0,0.000694
49636,We had the Ropa Vieja and Pork Chop Chuletas F...,1,We had the and Pork Chop both were really real...,"[[('We', 'PRP'), ('had', 'VBD'), ('the', 'DT')...",0.550000,1.0,0.999901,0.0,0.000099
49637,I really enjoyed the place Its small but intim...,1,I really the place Its small but intimate Grea...,"[[('I', 'PRP'), ('really', 'RB'), ('the', 'DT'...",0.568519,1.0,0.999889,0.0,0.000111


In [32]:
# get metric scores
print('Accuracy: %.3f' % accuracy_score(combine_result['label'], combine_result['label_pred']))
print('Precision: %.3f' % precision_score(combine_result['label'], combine_result['label_pred'], average="weighted"))
print('Recall: %.3f' % recall_score(combine_result['label'], combine_result['label_pred'], average="weighted"))
print('F1: %.3f' % f1_score(combine_result['label'], combine_result['label_pred'], average="weighted"))

Accuracy: 0.961
Precision: 0.961
Recall: 0.961
F1: 0.961


In [33]:
#save to csv
#version 1 (text, label, score)
single_score_version = combine_result[['content_clean', 'label_pred', 'score_pred']]
single_score_version.rename(columns={"content_clean": "text", "label_pred" : "label", "score_pred": "score"}, inplace=True)
single_score_version.to_csv("DistilBert_pred_yelp50k_single.csv", index=False)

#version 2 (text, label, score_pred, score_other)
double_score_version = combine_result[['content_clean', 'label_pred', 'score_pred', 'score_other']]
double_score_version.rename(columns={"content_clean": "text", "label_pred" : "label"}, inplace=True)
double_score_version.to_csv("DistilBert_pred_yelp50k_double.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_score_version.rename(columns={"content_clean": "text", "label_pred" : "label", "score_pred": "score"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_score_version.rename(columns={"content_clean": "text", "label_pred" : "label"}, inplace=True)


In [34]:
#just make sure the shapes same as original set
print("original: " + str(backupdf.shape))
print("single_score_version: " + str(single_score_version.shape))
print("double_score_version: " + str(double_score_version.shape))

original: (49639, 5)
single_score_version: (49639, 3)
double_score_version: (49639, 4)
