# Initial configurations

Upload the following files 

*   AppDescriptions_AI.csv
*   AppDescriptions_AItest.csv
*   AppDescriptions_notAI.csv
*   AppDescriptions_tester.csv

Upload the file for which the data file
and put the name of the file in Parameters Section

# Execution

Execute all the code snippets in the same order

*Note: execute the 'Clean' section (can be found at the end of the document) when there is less memory in colab*

# Parameters and initial upload

In [49]:
!git clone https://github.com/Stellisan/RoBERTa_Text_Classification

# Name of the file to be classified
Data_File_Name = 'final_195k_detail.csv'

Cloning into 'RoBERTa_Text_Classification'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 13 (delta 0), reused 10 (delta 0), pack-reused 0[K
Unpacking objects: 100% (13/13), done.


# Training Dataset Preprocessing

In [45]:
import pandas as pd

# importing a sample of data that contains the certain words 
# to indicate they use AI and machine learning
ai_data = pd.read_csv('./RoBERTa_Text_Classification/Dataset/AppDescriptions_AI.csv')

# importing a sample of data that do not use machine learning or ai
non_ai_data = pd.read_csv('./RoBERTa_Text_Classification/Dataset/AppDescriptions_notAI.csv')

# printing to check the data
print(ai_data['description'])

0       NỘI DUNG:\n------------------\nĐây là game cờ ...
1       Bot Belotе is your new Belote game, challengin...
2       Slim down in 6 weeks with Weight Loss Fitness ...
3       NEW: Multiplayer Online with Facebook Friends ...
4       Entire Game made by one (1) Developer in 200+ ...
                              ...                        
1341    Mbrane is an all-new multiplayer game that com...
1342    *Cross platform online play! Join your friends...
1343    The best billiards game comes to iTunes. Downl...
1344    DJ AI turns your phone into a DJ\nDJ AI is the...
1345    Paws - Dog breed Identifier is a powerful tool...
Name: description, Length: 1346, dtype: object


In [46]:
import random

# to remove non english characters
# The models are trained using english words
def remove_non_ascii(text): 
    return ''.join(i for i in text if ord(i)<128) 

train_data = []

# seperating the description fo the apps
for i in range(0,len(ai_data['description'])):
  # check for null values
  if(pd.isna(df['description'][i])):
    continue
  t = remove_non_ascii(ai_data['description'][i])

  # if the file does not contain a single letter of english word 
  # it is not added
  if(t == ''):
    continue
  train_data.append([t,ai_data['class'][i]])

for i in range(0,len(non_ai_data['description'])):
  # check for null values
  if(pd.isna(fd['description'][i])):
    continue
  
  t = remove_non_ascii(non_ai_data['description'][i])

  # if the file does not contain a single letter of english word 
  # it is not added
  if(t == ''):
    continue
  train_data.append([t,non_ai_data['class'][i]])

# shuffling the dataset
random.shuffle(train_data)

# Installing the transformer models

In [47]:
# Install the transformer models to be used
# Model used is RoBERTa
!pip install simpletransformers



# Training the model

In [48]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_df = pd.DataFrame(train_data)
train_df.columns = ["text", "labels"]

# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=10,overwrite_output_dir= True,reprocess_input_data = True)

# Create a Classification Model
model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=3,
    args=model_args,
) 

# Train the model
model.train_model(train_df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2483.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))






HBox(children=(HTML(value='Running Epoch 1 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 3 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 4 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 5 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 6 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 7 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 8 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 9 of 10'), FloatProgress(value=0.0, max=311.0), HTML(value='')))





INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(3110, 0.15086204633098943)

# Prepare the evaluation

In [58]:
test_dataframe = pd.read_csv('./RoBERTa_Text_Classification/Dataset/AppDescriptions_tester.csv')

def remove_non_ascii(text): 
    return ''.join(i for i in text if ord(i)<128) 
 
test_data = []

# Seperating the description from the test data
for i in range(0,len(test_dataframe['description'])):
  if(pd.isna(test_dataframe['description'][i])):
    continue
  temp = remove_non_ascii(test_dataframe['description'][i])
  if(temp == ''):
    continue
  test_data.append(test_dataframe['description'][i])

# Testing on Evaluation Data

In [60]:
# predicting whether the app uses AI or Machine Learning
# the predicted values are stored in the list 'predictions'
#
# Value     Result
# 1         True
# 0         False
predictions, raw_outputs = model.predict(test_data)

# print the apps which uses AI and Machine learning
o = 0
for i in range(0,len(predictions)):
  if(predictions[i] == 0):
    continue
  print('===================================================')
  print('prediction...',predictions[i])
  print(test_data[i])
  o += 1
  print('===================================================')

print(o)
print(len(predictions))

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


prediction... 1
PLEASE NOTE: THIS APP REQUIRES A USERNAME AND PASSWORD PROVIDED BY YOUR ORGANISATION.

BioBase™ App offers you the chance to track, measure and manage your stress.
Our full set of tools don't just teach you how to reduce stress, we use wearables to track your progress and share your data with you in a manageable and understandable way, so you can see what works and what doesn’t work for you.



Features:

- Learn to use Deep Breathing to reduce stress
- Measure Body Stress
- Track your mood and mood triggers
- Take Brain Function tests
- Day by day coaching sessions and exercises
- BioBeam Integration
- Personal dashboard allowing you to:
-- View stress over weeks/months
-- Spot your most stressful days/weeks and begin to pre-empt them
-- Map your most/least stressed locations (uses GPS during Deep Breathing and Body Stress checks)
- We use data from HealthKit to display your number of steps, breathing and heart rate. Your BioBase activity also contributes to your mind

# Preparing the given data

In [17]:
# converting the given data into dataframe
df = pd.read_csv(Data_File_Name)

def remove_non_ascii(text): 
    return ''.join(i for i in text if ord(i)<128) 

Descriptions = [] # list of desctiptions
appids = [] # list of appids corresponding to Descriptions
idx = [] # index of the apps

# Extract the descriptions and remove the non english characters
for i in range(0,len(df['description'])):
  if(pd.isna(df['description'][i])):
    continue
  t = remove_non_ascii(df['description'][i])
  if(t == ''):
    continue
  Descriptions.append(df['description'][i])
  appids.append(df['APP ID'][i])
  idx.append(i) 

In [18]:
# checking all the columns in the data
for col in df.columns: 
    print(col) 

APP ID
id_x
index0_x
index_x
age_restrictions
app_store_url
approx_size_in_bytes
bundle_id
category_id
category_ids
category_name
current_version
icon_url
initial_release_date
last_update_date
name
offers_in_app_purchases
other_stores
permissions
price_cents
publisher_id
publisher_name
publisher_url
screenshot_urls
subcategory_id
subcategory_name
id_y
index0_y
index_y
description
Ratio
Ratio_a
Ratio_b
Avg_DAU
Avg_downloads


In [20]:
# predict the values
predictions, raw_outputs = model.predict(Descriptions)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=168214.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21027.0), HTML(value='')))




In [21]:
id = df['APP ID'][idx[4]]
print(id)
print(appids[4])

# adding an extra column to the dataframe to show the results
df['machine_learning_ai'] = [False] * len(df['APP ID'])

# checking the column names
for col in df.columns:
    print(col)

856686608
856686608
APP ID
id_x
index0_x
index_x
age_restrictions
app_store_url
approx_size_in_bytes
bundle_id
category_id
category_ids
category_name
current_version
icon_url
initial_release_date
last_update_date
name
offers_in_app_purchases
other_stores
permissions
price_cents
publisher_id
publisher_name
publisher_url
screenshot_urls
subcategory_id
subcategory_name
id_y
index0_y
index_y
description
Ratio
Ratio_a
Ratio_b
Avg_DAU
Avg_downloads
machine_learning_ai


In [23]:
# Add the predicted results to the dataframe
for i in range(0,len(appids)):
  if(predictions[i] == 1):
    if(appids[i] == df['APP ID'][idx[i]]):
      df['machine_learning_ai'][idx[i]] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [24]:
# writing the dataframe into a csv file named 'Corrected.csv'
gfg_csv_data = df.to_csv('Corrected.csv', index = False) 

In [36]:
from google.colab import files

# Download the csv file with the results.
files.download('Corrected.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Clean

In [None]:
# Run if there is no space
# Should train the model again after this.
!rm -r output
!rm -r cache_dir