<a href="https://colab.research.google.com/github/SumeetsRoorkee/ML_Code/blob/main/Sentiment_analysis_colab_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
! pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tqdm import tqdm
from textblob import TextBlob
import time
import re
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
tqdm.pandas()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data.csv')

In [8]:
df.reset_index(inplace = True, drop = True)

In [9]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [10]:
print(df.iloc[0])

Sentence     The GeoSolutions technology will leverage Bene...
Sentiment                                             positive
Name: 0, dtype: object


In [11]:
possible_labels = df.Sentiment.unique()

In [12]:
labels = {}
for index, possible_label in enumerate(possible_labels):
    labels[possible_label] = index

In [13]:
labels

{'positive': 0, 'negative': 1, 'neutral': 2}

In [14]:
df['labels'] = df.Sentiment.replace(labels)

In [15]:
df.drop('Sentiment', axis=1, inplace=True)

In [16]:
df.head()

Unnamed: 0,Sentence,labels
0,The GeoSolutions technology will leverage Bene...,0
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",1
2,"For the last quarter of 2010 , Componenta 's n...",0
3,According to the Finnish-Russian Chamber of Co...,2
4,The Swedish buyout firm has sold its remaining...,2


In [17]:
df.head()

Unnamed: 0,Sentence,labels
0,The GeoSolutions technology will leverage Bene...,0
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",1
2,"For the last quarter of 2010 , Componenta 's n...",0
3,According to the Finnish-Russian Chamber of Co...,2
4,The Swedish buyout firm has sold its remaining...,2


In [18]:
! pip install -q transformers
from transformers import pipeline

In [19]:
import torch
torch.cuda.is_available()

True

In [20]:
!pip install datasets transformers huggingface_hub



In [21]:
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [22]:
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

In [79]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [93]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(X_train)
test_dataset = Dataset.from_pandas(X_test)

In [80]:
X_train.head()

Unnamed: 0,Sentence,labels
0,The floor area of the Yliopistonrinne project ...,2
1,"no compensation for its news , opinions or dis...",2
2,RT @ACInvestorBlog $AAPL still on track for $500,0
3,This includes a EUR 39.5 mn change in the fair...,2
4,$GTE LONG at 7.44,0


In [81]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=True)

In [82]:
def preprocess_function(examples):
    return tokenizer(examples['Sentence'])


In [96]:
tokenized_train = train_dataset.map(preprocess_function)
tokenized_test = test_dataset.map(preprocess_function)

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

In [97]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [98]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [88]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [100]:
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)



In [101]:
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


In [102]:
trainer.train()

Step,Training Loss
500,0.5389


TrainOutput(global_step=586, training_loss=0.5133409467573459, metrics={'train_runtime': 59.1453, 'train_samples_per_second': 158.018, 'train_steps_per_second': 9.908, 'total_flos': 140680816367472.0, 'train_loss': 0.5133409467573459, 'epoch': 2.0})

In [103]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/RoorkeeSumeet/finetuning-sentiment-model-3000-samples/commit/828b3a33fb14c8434fcd5aa7e85e81569030b013', commit_message='End of training', commit_description='', oid='828b3a33fb14c8434fcd5aa7e85e81569030b013', pr_url=None, repo_url=RepoUrl('https://huggingface.co/RoorkeeSumeet/finetuning-sentiment-model-3000-samples', endpoint='https://huggingface.co', repo_type='model', repo_id='RoorkeeSumeet/finetuning-sentiment-model-3000-samples'), pr_revision=None, pr_num=None)

In [104]:
from transformers import pipeline

sentiment_model = pipeline(model="federicopascual/finetuning-sentiment-model-3000-samples")
sentiment_model(['positive', 'negative', 'neutral'])


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.5432145595550537},
 {'label': 'LABEL_0', 'score': 0.7404690980911255},
 {'label': 'LABEL_0', 'score': 0.6129471063613892}]